From f5895943d91b41b0368830cdb6eaffb8eda0f4c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Mar 2014 17:44:49 +0000 Subject: KEYS: Move the flags representing required permission to linux/key.h Move the flags representing required permission to linux/key.h as the perm parameter of security_key_permission() is in terms of them - and not the permissions mask flags used in key->perm. Whilst we're at it: (1) Rename them to be KEY_NEED_xxx rather than KEY_xxx to avoid collisions with symbols in uapi/linux/input.h. (2) Don't use key_perm_t for a mask of required permissions, but rather limit it to the permissions mask attached to the key and arguments related directly to that. Signed-off-by: David Howells Tested-by: Dmitry Kasatkin --- include/linux/key.h | 11 +++++++++++ include/linux/security.h | 6 +++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 80d677483e31..cd0abb8c9c33 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -309,6 +309,17 @@ static inline key_serial_t key_serial(const struct key *key) extern void key_set_timeout(struct key *, unsigned); +/* + * The permissions required on a key that we're looking up. + */ +#define KEY_NEED_VIEW 0x01 /* Require permission to view attributes */ +#define KEY_NEED_READ 0x02 /* Require permission to read content */ +#define KEY_NEED_WRITE 0x04 /* Require permission to update / modify */ +#define KEY_NEED_SEARCH 0x08 /* Require permission to search (keyring) or find (key) */ +#define KEY_NEED_LINK 0x10 /* Require permission to link */ +#define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ +#define KEY_NEED_ALL 0x3f /* All the above permissions */ + /** * key_is_instantiated - Determine if a key has been positively instantiated * @key: The key to check. diff --git a/include/linux/security.h b/include/linux/security.h index 5623a7f965b7..3a5ed0cd2751 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1707,7 +1707,7 @@ struct security_operations { void (*key_free) (struct key *key); int (*key_permission) (key_ref_t key_ref, const struct cred *cred, - key_perm_t perm); + unsigned perm); int (*key_getsecurity)(struct key *key, char **_buffer); #endif /* CONFIG_KEYS */ @@ -3026,7 +3026,7 @@ static inline int security_path_chroot(struct path *path) int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags); void security_key_free(struct key *key); int security_key_permission(key_ref_t key_ref, - const struct cred *cred, key_perm_t perm); + const struct cred *cred, unsigned perm); int security_key_getsecurity(struct key *key, char **_buffer); #else @@ -3044,7 +3044,7 @@ static inline void security_key_free(struct key *key) static inline int security_key_permission(key_ref_t key_ref, const struct cred *cred, - key_perm_t perm) + unsigned perm) { return 0; } -- cgit v1.2.3-71-gd317 From 848ef58695d8c013f24633352586279cfb40e9d9 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 1 Apr 2014 17:02:53 +0300 Subject: net: rfkill: gpio: remove unused and obsolete platform parameters After upgrading to descriptor based gpios, the gpio numbers are not used anymore. The power_clk_name and the platform specific setup and close hooks are not used by anybody, and we should not encourage use of such things, so removing them. 
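With the trimmed-down structure, boards that still use platform data pass only the name and the rfkill type; the reset and shutdown lines are picked up through the descriptor-based gpio lookup instead. A minimal sketch (the instance name and type below are just examples, not taken from any board file):

	static struct rfkill_gpio_platform_data rfkill_pdata = {
		.name = "wlan_rfkill",		/* name for the rfkill instance */
		.type = RFKILL_TYPE_WLAN,	/* any enum rfkill_type value */
	};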
Acked-by: Alexandre Courbot Reviewed-by: Linus Walleij Signed-off-by: Heikki Krogerus Signed-off-by: Johannes Berg --- include/linux/rfkill-gpio.h | 10 ---------- net/rfkill/rfkill-gpio.c | 15 +-------------- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rfkill-gpio.h b/include/linux/rfkill-gpio.h index 4d09f6eab359..20bcb55498cd 100644 --- a/include/linux/rfkill-gpio.h +++ b/include/linux/rfkill-gpio.h @@ -27,21 +27,11 @@ * struct rfkill_gpio_platform_data - platform data for rfkill gpio device. * for unused gpio's, the expected value is -1. * @name: name for the gpio rf kill instance - * @reset_gpio: GPIO which is used for reseting rfkill switch - * @shutdown_gpio: GPIO which is used for shutdown of rfkill switch - * @power_clk_name: [optional] name of clk to turn off while blocked - * @gpio_runtime_close: clean up platform specific gpio configuration - * @gpio_runtime_setup: set up platform specific gpio configuration */ struct rfkill_gpio_platform_data { char *name; - int reset_gpio; - int shutdown_gpio; - const char *power_clk_name; enum rfkill_type type; - void (*gpio_runtime_close)(struct platform_device *); - int (*gpio_runtime_setup)(struct platform_device *); }; #endif /* __RFKILL_GPIO_H */ diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index bd2a5b90400c..0adda445dfe7 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -87,7 +87,6 @@ static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data; struct rfkill_gpio_data *rfkill; - const char *clk_name = NULL; struct gpio_desc *gpio; int ret; int len; @@ -101,7 +100,6 @@ static int rfkill_gpio_probe(struct platform_device *pdev) if (ret) return ret; } else if (pdata) { - clk_name = pdata->power_clk_name; rfkill->name = pdata->name; rfkill->type = pdata->type; } else { @@ -120,7 +118,7 @@ static int rfkill_gpio_probe(struct platform_device *pdev) snprintf(rfkill->reset_name, len + 6 , "%s_reset", rfkill->name); snprintf(rfkill->shutdown_name, len + 9, "%s_shutdown", rfkill->name); - rfkill->clk = devm_clk_get(&pdev->dev, clk_name); + rfkill->clk = devm_clk_get(&pdev->dev, NULL); gpio = devm_gpiod_get_index(&pdev->dev, rfkill->reset_name, 0); if (!IS_ERR(gpio)) { @@ -146,14 +144,6 @@ static int rfkill_gpio_probe(struct platform_device *pdev) return -EINVAL; } - if (pdata && pdata->gpio_runtime_setup) { - ret = pdata->gpio_runtime_setup(pdev); - if (ret) { - dev_err(&pdev->dev, "can't set up gpio\n"); - return ret; - } - } - rfkill->rfkill_dev = rfkill_alloc(rfkill->name, &pdev->dev, rfkill->type, &rfkill_gpio_ops, rfkill); @@ -174,10 +164,7 @@ static int rfkill_gpio_remove(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill = platform_get_drvdata(pdev); - struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data; - if (pdata && pdata->gpio_runtime_close) - pdata->gpio_runtime_close(pdev); rfkill_unregister(rfkill->rfkill_dev); rfkill_destroy(rfkill->rfkill_dev); -- cgit v1.2.3-71-gd317 From f39d2fa0122e6abd8505a3598f3aa535d0d5aade Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 24 Feb 2014 18:37:35 +0800 Subject: mtd: spi-nor: copy the SPI NOR commands to a new header file This patch adds a new header, spi-nor.h, and copies all the SPI NOR commands and related macros into it. This header can be used by m25p80.c and by other SPI NOR controller drivers, such as Freescale's Quadspi. 
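Any file that includes the new header can then share these definitions rather than carrying private copies. A trivial sketch (spi_nor_sr_busy() is a hypothetical helper, not something the header itself provides):

	#include <linux/mtd/spi-nor.h>

	/* Test the Write-In-Progress bit of a status register value. */
	static inline bool spi_nor_sr_busy(u8 sr)
	{
		return sr & SR_WIP;
	}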
Signed-off-by: Huang Shijie Acked-by: Marek Vasut Signed-off-by: Brian Norris --- include/linux/mtd/spi-nor.h | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 include/linux/mtd/spi-nor.h (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h new file mode 100644 index 000000000000..483fc2a086c4 --- /dev/null +++ b/include/linux/mtd/spi-nor.h @@ -0,0 +1,55 @@ +#ifndef __LINUX_MTD_SPI_NOR_H +#define __LINUX_MTD_SPI_NOR_H + +/* Flash opcodes. */ +#define OPCODE_WREN 0x06 /* Write enable */ +#define OPCODE_RDSR 0x05 /* Read status register */ +#define OPCODE_WRSR 0x01 /* Write status register 1 byte */ +#define OPCODE_NORM_READ 0x03 /* Read data bytes (low frequency) */ +#define OPCODE_FAST_READ 0x0b /* Read data bytes (high frequency) */ +#define OPCODE_DUAL_READ 0x3b /* Read data bytes (Dual SPI) */ +#define OPCODE_QUAD_READ 0x6b /* Read data bytes (Quad SPI) */ +#define OPCODE_PP 0x02 /* Page program (up to 256 bytes) */ +#define OPCODE_BE_4K 0x20 /* Erase 4KiB block */ +#define OPCODE_BE_4K_PMC 0xd7 /* Erase 4KiB block on PMC chips */ +#define OPCODE_BE_32K 0x52 /* Erase 32KiB block */ +#define OPCODE_CHIP_ERASE 0xc7 /* Erase whole flash chip */ +#define OPCODE_SE 0xd8 /* Sector erase (usually 64KiB) */ +#define OPCODE_RDID 0x9f /* Read JEDEC ID */ +#define OPCODE_RDCR 0x35 /* Read configuration register */ + +/* 4-byte address opcodes - used on Spansion and some Macronix flashes. */ +#define OPCODE_NORM_READ_4B 0x13 /* Read data bytes (low frequency) */ +#define OPCODE_FAST_READ_4B 0x0c /* Read data bytes (high frequency) */ +#define OPCODE_DUAL_READ_4B 0x3c /* Read data bytes (Dual SPI) */ +#define OPCODE_QUAD_READ_4B 0x6c /* Read data bytes (Quad SPI) */ +#define OPCODE_PP_4B 0x12 /* Page program (up to 256 bytes) */ +#define OPCODE_SE_4B 0xdc /* Sector erase (usually 64KiB) */ + +/* Used for SST flashes only. */ +#define OPCODE_BP 0x02 /* Byte program */ +#define OPCODE_WRDI 0x04 /* Write disable */ +#define OPCODE_AAI_WP 0xad /* Auto address increment word program */ + +/* Used for Macronix and Winbond flashes. */ +#define OPCODE_EN4B 0xb7 /* Enter 4-byte mode */ +#define OPCODE_EX4B 0xe9 /* Exit 4-byte mode */ + +/* Used for Spansion flashes only. */ +#define OPCODE_BRWR 0x17 /* Bank register write */ + +/* Status Register bits. */ +#define SR_WIP 1 /* Write in progress */ +#define SR_WEL 2 /* Write enable latch */ +/* meaning of other SR_* bits may differ between vendors */ +#define SR_BP0 4 /* Block protect 0 */ +#define SR_BP1 8 /* Block protect 1 */ +#define SR_BP2 0x10 /* Block protect 2 */ +#define SR_SRWD 0x80 /* SR write protect */ + +#define SR_QUAD_EN_MX 0x40 /* Macronix Quad I/O */ + +/* Configuration Register bits. */ +#define CR_QUAD_EN_SPAN 0x2 /* Spansion Quad I/O */ + +#endif -- cgit v1.2.3-71-gd317 From 6e602ef73334550bbbb8be1041a3ce6eecbd42f1 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 24 Feb 2014 18:37:36 +0800 Subject: mtd: spi-nor: add the basic data structures The spi_nor{} is cloned from the m25p{}. The spi_nor{} can be used by both the m25p80 driver and SPI NOR controller drivers. We also add the spi_nor_xfer_cfg{}, which is used by the two fundamental primitives: read_xfer/write_xfer. 1) the hooks for spi_nor{}: @prepare/unprepare: used to do some work before or after the read/write/erase/lock/unlock operations. @read_xfer/write_xfer: the driver can implement all of the following hooks on top of these two primitives if it chooses to. 
@read_reg: used to read the registers, such as read status register, read configure register. @write_reg: used to write the registers, such as write enable, erase sector. @read_id: read out the ID info. @wait_till_ready: wait till the NOR becomes ready. @read: read out the data from the NOR. @write: write data to the NOR. @erase: erase a sector of the NOR. 2) Add a new field sst_write_second for the SST NOR write. Signed-off-by: Huang Shijie Acked-by: Marek Vasut Signed-off-by: Brian Norris --- include/linux/mtd/spi-nor.h | 110 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 483fc2a086c4..3a3c3872c8cd 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -52,4 +52,114 @@ /* Configuration Register bits. */ #define CR_QUAD_EN_SPAN 0x2 /* Spansion Quad I/O */ +enum read_mode { + SPI_NOR_NORMAL = 0, + SPI_NOR_FAST, + SPI_NOR_DUAL, + SPI_NOR_QUAD, +}; + +/** + * struct spi_nor_xfer_cfg - Structure for defining a Serial Flash transfer + * @wren: command for "Write Enable", or 0x00 for not required + * @cmd: command for operation + * @cmd_pins: number of pins to send @cmd (1, 2, 4) + * @addr: address for operation + * @addr_pins: number of pins to send @addr (1, 2, 4) + * @addr_width: number of address bytes + * (3,4, or 0 for address not required) + * @mode: mode data + * @mode_pins: number of pins to send @mode (1, 2, 4) + * @mode_cycles: number of mode cycles (0 for mode not required) + * @dummy_cycles: number of dummy cycles (0 for dummy not required) + */ +struct spi_nor_xfer_cfg { + u8 wren; + u8 cmd; + u8 cmd_pins; + u32 addr; + u8 addr_pins; + u8 addr_width; + u8 mode; + u8 mode_pins; + u8 mode_cycles; + u8 dummy_cycles; +}; + +#define SPI_NOR_MAX_CMD_SIZE 8 +enum spi_nor_ops { + SPI_NOR_OPS_READ = 0, + SPI_NOR_OPS_WRITE, + SPI_NOR_OPS_ERASE, + SPI_NOR_OPS_LOCK, + SPI_NOR_OPS_UNLOCK, +}; + +/** + * struct spi_nor - Structure for defining a the SPI NOR layer + * @mtd: point to a mtd_info structure + * @lock: the lock for the read/write/erase/lock/unlock operations + * @dev: point to a spi device, or a spi nor controller device. 
+ * @page_size: the page size of the SPI NOR + * @addr_width: number of address bytes + * @erase_opcode: the opcode for erasing a sector + * @read_opcode: the read opcode + * @read_dummy: the dummy needed by the read operation + * @program_opcode: the program opcode + * @flash_read: the mode of the read + * @sst_write_second: used by the SST write operation + * @cfg: used by the read_xfer/write_xfer + * @cmd_buf: used by the write_reg + * @prepare: [OPTIONAL] do some preparations for the + * read/write/erase/lock/unlock operations + * @unprepare: [OPTIONAL] do some post work after the + * read/write/erase/lock/unlock operations + * @read_xfer: [OPTIONAL] the read fundamental primitive + * @write_xfer: [OPTIONAL] the write fundamental primitive + * @read_reg: [DRIVER-SPECIFIC] read out the register + * @write_reg: [DRIVER-SPECIFIC] write data to the register + * @read_id: [REPLACEABLE] read out the ID data, and find + * the proper spi_device_id + * @wait_till_ready: [REPLACEABLE] wait till the NOR becomes ready + * @read: [DRIVER-SPECIFIC] read data from the SPI NOR + * @write: [DRIVER-SPECIFIC] write data to the SPI NOR + * @erase: [DRIVER-SPECIFIC] erase a sector of the SPI NOR + * at the offset @offs + * @priv: the private data + */ +struct spi_nor { + struct mtd_info *mtd; + struct mutex lock; + struct device *dev; + u32 page_size; + u8 addr_width; + u8 erase_opcode; + u8 read_opcode; + u8 read_dummy; + u8 program_opcode; + enum read_mode flash_read; + bool sst_write_second; + struct spi_nor_xfer_cfg cfg; + u8 cmd_buf[SPI_NOR_MAX_CMD_SIZE]; + + int (*prepare)(struct spi_nor *nor, enum spi_nor_ops ops); + void (*unprepare)(struct spi_nor *nor, enum spi_nor_ops ops); + int (*read_xfer)(struct spi_nor *nor, struct spi_nor_xfer_cfg *cfg, + u8 *buf, size_t len); + int (*write_xfer)(struct spi_nor *nor, struct spi_nor_xfer_cfg *cfg, + u8 *buf, size_t len); + int (*read_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); + int (*write_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len, + int write_enable); + const struct spi_device_id *(*read_id)(struct spi_nor *nor); + int (*wait_till_ready)(struct spi_nor *nor); + + int (*read)(struct spi_nor *nor, loff_t from, + size_t len, size_t *retlen, u_char *read_buf); + void (*write)(struct spi_nor *nor, loff_t to, + size_t len, size_t *retlen, const u_char *write_buf); + int (*erase)(struct spi_nor *nor, loff_t offs); + + void *priv; +}; #endif -- cgit v1.2.3-71-gd317 From b199489d37b21c5e294f95bf265acc5dde3fc3a2 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 24 Feb 2014 18:37:37 +0800 Subject: mtd: spi-nor: add the framework for SPI NOR This patch clones most of the code from m25p80.c and, in effect, adds a new spi-nor layer. Before this patch, the layering is:

	MTD
	------------------------
	m25p80
	------------------------
	spi bus driver
	------------------------
	SPI NOR chip

After this patch, the layering is:

	MTD
	------------------------
	spi-nor
	------------------------
	m25p80
	------------------------
	spi bus driver
	------------------------
	SPI NOR chip

With a spi-nor controller driver (Freescale Quadspi), it looks like:

	MTD
	------------------------
	spi-nor
	------------------------
	fsl-quadspi
	------------------------
	SPI NOR chip

New APIs: spi_nor_scan: used to scan a spi-nor flash. 
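A controller driver fills in the mandatory hooks and then hands the structure to spi_nor_scan(). Roughly, at probe time (a sketch only: the my_* names are placeholders for the driver's own callbacks and data, and error handling is omitted):

	struct spi_nor *nor = &my_controller->nor;

	nor->dev = &pdev->dev;
	nor->mtd = &my_controller->mtd;
	nor->read = my_read;		/* mandatory hooks checked by spi_nor_check() */
	nor->write = my_write;
	nor->erase = my_erase;
	nor->read_reg = my_read_reg;
	nor->write_reg = my_write_reg;

	ret = spi_nor_scan(nor, id, SPI_NOR_QUAD);

On success, spi_nor_scan() has filled in the rest of the spi_nor{} and the mtd_info{}, and the driver registers the mtd device as usual.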
Signed-off-by: Huang Shijie Acked-by: Marek Vasut [Brian: rebased to include additional m25p_ids[] entry] Signed-off-by: Brian Norris --- drivers/mtd/Kconfig | 2 + drivers/mtd/Makefile | 1 + drivers/mtd/spi-nor/Kconfig | 6 + drivers/mtd/spi-nor/Makefile | 1 + drivers/mtd/spi-nor/spi-nor.c | 1088 +++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/spi-nor.h | 20 + 6 files changed, 1118 insertions(+) create mode 100644 drivers/mtd/spi-nor/Kconfig create mode 100644 drivers/mtd/spi-nor/Makefile create mode 100644 drivers/mtd/spi-nor/spi-nor.c (limited to 'include/linux') diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 5d49a2129618..94b821042d9d 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -321,6 +321,8 @@ source "drivers/mtd/onenand/Kconfig" source "drivers/mtd/lpddr/Kconfig" +source "drivers/mtd/spi-nor/Kconfig" + source "drivers/mtd/ubi/Kconfig" endif # MTD diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 4cfb31e6c966..40fd15344387 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -32,4 +32,5 @@ inftl-objs := inftlcore.o inftlmount.o obj-y += chips/ lpddr/ maps/ devices/ nand/ onenand/ tests/ +obj-$(CONFIG_MTD_SPI_NOR_BASE) += spi-nor/ obj-$(CONFIG_MTD_UBI) += ubi/ diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig new file mode 100644 index 000000000000..41591af67aca --- /dev/null +++ b/drivers/mtd/spi-nor/Kconfig @@ -0,0 +1,6 @@ +config MTD_SPI_NOR_BASE + bool "the framework for SPI-NOR support" + depends on MTD + help + This is the framework for the SPI NOR which can be used by the SPI + device drivers and the SPI-NOR device driver. diff --git a/drivers/mtd/spi-nor/Makefile b/drivers/mtd/spi-nor/Makefile new file mode 100644 index 000000000000..7dfe1f9b6940 --- /dev/null +++ b/drivers/mtd/spi-nor/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_MTD_SPI_NOR_BASE) += spi-nor.o diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c new file mode 100644 index 000000000000..50b929095bdb --- /dev/null +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -0,0 +1,1088 @@ +/* + * Cloned most of the code from the m25p80.c + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Define max times to check status register before we give up. */ +#define MAX_READY_WAIT_JIFFIES (40 * HZ) /* M25P16 specs 40s max chip erase */ + +#define JEDEC_MFR(_jedec_id) ((_jedec_id) >> 16) + +/* + * Read the status register, returning its value in the location + * Return the status register value. + * Returns negative if error occurred. + */ +static int read_sr(struct spi_nor *nor) +{ + int ret; + u8 val; + + ret = nor->read_reg(nor, OPCODE_RDSR, &val, 1); + if (ret < 0) { + pr_err("error %d reading SR\n", (int) ret); + return ret; + } + + return val; +} + +/* + * Read configuration register, returning its value in the + * location. Return the configuration register value. + * Returns negative if error occured. + */ +static int read_cr(struct spi_nor *nor) +{ + int ret; + u8 val; + + ret = nor->read_reg(nor, OPCODE_RDCR, &val, 1); + if (ret < 0) { + dev_err(nor->dev, "error %d reading CR\n", ret); + return ret; + } + + return val; +} + +/* + * Dummy Cycle calculation for different type of read. + * It can be used to support more commands with + * different dummy cycle requirements. 
+ */ +static inline int spi_nor_read_dummy_cycles(struct spi_nor *nor) +{ + switch (nor->flash_read) { + case SPI_NOR_FAST: + case SPI_NOR_DUAL: + case SPI_NOR_QUAD: + return 1; + case SPI_NOR_NORMAL: + return 0; + } + return 0; +} + +/* + * Write status register 1 byte + * Returns negative if error occurred. + */ +static inline int write_sr(struct spi_nor *nor, u8 val) +{ + nor->cmd_buf[0] = val; + return nor->write_reg(nor, OPCODE_WRSR, nor->cmd_buf, 1, 0); +} + +/* + * Set write enable latch with Write Enable command. + * Returns negative if error occurred. + */ +static inline int write_enable(struct spi_nor *nor) +{ + return nor->write_reg(nor, OPCODE_WREN, NULL, 0, 0); +} + +/* + * Send write disble instruction to the chip. + */ +static inline int write_disable(struct spi_nor *nor) +{ + return nor->write_reg(nor, OPCODE_WRDI, NULL, 0, 0); +} + +static inline struct spi_nor *mtd_to_spi_nor(struct mtd_info *mtd) +{ + return mtd->priv; +} + +/* Enable/disable 4-byte addressing mode. */ +static inline int set_4byte(struct spi_nor *nor, u32 jedec_id, int enable) +{ + int status; + bool need_wren = false; + u8 cmd; + + switch (JEDEC_MFR(jedec_id)) { + case CFI_MFR_ST: /* Micron, actually */ + /* Some Micron need WREN command; all will accept it */ + need_wren = true; + case CFI_MFR_MACRONIX: + case 0xEF /* winbond */: + if (need_wren) + write_enable(nor); + + cmd = enable ? OPCODE_EN4B : OPCODE_EX4B; + status = nor->write_reg(nor, cmd, NULL, 0, 0); + if (need_wren) + write_disable(nor); + + return status; + default: + /* Spansion style */ + nor->cmd_buf[0] = enable << 7; + return nor->write_reg(nor, OPCODE_BRWR, nor->cmd_buf, 1, 0); + } +} + +static int spi_nor_wait_till_ready(struct spi_nor *nor) +{ + unsigned long deadline; + int sr; + + deadline = jiffies + MAX_READY_WAIT_JIFFIES; + + do { + cond_resched(); + + sr = read_sr(nor); + if (sr < 0) + break; + else if (!(sr & SR_WIP)) + return 0; + } while (!time_after_eq(jiffies, deadline)); + + return -ETIMEDOUT; +} + +/* + * Service routine to read status register until ready, or timeout occurs. + * Returns non-zero if error. + */ +static int wait_till_ready(struct spi_nor *nor) +{ + return nor->wait_till_ready(nor); +} + +/* + * Erase the whole flash memory + * + * Returns 0 if successful, non-zero otherwise. + */ +static int erase_chip(struct spi_nor *nor) +{ + int ret; + + dev_dbg(nor->dev, " %lldKiB\n", (long long)(nor->mtd->size >> 10)); + + /* Wait until finished previous write command. */ + ret = wait_till_ready(nor); + if (ret) + return ret; + + /* Send write enable, then erase commands. */ + write_enable(nor); + + return nor->write_reg(nor, OPCODE_CHIP_ERASE, NULL, 0, 0); +} + +static int spi_nor_lock_and_prep(struct spi_nor *nor, enum spi_nor_ops ops) +{ + int ret = 0; + + mutex_lock(&nor->lock); + + if (nor->prepare) { + ret = nor->prepare(nor, ops); + if (ret) { + dev_err(nor->dev, "failed in the preparation.\n"); + mutex_unlock(&nor->lock); + return ret; + } + } + return ret; +} + +static void spi_nor_unlock_and_unprep(struct spi_nor *nor, enum spi_nor_ops ops) +{ + if (nor->unprepare) + nor->unprepare(nor, ops); + mutex_unlock(&nor->lock); +} + +/* + * Erase an address range on the nor chip. The address range may extend + * one or more erase sectors. Return an error is there is a problem erasing. 
+ */ +static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + u32 addr, len; + uint32_t rem; + int ret; + + dev_dbg(nor->dev, "at 0x%llx, len %lld\n", (long long)instr->addr, + (long long)instr->len); + + div_u64_rem(instr->len, mtd->erasesize, &rem); + if (rem) + return -EINVAL; + + addr = instr->addr; + len = instr->len; + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_ERASE); + if (ret) + return ret; + + /* whole-chip erase? */ + if (len == mtd->size) { + if (erase_chip(nor)) { + ret = -EIO; + goto erase_err; + } + + /* REVISIT in some cases we could speed up erasing large regions + * by using OPCODE_SE instead of OPCODE_BE_4K. We may have set up + * to use "small sector erase", but that's not always optimal. + */ + + /* "sector"-at-a-time erase */ + } else { + while (len) { + if (nor->erase(nor, addr)) { + ret = -EIO; + goto erase_err; + } + + addr += mtd->erasesize; + len -= mtd->erasesize; + } + } + + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_ERASE); + + instr->state = MTD_ERASE_DONE; + mtd_erase_callback(instr); + + return ret; + +erase_err: + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_ERASE); + instr->state = MTD_ERASE_FAILED; + return ret; +} + +static int spi_nor_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + uint32_t offset = ofs; + uint8_t status_old, status_new; + int ret = 0; + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_LOCK); + if (ret) + return ret; + + /* Wait until finished previous command */ + ret = wait_till_ready(nor); + if (ret) + goto err; + + status_old = read_sr(nor); + + if (offset < mtd->size - (mtd->size / 2)) + status_new = status_old | SR_BP2 | SR_BP1 | SR_BP0; + else if (offset < mtd->size - (mtd->size / 4)) + status_new = (status_old & ~SR_BP0) | SR_BP2 | SR_BP1; + else if (offset < mtd->size - (mtd->size / 8)) + status_new = (status_old & ~SR_BP1) | SR_BP2 | SR_BP0; + else if (offset < mtd->size - (mtd->size / 16)) + status_new = (status_old & ~(SR_BP0 | SR_BP1)) | SR_BP2; + else if (offset < mtd->size - (mtd->size / 32)) + status_new = (status_old & ~SR_BP2) | SR_BP1 | SR_BP0; + else if (offset < mtd->size - (mtd->size / 64)) + status_new = (status_old & ~(SR_BP2 | SR_BP0)) | SR_BP1; + else + status_new = (status_old & ~(SR_BP2 | SR_BP1)) | SR_BP0; + + /* Only modify protection if it will not unlock other areas */ + if ((status_new & (SR_BP2 | SR_BP1 | SR_BP0)) > + (status_old & (SR_BP2 | SR_BP1 | SR_BP0))) { + write_enable(nor); + ret = write_sr(nor, status_new); + if (ret) + goto err; + } + +err: + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_LOCK); + return ret; +} + +static int spi_nor_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + uint32_t offset = ofs; + uint8_t status_old, status_new; + int ret = 0; + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_UNLOCK); + if (ret) + return ret; + + /* Wait until finished previous command */ + ret = wait_till_ready(nor); + if (ret) + goto err; + + status_old = read_sr(nor); + + if (offset+len > mtd->size - (mtd->size / 64)) + status_new = status_old & ~(SR_BP2 | SR_BP1 | SR_BP0); + else if (offset+len > mtd->size - (mtd->size / 32)) + status_new = (status_old & ~(SR_BP2 | SR_BP1)) | SR_BP0; + else if (offset+len > mtd->size - (mtd->size / 16)) + status_new = (status_old & ~(SR_BP2 | SR_BP0)) | SR_BP1; + else if (offset+len > mtd->size - (mtd->size / 8)) + status_new = (status_old & ~SR_BP2) | SR_BP1 | SR_BP0; + else if (offset+len > 
mtd->size - (mtd->size / 4)) + status_new = (status_old & ~(SR_BP0 | SR_BP1)) | SR_BP2; + else if (offset+len > mtd->size - (mtd->size / 2)) + status_new = (status_old & ~SR_BP1) | SR_BP2 | SR_BP0; + else + status_new = (status_old & ~SR_BP0) | SR_BP2 | SR_BP1; + + /* Only modify protection if it will not lock other areas */ + if ((status_new & (SR_BP2 | SR_BP1 | SR_BP0)) < + (status_old & (SR_BP2 | SR_BP1 | SR_BP0))) { + write_enable(nor); + ret = write_sr(nor, status_new); + if (ret) + goto err; + } + +err: + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_UNLOCK); + return ret; +} + +struct flash_info { + /* JEDEC id zero means "no ID" (most older chips); otherwise it has + * a high byte of zero plus three data bytes: the manufacturer id, + * then a two byte device id. + */ + u32 jedec_id; + u16 ext_id; + + /* The size listed here is what works with OPCODE_SE, which isn't + * necessarily called a "sector" by the vendor. + */ + unsigned sector_size; + u16 n_sectors; + + u16 page_size; + u16 addr_width; + + u16 flags; +#define SECT_4K 0x01 /* OPCODE_BE_4K works uniformly */ +#define SPI_NOR_NO_ERASE 0x02 /* No erase command needed */ +#define SST_WRITE 0x04 /* use SST byte programming */ +#define SPI_NOR_NO_FR 0x08 /* Can't do fastread */ +#define SECT_4K_PMC 0x10 /* OPCODE_BE_4K_PMC works uniformly */ +#define SPI_NOR_DUAL_READ 0x20 /* Flash supports Dual Read */ +#define SPI_NOR_QUAD_READ 0x40 /* Flash supports Quad Read */ +}; + +#define INFO(_jedec_id, _ext_id, _sector_size, _n_sectors, _flags) \ + ((kernel_ulong_t)&(struct flash_info) { \ + .jedec_id = (_jedec_id), \ + .ext_id = (_ext_id), \ + .sector_size = (_sector_size), \ + .n_sectors = (_n_sectors), \ + .page_size = 256, \ + .flags = (_flags), \ + }) + +#define CAT25_INFO(_sector_size, _n_sectors, _page_size, _addr_width, _flags) \ + ((kernel_ulong_t)&(struct flash_info) { \ + .sector_size = (_sector_size), \ + .n_sectors = (_n_sectors), \ + .page_size = (_page_size), \ + .addr_width = (_addr_width), \ + .flags = (_flags), \ + }) + +/* NOTE: double check command sets and memory organization when you add + * more nor chips. This current list focusses on newer chips, which + * have been converging on command sets which including JEDEC ID. 
+ */ +const struct spi_device_id spi_nor_ids[] = { + /* Atmel -- some are (confusingly) marketed as "DataFlash" */ + { "at25fs010", INFO(0x1f6601, 0, 32 * 1024, 4, SECT_4K) }, + { "at25fs040", INFO(0x1f6604, 0, 64 * 1024, 8, SECT_4K) }, + + { "at25df041a", INFO(0x1f4401, 0, 64 * 1024, 8, SECT_4K) }, + { "at25df321a", INFO(0x1f4701, 0, 64 * 1024, 64, SECT_4K) }, + { "at25df641", INFO(0x1f4800, 0, 64 * 1024, 128, SECT_4K) }, + + { "at26f004", INFO(0x1f0400, 0, 64 * 1024, 8, SECT_4K) }, + { "at26df081a", INFO(0x1f4501, 0, 64 * 1024, 16, SECT_4K) }, + { "at26df161a", INFO(0x1f4601, 0, 64 * 1024, 32, SECT_4K) }, + { "at26df321", INFO(0x1f4700, 0, 64 * 1024, 64, SECT_4K) }, + + { "at45db081d", INFO(0x1f2500, 0, 64 * 1024, 16, SECT_4K) }, + + /* EON -- en25xxx */ + { "en25f32", INFO(0x1c3116, 0, 64 * 1024, 64, SECT_4K) }, + { "en25p32", INFO(0x1c2016, 0, 64 * 1024, 64, 0) }, + { "en25q32b", INFO(0x1c3016, 0, 64 * 1024, 64, 0) }, + { "en25p64", INFO(0x1c2017, 0, 64 * 1024, 128, 0) }, + { "en25q64", INFO(0x1c3017, 0, 64 * 1024, 128, SECT_4K) }, + { "en25qh256", INFO(0x1c7019, 0, 64 * 1024, 512, 0) }, + + /* ESMT */ + { "f25l32pa", INFO(0x8c2016, 0, 64 * 1024, 64, SECT_4K) }, + + /* Everspin */ + { "mr25h256", CAT25_INFO( 32 * 1024, 1, 256, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, + { "mr25h10", CAT25_INFO(128 * 1024, 1, 256, 3, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, + + /* GigaDevice */ + { "gd25q32", INFO(0xc84016, 0, 64 * 1024, 64, SECT_4K) }, + { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) }, + + /* Intel/Numonyx -- xxxs33b */ + { "160s33b", INFO(0x898911, 0, 64 * 1024, 32, 0) }, + { "320s33b", INFO(0x898912, 0, 64 * 1024, 64, 0) }, + { "640s33b", INFO(0x898913, 0, 64 * 1024, 128, 0) }, + + /* Macronix */ + { "mx25l2005a", INFO(0xc22012, 0, 64 * 1024, 4, SECT_4K) }, + { "mx25l4005a", INFO(0xc22013, 0, 64 * 1024, 8, SECT_4K) }, + { "mx25l8005", INFO(0xc22014, 0, 64 * 1024, 16, 0) }, + { "mx25l1606e", INFO(0xc22015, 0, 64 * 1024, 32, SECT_4K) }, + { "mx25l3205d", INFO(0xc22016, 0, 64 * 1024, 64, 0) }, + { "mx25l3255e", INFO(0xc29e16, 0, 64 * 1024, 64, SECT_4K) }, + { "mx25l6405d", INFO(0xc22017, 0, 64 * 1024, 128, 0) }, + { "mx25l12805d", INFO(0xc22018, 0, 64 * 1024, 256, 0) }, + { "mx25l12855e", INFO(0xc22618, 0, 64 * 1024, 256, 0) }, + { "mx25l25635e", INFO(0xc22019, 0, 64 * 1024, 512, 0) }, + { "mx25l25655e", INFO(0xc22619, 0, 64 * 1024, 512, 0) }, + { "mx66l51235l", INFO(0xc2201a, 0, 64 * 1024, 1024, SPI_NOR_QUAD_READ) }, + { "mx66l1g55g", INFO(0xc2261b, 0, 64 * 1024, 2048, SPI_NOR_QUAD_READ) }, + + /* Micron */ + { "n25q064", INFO(0x20ba17, 0, 64 * 1024, 128, 0) }, + { "n25q128a11", INFO(0x20bb18, 0, 64 * 1024, 256, 0) }, + { "n25q128a13", INFO(0x20ba18, 0, 64 * 1024, 256, 0) }, + { "n25q256a", INFO(0x20ba19, 0, 64 * 1024, 512, SECT_4K) }, + { "n25q512a", INFO(0x20bb20, 0, 64 * 1024, 1024, SECT_4K) }, + + /* PMC */ + { "pm25lv512", INFO(0, 0, 32 * 1024, 2, SECT_4K_PMC) }, + { "pm25lv010", INFO(0, 0, 32 * 1024, 4, SECT_4K_PMC) }, + { "pm25lq032", INFO(0x7f9d46, 0, 64 * 1024, 64, SECT_4K) }, + + /* Spansion -- single (large) sector size only, at least + * for the chips listed here (without boot sectors). 
+ */ + { "s25sl032p", INFO(0x010215, 0x4d00, 64 * 1024, 64, 0) }, + { "s25sl064p", INFO(0x010216, 0x4d00, 64 * 1024, 128, 0) }, + { "s25fl256s0", INFO(0x010219, 0x4d00, 256 * 1024, 128, 0) }, + { "s25fl256s1", INFO(0x010219, 0x4d01, 64 * 1024, 512, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, + { "s25fl512s", INFO(0x010220, 0x4d00, 256 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, + { "s70fl01gs", INFO(0x010221, 0x4d00, 256 * 1024, 256, 0) }, + { "s25sl12800", INFO(0x012018, 0x0300, 256 * 1024, 64, 0) }, + { "s25sl12801", INFO(0x012018, 0x0301, 64 * 1024, 256, 0) }, + { "s25fl129p0", INFO(0x012018, 0x4d00, 256 * 1024, 64, 0) }, + { "s25fl129p1", INFO(0x012018, 0x4d01, 64 * 1024, 256, 0) }, + { "s25sl004a", INFO(0x010212, 0, 64 * 1024, 8, 0) }, + { "s25sl008a", INFO(0x010213, 0, 64 * 1024, 16, 0) }, + { "s25sl016a", INFO(0x010214, 0, 64 * 1024, 32, 0) }, + { "s25sl032a", INFO(0x010215, 0, 64 * 1024, 64, 0) }, + { "s25sl064a", INFO(0x010216, 0, 64 * 1024, 128, 0) }, + { "s25fl008k", INFO(0xef4014, 0, 64 * 1024, 16, SECT_4K) }, + { "s25fl016k", INFO(0xef4015, 0, 64 * 1024, 32, SECT_4K) }, + { "s25fl064k", INFO(0xef4017, 0, 64 * 1024, 128, SECT_4K) }, + + /* SST -- large erase sizes are "overlays", "sectors" are 4K */ + { "sst25vf040b", INFO(0xbf258d, 0, 64 * 1024, 8, SECT_4K | SST_WRITE) }, + { "sst25vf080b", INFO(0xbf258e, 0, 64 * 1024, 16, SECT_4K | SST_WRITE) }, + { "sst25vf016b", INFO(0xbf2541, 0, 64 * 1024, 32, SECT_4K | SST_WRITE) }, + { "sst25vf032b", INFO(0xbf254a, 0, 64 * 1024, 64, SECT_4K | SST_WRITE) }, + { "sst25vf064c", INFO(0xbf254b, 0, 64 * 1024, 128, SECT_4K) }, + { "sst25wf512", INFO(0xbf2501, 0, 64 * 1024, 1, SECT_4K | SST_WRITE) }, + { "sst25wf010", INFO(0xbf2502, 0, 64 * 1024, 2, SECT_4K | SST_WRITE) }, + { "sst25wf020", INFO(0xbf2503, 0, 64 * 1024, 4, SECT_4K | SST_WRITE) }, + { "sst25wf040", INFO(0xbf2504, 0, 64 * 1024, 8, SECT_4K | SST_WRITE) }, + + /* ST Microelectronics -- newer production may have feature updates */ + { "m25p05", INFO(0x202010, 0, 32 * 1024, 2, 0) }, + { "m25p10", INFO(0x202011, 0, 32 * 1024, 4, 0) }, + { "m25p20", INFO(0x202012, 0, 64 * 1024, 4, 0) }, + { "m25p40", INFO(0x202013, 0, 64 * 1024, 8, 0) }, + { "m25p80", INFO(0x202014, 0, 64 * 1024, 16, 0) }, + { "m25p16", INFO(0x202015, 0, 64 * 1024, 32, 0) }, + { "m25p32", INFO(0x202016, 0, 64 * 1024, 64, 0) }, + { "m25p64", INFO(0x202017, 0, 64 * 1024, 128, 0) }, + { "m25p128", INFO(0x202018, 0, 256 * 1024, 64, 0) }, + { "n25q032", INFO(0x20ba16, 0, 64 * 1024, 64, 0) }, + + { "m25p05-nonjedec", INFO(0, 0, 32 * 1024, 2, 0) }, + { "m25p10-nonjedec", INFO(0, 0, 32 * 1024, 4, 0) }, + { "m25p20-nonjedec", INFO(0, 0, 64 * 1024, 4, 0) }, + { "m25p40-nonjedec", INFO(0, 0, 64 * 1024, 8, 0) }, + { "m25p80-nonjedec", INFO(0, 0, 64 * 1024, 16, 0) }, + { "m25p16-nonjedec", INFO(0, 0, 64 * 1024, 32, 0) }, + { "m25p32-nonjedec", INFO(0, 0, 64 * 1024, 64, 0) }, + { "m25p64-nonjedec", INFO(0, 0, 64 * 1024, 128, 0) }, + { "m25p128-nonjedec", INFO(0, 0, 256 * 1024, 64, 0) }, + + { "m45pe10", INFO(0x204011, 0, 64 * 1024, 2, 0) }, + { "m45pe80", INFO(0x204014, 0, 64 * 1024, 16, 0) }, + { "m45pe16", INFO(0x204015, 0, 64 * 1024, 32, 0) }, + + { "m25pe20", INFO(0x208012, 0, 64 * 1024, 4, 0) }, + { "m25pe80", INFO(0x208014, 0, 64 * 1024, 16, 0) }, + { "m25pe16", INFO(0x208015, 0, 64 * 1024, 32, SECT_4K) }, + + { "m25px16", INFO(0x207115, 0, 64 * 1024, 32, SECT_4K) }, + { "m25px32", INFO(0x207116, 0, 64 * 1024, 64, SECT_4K) }, + { "m25px32-s0", INFO(0x207316, 0, 64 * 1024, 64, SECT_4K) }, + { "m25px32-s1", INFO(0x206316, 
0, 64 * 1024, 64, SECT_4K) }, + { "m25px64", INFO(0x207117, 0, 64 * 1024, 128, 0) }, + + /* Winbond -- w25x "blocks" are 64K, "sectors" are 4KiB */ + { "w25x10", INFO(0xef3011, 0, 64 * 1024, 2, SECT_4K) }, + { "w25x20", INFO(0xef3012, 0, 64 * 1024, 4, SECT_4K) }, + { "w25x40", INFO(0xef3013, 0, 64 * 1024, 8, SECT_4K) }, + { "w25x80", INFO(0xef3014, 0, 64 * 1024, 16, SECT_4K) }, + { "w25x16", INFO(0xef3015, 0, 64 * 1024, 32, SECT_4K) }, + { "w25x32", INFO(0xef3016, 0, 64 * 1024, 64, SECT_4K) }, + { "w25q32", INFO(0xef4016, 0, 64 * 1024, 64, SECT_4K) }, + { "w25q32dw", INFO(0xef6016, 0, 64 * 1024, 64, SECT_4K) }, + { "w25x64", INFO(0xef3017, 0, 64 * 1024, 128, SECT_4K) }, + { "w25q64", INFO(0xef4017, 0, 64 * 1024, 128, SECT_4K) }, + { "w25q128", INFO(0xef4018, 0, 64 * 1024, 256, SECT_4K) }, + { "w25q80", INFO(0xef5014, 0, 64 * 1024, 16, SECT_4K) }, + { "w25q80bl", INFO(0xef4014, 0, 64 * 1024, 16, SECT_4K) }, + { "w25q128", INFO(0xef4018, 0, 64 * 1024, 256, SECT_4K) }, + { "w25q256", INFO(0xef4019, 0, 64 * 1024, 512, SECT_4K) }, + + /* Catalyst / On Semiconductor -- non-JEDEC */ + { "cat25c11", CAT25_INFO( 16, 8, 16, 1, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, + { "cat25c03", CAT25_INFO( 32, 8, 16, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, + { "cat25c09", CAT25_INFO( 128, 8, 32, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, + { "cat25c17", CAT25_INFO( 256, 8, 32, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, + { "cat25128", CAT25_INFO(2048, 8, 64, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, + { }, +}; + +static const struct spi_device_id *spi_nor_read_id(struct spi_nor *nor) +{ + int tmp; + u8 id[5]; + u32 jedec; + u16 ext_jedec; + struct flash_info *info; + + tmp = nor->read_reg(nor, OPCODE_RDID, id, 5); + if (tmp < 0) { + dev_dbg(nor->dev, " error %d reading JEDEC ID\n", tmp); + return ERR_PTR(tmp); + } + jedec = id[0]; + jedec = jedec << 8; + jedec |= id[1]; + jedec = jedec << 8; + jedec |= id[2]; + + ext_jedec = id[3] << 8 | id[4]; + + for (tmp = 0; tmp < ARRAY_SIZE(spi_nor_ids) - 1; tmp++) { + info = (void *)spi_nor_ids[tmp].driver_data; + if (info->jedec_id == jedec) { + if (info->ext_id == 0 || info->ext_id == ext_jedec) + return &spi_nor_ids[tmp]; + } + } + dev_err(nor->dev, "unrecognized JEDEC id %06x\n", jedec); + return ERR_PTR(-ENODEV); +} + +static const struct spi_device_id *jedec_probe(struct spi_nor *nor) +{ + return nor->read_id(nor); +} + +static int spi_nor_read(struct mtd_info *mtd, loff_t from, size_t len, + size_t *retlen, u_char *buf) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + int ret; + + dev_dbg(nor->dev, "from 0x%08x, len %zd\n", (u32)from, len); + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_READ); + if (ret) + return ret; + + ret = nor->read(nor, from, len, retlen, buf); + + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_READ); + return ret; +} + +static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, + size_t *retlen, const u_char *buf) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + size_t actual; + int ret; + + dev_dbg(nor->dev, "to 0x%08x, len %zd\n", (u32)to, len); + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_WRITE); + if (ret) + return ret; + + /* Wait until finished previous write command. */ + ret = wait_till_ready(nor); + if (ret) + goto time_out; + + write_enable(nor); + + nor->sst_write_second = false; + + actual = to % 2; + /* Start write from odd address. */ + if (actual) { + nor->program_opcode = OPCODE_BP; + + /* write one byte. 
*/ + nor->write(nor, to, 1, retlen, buf); + ret = wait_till_ready(nor); + if (ret) + goto time_out; + } + to += actual; + + /* Write out most of the data here. */ + for (; actual < len - 1; actual += 2) { + nor->program_opcode = OPCODE_AAI_WP; + + /* write two bytes. */ + nor->write(nor, to, 2, retlen, buf + actual); + ret = wait_till_ready(nor); + if (ret) + goto time_out; + to += 2; + nor->sst_write_second = true; + } + nor->sst_write_second = false; + + write_disable(nor); + ret = wait_till_ready(nor); + if (ret) + goto time_out; + + /* Write out trailing byte if it exists. */ + if (actual != len) { + write_enable(nor); + + nor->program_opcode = OPCODE_BP; + nor->write(nor, to, 1, retlen, buf + actual); + + ret = wait_till_ready(nor); + if (ret) + goto time_out; + write_disable(nor); + } +time_out: + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE); + return ret; +} + +/* + * Write an address range to the nor chip. Data must be written in + * FLASH_PAGESIZE chunks. The address range may be any size provided + * it is within the physical boundaries. + */ +static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len, + size_t *retlen, const u_char *buf) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + u32 page_offset, page_size, i; + int ret; + + dev_dbg(nor->dev, "to 0x%08x, len %zd\n", (u32)to, len); + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_WRITE); + if (ret) + return ret; + + /* Wait until finished previous write command. */ + ret = wait_till_ready(nor); + if (ret) + goto write_err; + + write_enable(nor); + + page_offset = to & (nor->page_size - 1); + + /* do all the bytes fit onto one page? */ + if (page_offset + len <= nor->page_size) { + nor->write(nor, to, len, retlen, buf); + } else { + /* the size of data remaining on the first page */ + page_size = nor->page_size - page_offset; + nor->write(nor, to, page_size, retlen, buf); + + /* write everything in nor->page_size chunks */ + for (i = page_size; i < len; i += page_size) { + page_size = len - i; + if (page_size > nor->page_size) + page_size = nor->page_size; + + wait_till_ready(nor); + write_enable(nor); + + nor->write(nor, to + i, page_size, retlen, buf + i); + } + } + +write_err: + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE); + return 0; +} + +static int macronix_quad_enable(struct spi_nor *nor) +{ + int ret, val; + + val = read_sr(nor); + write_enable(nor); + + nor->cmd_buf[0] = val | SR_QUAD_EN_MX; + nor->write_reg(nor, OPCODE_WRSR, nor->cmd_buf, 1, 0); + + if (wait_till_ready(nor)) + return 1; + + ret = read_sr(nor); + if (!(ret > 0 && (ret & SR_QUAD_EN_MX))) { + dev_err(nor->dev, "Macronix Quad bit not set\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Write status Register and configuration register with 2 bytes + * The first byte will be written to the status register, while the + * second byte will be written to the configuration register. + * Return negative if error occured. 
+ */ +static int write_sr_cr(struct spi_nor *nor, u16 val) +{ + nor->cmd_buf[0] = val & 0xff; + nor->cmd_buf[1] = (val >> 8); + + return nor->write_reg(nor, OPCODE_WRSR, nor->cmd_buf, 2, 0); +} + +static int spansion_quad_enable(struct spi_nor *nor) +{ + int ret; + int quad_en = CR_QUAD_EN_SPAN << 8; + + write_enable(nor); + + ret = write_sr_cr(nor, quad_en); + if (ret < 0) { + dev_err(nor->dev, + "error while writing configuration register\n"); + return -EINVAL; + } + + /* read back and check it */ + ret = read_cr(nor); + if (!(ret > 0 && (ret & CR_QUAD_EN_SPAN))) { + dev_err(nor->dev, "Spansion Quad bit not set\n"); + return -EINVAL; + } + + return 0; +} + +static int set_quad_mode(struct spi_nor *nor, u32 jedec_id) +{ + int status; + + switch (JEDEC_MFR(jedec_id)) { + case CFI_MFR_MACRONIX: + status = macronix_quad_enable(nor); + if (status) { + dev_err(nor->dev, "Macronix quad-read not enabled\n"); + return -EINVAL; + } + return status; + default: + status = spansion_quad_enable(nor); + if (status) { + dev_err(nor->dev, "Spansion quad-read not enabled\n"); + return -EINVAL; + } + return status; + } +} + +static int spi_nor_check(struct spi_nor *nor) +{ + if (!nor->dev || !nor->read || !nor->write || + !nor->read_reg || !nor->write_reg || !nor->erase) { + pr_err("spi-nor: please fill all the necessary fields!\n"); + return -EINVAL; + } + + if (!nor->read_id) + nor->read_id = spi_nor_read_id; + if (!nor->wait_till_ready) + nor->wait_till_ready = spi_nor_wait_till_ready; + + return 0; +} + +int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, + enum read_mode mode) +{ + struct flash_info *info; + struct flash_platform_data *data; + struct device *dev = nor->dev; + struct mtd_info *mtd = nor->mtd; + struct device_node *np = dev->of_node; + int ret; + int i; + + ret = spi_nor_check(nor); + if (ret) + return ret; + + /* Platform data helps sort out which chip type we have, as + * well as how this board partitions it. If we don't have + * a chip ID, try the JEDEC id commands; they'll work for most + * newer chips, even if we don't recognize the particular chip. + */ + data = dev_get_platdata(dev); + if (data && data->type) { + const struct spi_device_id *plat_id; + + for (i = 0; i < ARRAY_SIZE(spi_nor_ids) - 1; i++) { + plat_id = &spi_nor_ids[i]; + if (strcmp(data->type, plat_id->name)) + continue; + break; + } + + if (i < ARRAY_SIZE(spi_nor_ids) - 1) + id = plat_id; + else + dev_warn(dev, "unrecognized id %s\n", data->type); + } + + info = (void *)id->driver_data; + + if (info->jedec_id) { + const struct spi_device_id *jid; + + jid = jedec_probe(nor); + if (IS_ERR(jid)) { + return PTR_ERR(jid); + } else if (jid != id) { + /* + * JEDEC knows better, so overwrite platform ID. We + * can't trust partitions any longer, but we'll let + * mtd apply them anyway, since some partitions may be + * marked read-only, and we don't want to lose that + * information, even if it's not 100% accurate. 
+ */ + dev_warn(dev, "found %s, expected %s\n", + jid->name, id->name); + id = jid; + info = (void *)jid->driver_data; + } + } + + mutex_init(&nor->lock); + + /* + * Atmel, SST and Intel/Numonyx serial nor tend to power + * up with the software protection bits set + */ + + if (JEDEC_MFR(info->jedec_id) == CFI_MFR_ATMEL || + JEDEC_MFR(info->jedec_id) == CFI_MFR_INTEL || + JEDEC_MFR(info->jedec_id) == CFI_MFR_SST) { + write_enable(nor); + write_sr(nor, 0); + } + + if (data && data->name) + mtd->name = data->name; + else + mtd->name = dev_name(dev); + + mtd->type = MTD_NORFLASH; + mtd->writesize = 1; + mtd->flags = MTD_CAP_NORFLASH; + mtd->size = info->sector_size * info->n_sectors; + mtd->_erase = spi_nor_erase; + mtd->_read = spi_nor_read; + + /* nor protection support for STmicro chips */ + if (JEDEC_MFR(info->jedec_id) == CFI_MFR_ST) { + mtd->_lock = spi_nor_lock; + mtd->_unlock = spi_nor_unlock; + } + + /* sst nor chips use AAI word program */ + if (info->flags & SST_WRITE) + mtd->_write = sst_write; + else + mtd->_write = spi_nor_write; + + /* prefer "small sector" erase if possible */ + if (info->flags & SECT_4K) { + nor->erase_opcode = OPCODE_BE_4K; + mtd->erasesize = 4096; + } else if (info->flags & SECT_4K_PMC) { + nor->erase_opcode = OPCODE_BE_4K_PMC; + mtd->erasesize = 4096; + } else { + nor->erase_opcode = OPCODE_SE; + mtd->erasesize = info->sector_size; + } + + if (info->flags & SPI_NOR_NO_ERASE) + mtd->flags |= MTD_NO_ERASE; + + mtd->dev.parent = dev; + nor->page_size = info->page_size; + mtd->writebufsize = nor->page_size; + + if (np) { + /* If we were instantiated by DT, use it */ + if (of_property_read_bool(np, "m25p,fast-read")) + nor->flash_read = SPI_NOR_FAST; + else + nor->flash_read = SPI_NOR_NORMAL; + } else { + /* If we weren't instantiated by DT, default to fast-read */ + nor->flash_read = SPI_NOR_FAST; + } + + /* Some devices cannot do fast-read, no matter what DT tells us */ + if (info->flags & SPI_NOR_NO_FR) + nor->flash_read = SPI_NOR_NORMAL; + + /* Quad/Dual-read mode takes precedence over fast/normal */ + if (mode == SPI_NOR_QUAD && info->flags & SPI_NOR_QUAD_READ) { + ret = set_quad_mode(nor, info->jedec_id); + if (ret) { + dev_err(dev, "quad mode not supported\n"); + return ret; + } + nor->flash_read = SPI_NOR_QUAD; + } else if (mode == SPI_NOR_DUAL && info->flags & SPI_NOR_DUAL_READ) { + nor->flash_read = SPI_NOR_DUAL; + } + + /* Default commands */ + switch (nor->flash_read) { + case SPI_NOR_QUAD: + nor->read_opcode = OPCODE_QUAD_READ; + break; + case SPI_NOR_DUAL: + nor->read_opcode = OPCODE_DUAL_READ; + break; + case SPI_NOR_FAST: + nor->read_opcode = OPCODE_FAST_READ; + break; + case SPI_NOR_NORMAL: + nor->read_opcode = OPCODE_NORM_READ; + break; + default: + dev_err(dev, "No Read opcode defined\n"); + return -EINVAL; + } + + nor->program_opcode = OPCODE_PP; + + if (info->addr_width) + nor->addr_width = info->addr_width; + else if (mtd->size > 0x1000000) { + /* enable 4-byte addressing if the device exceeds 16MiB */ + nor->addr_width = 4; + if (JEDEC_MFR(info->jedec_id) == CFI_MFR_AMD) { + /* Dedicated 4-byte command set */ + switch (nor->flash_read) { + case SPI_NOR_QUAD: + nor->read_opcode = OPCODE_QUAD_READ_4B; + break; + case SPI_NOR_DUAL: + nor->read_opcode = OPCODE_DUAL_READ_4B; + break; + case SPI_NOR_FAST: + nor->read_opcode = OPCODE_FAST_READ_4B; + break; + case SPI_NOR_NORMAL: + nor->read_opcode = OPCODE_NORM_READ_4B; + break; + } + nor->program_opcode = OPCODE_PP_4B; + /* No small sector erase for 4-byte command set */ + nor->erase_opcode = 
OPCODE_SE_4B; + mtd->erasesize = info->sector_size; + } else + set_4byte(nor, info->jedec_id, 1); + } else { + nor->addr_width = 3; + } + + nor->read_dummy = spi_nor_read_dummy_cycles(nor); + + dev_info(dev, "%s (%lld Kbytes)\n", id->name, + (long long)mtd->size >> 10); + + dev_dbg(dev, + "mtd .name = %s, .size = 0x%llx (%lldMiB), " + ".erasesize = 0x%.8x (%uKiB) .numeraseregions = %d\n", + mtd->name, (long long)mtd->size, (long long)(mtd->size >> 20), + mtd->erasesize, mtd->erasesize / 1024, mtd->numeraseregions); + + if (mtd->numeraseregions) + for (i = 0; i < mtd->numeraseregions; i++) + dev_dbg(dev, + "mtd.eraseregions[%d] = { .offset = 0x%llx, " + ".erasesize = 0x%.8x (%uKiB), " + ".numblocks = %d }\n", + i, (long long)mtd->eraseregions[i].offset, + mtd->eraseregions[i].erasesize, + mtd->eraseregions[i].erasesize / 1024, + mtd->eraseregions[i].numblocks); + return 0; +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Huang Shijie "); +MODULE_AUTHOR("Mike Lavender"); +MODULE_DESCRIPTION("framework for SPI NOR"); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 3a3c3872c8cd..16d8409abcdc 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -162,4 +162,24 @@ struct spi_nor { void *priv; }; + +/** + * spi_nor_scan() - scan the SPI NOR + * @nor: the spi_nor structure + * @id: the spi_device_id provided by the driver + * @mode: the read mode supported by the driver + * + * The drivers can use this function to scan the SPI NOR. + * In the scanning, it will try to get all the necessary information to + * fill the mtd_info{} and the spi_nor{}. + * + * The board may assign a spi_device_id via @id, which is then compared with + * the spi_device_id detected by the scanning. + * + * Return: 0 for success, others for failure. + */ +int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, + enum read_mode mode); +extern const struct spi_device_id spi_nor_ids[]; + #endif -- cgit v1.2.3-71-gd317 From 0d8c11c01274bde227d368daa8954911dd324a9f Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 24 Feb 2014 18:37:40 +0800 Subject: mtd: spi-nor: add a helper to find the spi_device_id Add the spi_nor_match_id() to find the proper spi_device_id with the NOR flash's name in the spi_nor_ids table. 
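A driver that knows its flash only by name can then look up the table entry before scanning, e.g. (a sketch; "n25q256a" is simply one of the names in the spi_nor_ids table above):

	const struct spi_device_id *id = spi_nor_match_id("n25q256a");

	if (!id)
		return -ENODEV;
	ret = spi_nor_scan(nor, id, SPI_NOR_QUAD);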
Signed-off-by: Huang Shijie Acked-by: Marek Vasut Signed-off-by: Brian Norris --- drivers/mtd/spi-nor/spi-nor.c | 12 ++++++++++++ include/linux/mtd/spi-nor.h | 12 ++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 50b929095bdb..f7c9e638623b 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1082,6 +1082,18 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, return 0; } +const struct spi_device_id *spi_nor_match_id(char *name) +{ + const struct spi_device_id *id = spi_nor_ids; + + while (id->name[0]) { + if (!strcmp(name, id->name)) + return id; + id++; + } + return NULL; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Huang Shijie "); MODULE_AUTHOR("Mike Lavender"); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 16d8409abcdc..41dae78fbd1d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -182,4 +182,16 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, enum read_mode mode); extern const struct spi_device_id spi_nor_ids[]; +/** + * spi_nor_match_id() - find the spi_device_id by the name + * @name: the name of the spi_device_id + * + * The drivers use this function to find the spi_device_id + * specified by the @name. + * + * Return: returns the right spi_device_id pointer on success, + * and returns NULL on failure. + */ +const struct spi_device_id *spi_nor_match_id(char *name); + #endif -- cgit v1.2.3-71-gd317 From 8eabdd1ec122cf6b77cf73e798f134fbace1b8d1 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Thu, 10 Apr 2014 16:27:28 +0800 Subject: mtd: spi-nor: add the copyright information Add the copyright information for spi-nor.c and spi-nor.h. Signed-off-by: Huang Shijie Signed-off-by: Brian Norris --- drivers/mtd/spi-nor/spi-nor.c | 6 +++++- include/linux/mtd/spi-nor.h | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 5cd86eb2a5f0..6c64ab95dee2 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1,5 +1,9 @@ /* - * Cloned most of the code from the m25p80.c + * Based on m25p80.c, by Mike Lavender (mike@steroidmicros.com), with + * influence from lart.c (Abraham Van Der Merwe) and mtd_dataflash.c + * + * Copyright (C) 2005, Intec Automation Inc. + * Copyright (C) 2014, Freescale Semiconductor, Inc. * * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 41dae78fbd1d..9428d285489b 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -1,3 +1,12 @@ +/* + * Copyright (C) 2014 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + #ifndef __LINUX_MTD_SPI_NOR_H #define __LINUX_MTD_SPI_NOR_H -- cgit v1.2.3-71-gd317 From becd0cb8666de4bfaaf6eb3042f69066c8fb8677 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 8 Apr 2014 18:10:23 -0700 Subject: mtd: spi-nor: drop \t after #define Spacing is a little non-standard here. Fix up tabs vs. spaces. 
Signed-off-by: Brian Norris Acked-by: Huang Shijie Reviewed-by: Marek Vasut --- include/linux/mtd/spi-nor.h | 72 ++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 9428d285489b..a6e87190ead1 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -11,55 +11,55 @@ #define __LINUX_MTD_SPI_NOR_H /* Flash opcodes. */ -#define OPCODE_WREN 0x06 /* Write enable */ -#define OPCODE_RDSR 0x05 /* Read status register */ -#define OPCODE_WRSR 0x01 /* Write status register 1 byte */ -#define OPCODE_NORM_READ 0x03 /* Read data bytes (low frequency) */ -#define OPCODE_FAST_READ 0x0b /* Read data bytes (high frequency) */ -#define OPCODE_DUAL_READ 0x3b /* Read data bytes (Dual SPI) */ -#define OPCODE_QUAD_READ 0x6b /* Read data bytes (Quad SPI) */ -#define OPCODE_PP 0x02 /* Page program (up to 256 bytes) */ -#define OPCODE_BE_4K 0x20 /* Erase 4KiB block */ -#define OPCODE_BE_4K_PMC 0xd7 /* Erase 4KiB block on PMC chips */ -#define OPCODE_BE_32K 0x52 /* Erase 32KiB block */ -#define OPCODE_CHIP_ERASE 0xc7 /* Erase whole flash chip */ -#define OPCODE_SE 0xd8 /* Sector erase (usually 64KiB) */ -#define OPCODE_RDID 0x9f /* Read JEDEC ID */ -#define OPCODE_RDCR 0x35 /* Read configuration register */ +#define OPCODE_WREN 0x06 /* Write enable */ +#define OPCODE_RDSR 0x05 /* Read status register */ +#define OPCODE_WRSR 0x01 /* Write status register 1 byte */ +#define OPCODE_NORM_READ 0x03 /* Read data bytes (low frequency) */ +#define OPCODE_FAST_READ 0x0b /* Read data bytes (high frequency) */ +#define OPCODE_DUAL_READ 0x3b /* Read data bytes (Dual SPI) */ +#define OPCODE_QUAD_READ 0x6b /* Read data bytes (Quad SPI) */ +#define OPCODE_PP 0x02 /* Page program (up to 256 bytes) */ +#define OPCODE_BE_4K 0x20 /* Erase 4KiB block */ +#define OPCODE_BE_4K_PMC 0xd7 /* Erase 4KiB block on PMC chips */ +#define OPCODE_BE_32K 0x52 /* Erase 32KiB block */ +#define OPCODE_CHIP_ERASE 0xc7 /* Erase whole flash chip */ +#define OPCODE_SE 0xd8 /* Sector erase (usually 64KiB) */ +#define OPCODE_RDID 0x9f /* Read JEDEC ID */ +#define OPCODE_RDCR 0x35 /* Read configuration register */ /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */ -#define OPCODE_NORM_READ_4B 0x13 /* Read data bytes (low frequency) */ -#define OPCODE_FAST_READ_4B 0x0c /* Read data bytes (high frequency) */ -#define OPCODE_DUAL_READ_4B 0x3c /* Read data bytes (Dual SPI) */ -#define OPCODE_QUAD_READ_4B 0x6c /* Read data bytes (Quad SPI) */ -#define OPCODE_PP_4B 0x12 /* Page program (up to 256 bytes) */ -#define OPCODE_SE_4B 0xdc /* Sector erase (usually 64KiB) */ +#define OPCODE_NORM_READ_4B 0x13 /* Read data bytes (low frequency) */ +#define OPCODE_FAST_READ_4B 0x0c /* Read data bytes (high frequency) */ +#define OPCODE_DUAL_READ_4B 0x3c /* Read data bytes (Dual SPI) */ +#define OPCODE_QUAD_READ_4B 0x6c /* Read data bytes (Quad SPI) */ +#define OPCODE_PP_4B 0x12 /* Page program (up to 256 bytes) */ +#define OPCODE_SE_4B 0xdc /* Sector erase (usually 64KiB) */ /* Used for SST flashes only. */ -#define OPCODE_BP 0x02 /* Byte program */ -#define OPCODE_WRDI 0x04 /* Write disable */ -#define OPCODE_AAI_WP 0xad /* Auto address increment word program */ +#define OPCODE_BP 0x02 /* Byte program */ +#define OPCODE_WRDI 0x04 /* Write disable */ +#define OPCODE_AAI_WP 0xad /* Auto address increment word program */ /* Used for Macronix and Winbond flashes. 
*/ -#define OPCODE_EN4B 0xb7 /* Enter 4-byte mode */ -#define OPCODE_EX4B 0xe9 /* Exit 4-byte mode */ +#define OPCODE_EN4B 0xb7 /* Enter 4-byte mode */ +#define OPCODE_EX4B 0xe9 /* Exit 4-byte mode */ /* Used for Spansion flashes only. */ -#define OPCODE_BRWR 0x17 /* Bank register write */ +#define OPCODE_BRWR 0x17 /* Bank register write */ /* Status Register bits. */ -#define SR_WIP 1 /* Write in progress */ -#define SR_WEL 2 /* Write enable latch */ +#define SR_WIP 1 /* Write in progress */ +#define SR_WEL 2 /* Write enable latch */ /* meaning of other SR_* bits may differ between vendors */ -#define SR_BP0 4 /* Block protect 0 */ -#define SR_BP1 8 /* Block protect 1 */ -#define SR_BP2 0x10 /* Block protect 2 */ -#define SR_SRWD 0x80 /* SR write protect */ +#define SR_BP0 4 /* Block protect 0 */ +#define SR_BP1 8 /* Block protect 1 */ +#define SR_BP2 0x10 /* Block protect 2 */ +#define SR_SRWD 0x80 /* SR write protect */ -#define SR_QUAD_EN_MX 0x40 /* Macronix Quad I/O */ +#define SR_QUAD_EN_MX 0x40 /* Macronix Quad I/O */ /* Configuration Register bits. */ -#define CR_QUAD_EN_SPAN 0x2 /* Spansion Quad I/O */ +#define CR_QUAD_EN_SPAN 0x2 /* Spansion Quad I/O */ enum read_mode { SPI_NOR_NORMAL = 0, @@ -95,7 +95,7 @@ struct spi_nor_xfer_cfg { u8 dummy_cycles; }; -#define SPI_NOR_MAX_CMD_SIZE 8 +#define SPI_NOR_MAX_CMD_SIZE 8 enum spi_nor_ops { SPI_NOR_OPS_READ = 0, SPI_NOR_OPS_WRITE, -- cgit v1.2.3-71-gd317 From b02e7f3ef0beb72da8fc64542f0ac977996ec56b Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 8 Apr 2014 18:15:31 -0700 Subject: mtd: spi-nor: re-name OPCODE_* to SPINOR_OP_* Qualify these with a better namespace, and prepare them for use in more drivers. Signed-off-by: Brian Norris Reviewed-by: Marek Vasut Acked-by: Huang Shijie --- drivers/mtd/devices/m25p80.c | 4 +-- drivers/mtd/spi-nor/fsl-quadspi.c | 58 +++++++++++++++++------------------ drivers/mtd/spi-nor/spi-nor.c | 64 +++++++++++++++++++-------------------- include/linux/mtd/spi-nor.h | 54 ++++++++++++++++----------------- 4 files changed, 90 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 4af6400ccd95..1557d8f672c1 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -86,7 +86,7 @@ static void m25p80_write(struct spi_nor *nor, loff_t to, size_t len, spi_message_init(&m); - if (nor->program_opcode == OPCODE_AAI_WP && nor->sst_write_second) + if (nor->program_opcode == SPINOR_OP_AAI_WP && nor->sst_write_second) cmd_sz = 1; flash->command[0] = nor->program_opcode; @@ -171,7 +171,7 @@ static int m25p80_erase(struct spi_nor *nor, loff_t offset) return ret; /* Send write enable, then erase commands. 
*/ - ret = nor->write_reg(nor, OPCODE_WREN, NULL, 0, 0); + ret = nor->write_reg(nor, SPINOR_OP_WREN, NULL, 0, 0); if (ret) return ret; diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index 6dc08ed950c8..2977f026f39d 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -294,12 +294,12 @@ static void fsl_qspi_init_lut(struct fsl_qspi *q) lut_base = SEQID_QUAD_READ * 4; if (q->nor_size <= SZ_16M) { - cmd = OPCODE_QUAD_READ; + cmd = SPINOR_OP_QUAD_READ; addrlen = ADDR24BIT; dummy = 8; } else { /* use the 4-byte address */ - cmd = OPCODE_QUAD_READ; + cmd = SPINOR_OP_QUAD_READ; addrlen = ADDR32BIT; dummy = 8; } @@ -311,17 +311,17 @@ static void fsl_qspi_init_lut(struct fsl_qspi *q) /* Write enable */ lut_base = SEQID_WREN * 4; - writel(LUT0(CMD, PAD1, OPCODE_WREN), base + QUADSPI_LUT(lut_base)); + writel(LUT0(CMD, PAD1, SPINOR_OP_WREN), base + QUADSPI_LUT(lut_base)); /* Page Program */ lut_base = SEQID_PP * 4; if (q->nor_size <= SZ_16M) { - cmd = OPCODE_PP; + cmd = SPINOR_OP_PP; addrlen = ADDR24BIT; } else { /* use the 4-byte address */ - cmd = OPCODE_PP; + cmd = SPINOR_OP_PP; addrlen = ADDR32BIT; } @@ -331,18 +331,18 @@ static void fsl_qspi_init_lut(struct fsl_qspi *q) /* Read Status */ lut_base = SEQID_RDSR * 4; - writel(LUT0(CMD, PAD1, OPCODE_RDSR) | LUT1(READ, PAD1, 0x1), + writel(LUT0(CMD, PAD1, SPINOR_OP_RDSR) | LUT1(READ, PAD1, 0x1), base + QUADSPI_LUT(lut_base)); /* Erase a sector */ lut_base = SEQID_SE * 4; if (q->nor_size <= SZ_16M) { - cmd = OPCODE_SE; + cmd = SPINOR_OP_SE; addrlen = ADDR24BIT; } else { /* use the 4-byte address */ - cmd = OPCODE_SE; + cmd = SPINOR_OP_SE; addrlen = ADDR32BIT; } @@ -351,35 +351,35 @@ static void fsl_qspi_init_lut(struct fsl_qspi *q) /* Erase the whole chip */ lut_base = SEQID_CHIP_ERASE * 4; - writel(LUT0(CMD, PAD1, OPCODE_CHIP_ERASE), + writel(LUT0(CMD, PAD1, SPINOR_OP_CHIP_ERASE), base + QUADSPI_LUT(lut_base)); /* READ ID */ lut_base = SEQID_RDID * 4; - writel(LUT0(CMD, PAD1, OPCODE_RDID) | LUT1(READ, PAD1, 0x8), + writel(LUT0(CMD, PAD1, SPINOR_OP_RDID) | LUT1(READ, PAD1, 0x8), base + QUADSPI_LUT(lut_base)); /* Write Register */ lut_base = SEQID_WRSR * 4; - writel(LUT0(CMD, PAD1, OPCODE_WRSR) | LUT1(WRITE, PAD1, 0x2), + writel(LUT0(CMD, PAD1, SPINOR_OP_WRSR) | LUT1(WRITE, PAD1, 0x2), base + QUADSPI_LUT(lut_base)); /* Read Configuration Register */ lut_base = SEQID_RDCR * 4; - writel(LUT0(CMD, PAD1, OPCODE_RDCR) | LUT1(READ, PAD1, 0x1), + writel(LUT0(CMD, PAD1, SPINOR_OP_RDCR) | LUT1(READ, PAD1, 0x1), base + QUADSPI_LUT(lut_base)); /* Write disable */ lut_base = SEQID_WRDI * 4; - writel(LUT0(CMD, PAD1, OPCODE_WRDI), base + QUADSPI_LUT(lut_base)); + writel(LUT0(CMD, PAD1, SPINOR_OP_WRDI), base + QUADSPI_LUT(lut_base)); /* Enter 4 Byte Mode (Micron) */ lut_base = SEQID_EN4B * 4; - writel(LUT0(CMD, PAD1, OPCODE_EN4B), base + QUADSPI_LUT(lut_base)); + writel(LUT0(CMD, PAD1, SPINOR_OP_EN4B), base + QUADSPI_LUT(lut_base)); /* Enter 4 Byte Mode (Spansion) */ lut_base = SEQID_BRWR * 4; - writel(LUT0(CMD, PAD1, OPCODE_BRWR), base + QUADSPI_LUT(lut_base)); + writel(LUT0(CMD, PAD1, SPINOR_OP_BRWR), base + QUADSPI_LUT(lut_base)); fsl_qspi_lock_lut(q); } @@ -388,29 +388,29 @@ static void fsl_qspi_init_lut(struct fsl_qspi *q) static int fsl_qspi_get_seqid(struct fsl_qspi *q, u8 cmd) { switch (cmd) { - case OPCODE_QUAD_READ: + case SPINOR_OP_QUAD_READ: return SEQID_QUAD_READ; - case OPCODE_WREN: + case SPINOR_OP_WREN: return SEQID_WREN; - case OPCODE_WRDI: + case SPINOR_OP_WRDI: return 
SEQID_WRDI; - case OPCODE_RDSR: + case SPINOR_OP_RDSR: return SEQID_RDSR; - case OPCODE_SE: + case SPINOR_OP_SE: return SEQID_SE; - case OPCODE_CHIP_ERASE: + case SPINOR_OP_CHIP_ERASE: return SEQID_CHIP_ERASE; - case OPCODE_PP: + case SPINOR_OP_PP: return SEQID_PP; - case OPCODE_RDID: + case SPINOR_OP_RDID: return SEQID_RDID; - case OPCODE_WRSR: + case SPINOR_OP_WRSR: return SEQID_WRSR; - case OPCODE_RDCR: + case SPINOR_OP_RDCR: return SEQID_RDCR; - case OPCODE_EN4B: + case SPINOR_OP_EN4B: return SEQID_EN4B; - case OPCODE_BRWR: + case SPINOR_OP_BRWR: return SEQID_BRWR; default: dev_err(q->dev, "Unsupported cmd 0x%.2x\n", cmd); @@ -688,7 +688,7 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len, if (ret) return ret; - if (opcode == OPCODE_CHIP_ERASE) + if (opcode == SPINOR_OP_CHIP_ERASE) fsl_qspi_invalid(q); } else if (len > 0) { @@ -750,7 +750,7 @@ static int fsl_qspi_erase(struct spi_nor *nor, loff_t offs) return ret; /* Send write enable, then erase commands. */ - ret = nor->write_reg(nor, OPCODE_WREN, NULL, 0, 0); + ret = nor->write_reg(nor, SPINOR_OP_WREN, NULL, 0, 0); if (ret) return ret; diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 6c64ab95dee2..1716f3ce9949 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -38,7 +38,7 @@ static int read_sr(struct spi_nor *nor) int ret; u8 val; - ret = nor->read_reg(nor, OPCODE_RDSR, &val, 1); + ret = nor->read_reg(nor, SPINOR_OP_RDSR, &val, 1); if (ret < 0) { pr_err("error %d reading SR\n", (int) ret); return ret; @@ -57,7 +57,7 @@ static int read_cr(struct spi_nor *nor) int ret; u8 val; - ret = nor->read_reg(nor, OPCODE_RDCR, &val, 1); + ret = nor->read_reg(nor, SPINOR_OP_RDCR, &val, 1); if (ret < 0) { dev_err(nor->dev, "error %d reading CR\n", ret); return ret; @@ -91,7 +91,7 @@ static inline int spi_nor_read_dummy_cycles(struct spi_nor *nor) static inline int write_sr(struct spi_nor *nor, u8 val) { nor->cmd_buf[0] = val; - return nor->write_reg(nor, OPCODE_WRSR, nor->cmd_buf, 1, 0); + return nor->write_reg(nor, SPINOR_OP_WRSR, nor->cmd_buf, 1, 0); } /* @@ -100,7 +100,7 @@ static inline int write_sr(struct spi_nor *nor, u8 val) */ static inline int write_enable(struct spi_nor *nor) { - return nor->write_reg(nor, OPCODE_WREN, NULL, 0, 0); + return nor->write_reg(nor, SPINOR_OP_WREN, NULL, 0, 0); } /* @@ -108,7 +108,7 @@ static inline int write_enable(struct spi_nor *nor) */ static inline int write_disable(struct spi_nor *nor) { - return nor->write_reg(nor, OPCODE_WRDI, NULL, 0, 0); + return nor->write_reg(nor, SPINOR_OP_WRDI, NULL, 0, 0); } static inline struct spi_nor *mtd_to_spi_nor(struct mtd_info *mtd) @@ -132,7 +132,7 @@ static inline int set_4byte(struct spi_nor *nor, u32 jedec_id, int enable) if (need_wren) write_enable(nor); - cmd = enable ? OPCODE_EN4B : OPCODE_EX4B; + cmd = enable ? SPINOR_OP_EN4B : SPINOR_OP_EX4B; status = nor->write_reg(nor, cmd, NULL, 0, 0); if (need_wren) write_disable(nor); @@ -141,7 +141,7 @@ static inline int set_4byte(struct spi_nor *nor, u32 jedec_id, int enable) default: /* Spansion style */ nor->cmd_buf[0] = enable << 7; - return nor->write_reg(nor, OPCODE_BRWR, nor->cmd_buf, 1, 0); + return nor->write_reg(nor, SPINOR_OP_BRWR, nor->cmd_buf, 1, 0); } } @@ -193,7 +193,7 @@ static int erase_chip(struct spi_nor *nor) /* Send write enable, then erase commands. 
*/ write_enable(nor); - return nor->write_reg(nor, OPCODE_CHIP_ERASE, NULL, 0, 0); + return nor->write_reg(nor, SPINOR_OP_CHIP_ERASE, NULL, 0, 0); } static int spi_nor_lock_and_prep(struct spi_nor *nor, enum spi_nor_ops ops) @@ -253,7 +253,7 @@ static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr) } /* REVISIT in some cases we could speed up erasing large regions - * by using OPCODE_SE instead of OPCODE_BE_4K. We may have set up + * by using SPINOR_OP_SE instead of SPINOR_OP_BE_4K. We may have set up * to use "small sector erase", but that's not always optimal. */ @@ -385,7 +385,7 @@ struct flash_info { u32 jedec_id; u16 ext_id; - /* The size listed here is what works with OPCODE_SE, which isn't + /* The size listed here is what works with SPINOR_OP_SE, which isn't * necessarily called a "sector" by the vendor. */ unsigned sector_size; @@ -395,11 +395,11 @@ struct flash_info { u16 addr_width; u16 flags; -#define SECT_4K 0x01 /* OPCODE_BE_4K works uniformly */ +#define SECT_4K 0x01 /* SPINOR_OP_BE_4K works uniformly */ #define SPI_NOR_NO_ERASE 0x02 /* No erase command needed */ #define SST_WRITE 0x04 /* use SST byte programming */ #define SPI_NOR_NO_FR 0x08 /* Can't do fastread */ -#define SECT_4K_PMC 0x10 /* OPCODE_BE_4K_PMC works uniformly */ +#define SECT_4K_PMC 0x10 /* SPINOR_OP_BE_4K_PMC works uniformly */ #define SPI_NOR_DUAL_READ 0x20 /* Flash supports Dual Read */ #define SPI_NOR_QUAD_READ 0x40 /* Flash supports Quad Read */ }; @@ -598,7 +598,7 @@ static const struct spi_device_id *spi_nor_read_id(struct spi_nor *nor) u16 ext_jedec; struct flash_info *info; - tmp = nor->read_reg(nor, OPCODE_RDID, id, 5); + tmp = nor->read_reg(nor, SPINOR_OP_RDID, id, 5); if (tmp < 0) { dev_dbg(nor->dev, " error %d reading JEDEC ID\n", tmp); return ERR_PTR(tmp); @@ -670,7 +670,7 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, actual = to % 2; /* Start write from odd address. */ if (actual) { - nor->program_opcode = OPCODE_BP; + nor->program_opcode = SPINOR_OP_BP; /* write one byte. */ nor->write(nor, to, 1, retlen, buf); @@ -682,7 +682,7 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, /* Write out most of the data here. */ for (; actual < len - 1; actual += 2) { - nor->program_opcode = OPCODE_AAI_WP; + nor->program_opcode = SPINOR_OP_AAI_WP; /* write two bytes. 
*/ nor->write(nor, to, 2, retlen, buf + actual); @@ -703,7 +703,7 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, if (actual != len) { write_enable(nor); - nor->program_opcode = OPCODE_BP; + nor->program_opcode = SPINOR_OP_BP; nor->write(nor, to, 1, retlen, buf + actual); ret = wait_till_ready(nor); @@ -777,7 +777,7 @@ static int macronix_quad_enable(struct spi_nor *nor) write_enable(nor); nor->cmd_buf[0] = val | SR_QUAD_EN_MX; - nor->write_reg(nor, OPCODE_WRSR, nor->cmd_buf, 1, 0); + nor->write_reg(nor, SPINOR_OP_WRSR, nor->cmd_buf, 1, 0); if (wait_till_ready(nor)) return 1; @@ -802,7 +802,7 @@ static int write_sr_cr(struct spi_nor *nor, u16 val) nor->cmd_buf[0] = val & 0xff; nor->cmd_buf[1] = (val >> 8); - return nor->write_reg(nor, OPCODE_WRSR, nor->cmd_buf, 2, 0); + return nor->write_reg(nor, SPINOR_OP_WRSR, nor->cmd_buf, 2, 0); } static int spansion_quad_enable(struct spi_nor *nor) @@ -967,13 +967,13 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, /* prefer "small sector" erase if possible */ if (info->flags & SECT_4K) { - nor->erase_opcode = OPCODE_BE_4K; + nor->erase_opcode = SPINOR_OP_BE_4K; mtd->erasesize = 4096; } else if (info->flags & SECT_4K_PMC) { - nor->erase_opcode = OPCODE_BE_4K_PMC; + nor->erase_opcode = SPINOR_OP_BE_4K_PMC; mtd->erasesize = 4096; } else { - nor->erase_opcode = OPCODE_SE; + nor->erase_opcode = SPINOR_OP_SE; mtd->erasesize = info->sector_size; } @@ -1014,23 +1014,23 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, /* Default commands */ switch (nor->flash_read) { case SPI_NOR_QUAD: - nor->read_opcode = OPCODE_QUAD_READ; + nor->read_opcode = SPINOR_OP_QUAD_READ; break; case SPI_NOR_DUAL: - nor->read_opcode = OPCODE_DUAL_READ; + nor->read_opcode = SPINOR_OP_DUAL_READ; break; case SPI_NOR_FAST: - nor->read_opcode = OPCODE_FAST_READ; + nor->read_opcode = SPINOR_OP_FAST_READ; break; case SPI_NOR_NORMAL: - nor->read_opcode = OPCODE_NORM_READ; + nor->read_opcode = SPINOR_OP_NORM_READ; break; default: dev_err(dev, "No Read opcode defined\n"); return -EINVAL; } - nor->program_opcode = OPCODE_PP; + nor->program_opcode = SPINOR_OP_PP; if (info->addr_width) nor->addr_width = info->addr_width; @@ -1041,21 +1041,21 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, /* Dedicated 4-byte command set */ switch (nor->flash_read) { case SPI_NOR_QUAD: - nor->read_opcode = OPCODE_QUAD_READ_4B; + nor->read_opcode = SPINOR_OP_QUAD_READ_4B; break; case SPI_NOR_DUAL: - nor->read_opcode = OPCODE_DUAL_READ_4B; + nor->read_opcode = SPINOR_OP_DUAL_READ_4B; break; case SPI_NOR_FAST: - nor->read_opcode = OPCODE_FAST_READ_4B; + nor->read_opcode = SPINOR_OP_FAST_READ_4B; break; case SPI_NOR_NORMAL: - nor->read_opcode = OPCODE_NORM_READ_4B; + nor->read_opcode = SPINOR_OP_NORM_READ_4B; break; } - nor->program_opcode = OPCODE_PP_4B; + nor->program_opcode = SPINOR_OP_PP_4B; /* No small sector erase for 4-byte command set */ - nor->erase_opcode = OPCODE_SE_4B; + nor->erase_opcode = SPINOR_OP_SE_4B; mtd->erasesize = info->sector_size; } else set_4byte(nor, info->jedec_id, 1); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index a6e87190ead1..f1fe1a6659a3 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -11,41 +11,41 @@ #define __LINUX_MTD_SPI_NOR_H /* Flash opcodes. 
*/ -#define OPCODE_WREN 0x06 /* Write enable */ -#define OPCODE_RDSR 0x05 /* Read status register */ -#define OPCODE_WRSR 0x01 /* Write status register 1 byte */ -#define OPCODE_NORM_READ 0x03 /* Read data bytes (low frequency) */ -#define OPCODE_FAST_READ 0x0b /* Read data bytes (high frequency) */ -#define OPCODE_DUAL_READ 0x3b /* Read data bytes (Dual SPI) */ -#define OPCODE_QUAD_READ 0x6b /* Read data bytes (Quad SPI) */ -#define OPCODE_PP 0x02 /* Page program (up to 256 bytes) */ -#define OPCODE_BE_4K 0x20 /* Erase 4KiB block */ -#define OPCODE_BE_4K_PMC 0xd7 /* Erase 4KiB block on PMC chips */ -#define OPCODE_BE_32K 0x52 /* Erase 32KiB block */ -#define OPCODE_CHIP_ERASE 0xc7 /* Erase whole flash chip */ -#define OPCODE_SE 0xd8 /* Sector erase (usually 64KiB) */ -#define OPCODE_RDID 0x9f /* Read JEDEC ID */ -#define OPCODE_RDCR 0x35 /* Read configuration register */ +#define SPINOR_OP_WREN 0x06 /* Write enable */ +#define SPINOR_OP_RDSR 0x05 /* Read status register */ +#define SPINOR_OP_WRSR 0x01 /* Write status register 1 byte */ +#define SPINOR_OP_NORM_READ 0x03 /* Read data bytes (low frequency) */ +#define SPINOR_OP_FAST_READ 0x0b /* Read data bytes (high frequency) */ +#define SPINOR_OP_DUAL_READ 0x3b /* Read data bytes (Dual SPI) */ +#define SPINOR_OP_QUAD_READ 0x6b /* Read data bytes (Quad SPI) */ +#define SPINOR_OP_PP 0x02 /* Page program (up to 256 bytes) */ +#define SPINOR_OP_BE_4K 0x20 /* Erase 4KiB block */ +#define SPINOR_OP_BE_4K_PMC 0xd7 /* Erase 4KiB block on PMC chips */ +#define SPINOR_OP_BE_32K 0x52 /* Erase 32KiB block */ +#define SPINOR_OP_CHIP_ERASE 0xc7 /* Erase whole flash chip */ +#define SPINOR_OP_SE 0xd8 /* Sector erase (usually 64KiB) */ +#define SPINOR_OP_RDID 0x9f /* Read JEDEC ID */ +#define SPINOR_OP_RDCR 0x35 /* Read configuration register */ /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */ -#define OPCODE_NORM_READ_4B 0x13 /* Read data bytes (low frequency) */ -#define OPCODE_FAST_READ_4B 0x0c /* Read data bytes (high frequency) */ -#define OPCODE_DUAL_READ_4B 0x3c /* Read data bytes (Dual SPI) */ -#define OPCODE_QUAD_READ_4B 0x6c /* Read data bytes (Quad SPI) */ -#define OPCODE_PP_4B 0x12 /* Page program (up to 256 bytes) */ -#define OPCODE_SE_4B 0xdc /* Sector erase (usually 64KiB) */ +#define SPINOR_OP_NORM_READ_4B 0x13 /* Read data bytes (low frequency) */ +#define SPINOR_OP_FAST_READ_4B 0x0c /* Read data bytes (high frequency) */ +#define SPINOR_OP_DUAL_READ_4B 0x3c /* Read data bytes (Dual SPI) */ +#define SPINOR_OP_QUAD_READ_4B 0x6c /* Read data bytes (Quad SPI) */ +#define SPINOR_OP_PP_4B 0x12 /* Page program (up to 256 bytes) */ +#define SPINOR_OP_SE_4B 0xdc /* Sector erase (usually 64KiB) */ /* Used for SST flashes only. */ -#define OPCODE_BP 0x02 /* Byte program */ -#define OPCODE_WRDI 0x04 /* Write disable */ -#define OPCODE_AAI_WP 0xad /* Auto address increment word program */ +#define SPINOR_OP_BP 0x02 /* Byte program */ +#define SPINOR_OP_WRDI 0x04 /* Write disable */ +#define SPINOR_OP_AAI_WP 0xad /* Auto address increment word program */ /* Used for Macronix and Winbond flashes. */ -#define OPCODE_EN4B 0xb7 /* Enter 4-byte mode */ -#define OPCODE_EX4B 0xe9 /* Exit 4-byte mode */ +#define SPINOR_OP_EN4B 0xb7 /* Enter 4-byte mode */ +#define SPINOR_OP_EX4B 0xe9 /* Exit 4-byte mode */ /* Used for Spansion flashes only. */ -#define OPCODE_BRWR 0x17 /* Bank register write */ +#define SPINOR_OP_BRWR 0x17 /* Bank register write */ /* Status Register bits. 
*/ #define SR_WIP 1 /* Write in progress */ -- cgit v1.2.3-71-gd317 From 58b89a1f4c2a65b10b8f7b90b6ff2161b19bb0d1 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 8 Apr 2014 19:16:49 -0700 Subject: mtd: spi-nor: unify read opcode variants with ST SPI FSM serial_flash_cmds.h defines our opcodes a little differently. Let's borrow its naming, since it's borrowed from the SFDP standard, and it's more extensible. This prepares us for merging serial_flash_cmds.h and spi-nor.h opcode listing. Signed-off-by: Brian Norris Reviewed-by: Marek Vasut Acked-by: Huang Shijie --- drivers/mtd/spi-nor/fsl-quadspi.c | 6 +++--- drivers/mtd/spi-nor/spi-nor.c | 16 ++++++++-------- include/linux/mtd/spi-nor.h | 24 ++++++++++++++++-------- 3 files changed, 27 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index 2977f026f39d..b41bbbc531ff 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -294,12 +294,12 @@ static void fsl_qspi_init_lut(struct fsl_qspi *q) lut_base = SEQID_QUAD_READ * 4; if (q->nor_size <= SZ_16M) { - cmd = SPINOR_OP_QUAD_READ; + cmd = SPINOR_OP_READ_1_1_4; addrlen = ADDR24BIT; dummy = 8; } else { /* use the 4-byte address */ - cmd = SPINOR_OP_QUAD_READ; + cmd = SPINOR_OP_READ_1_1_4; addrlen = ADDR32BIT; dummy = 8; } @@ -388,7 +388,7 @@ static void fsl_qspi_init_lut(struct fsl_qspi *q) static int fsl_qspi_get_seqid(struct fsl_qspi *q, u8 cmd) { switch (cmd) { - case SPINOR_OP_QUAD_READ: + case SPINOR_OP_READ_1_1_4: return SEQID_QUAD_READ; case SPINOR_OP_WREN: return SEQID_WREN; diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 1716f3ce9949..d6f44d527701 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1014,16 +1014,16 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, /* Default commands */ switch (nor->flash_read) { case SPI_NOR_QUAD: - nor->read_opcode = SPINOR_OP_QUAD_READ; + nor->read_opcode = SPINOR_OP_READ_1_1_4; break; case SPI_NOR_DUAL: - nor->read_opcode = SPINOR_OP_DUAL_READ; + nor->read_opcode = SPINOR_OP_READ_1_1_2; break; case SPI_NOR_FAST: - nor->read_opcode = SPINOR_OP_FAST_READ; + nor->read_opcode = SPINOR_OP_READ_FAST; break; case SPI_NOR_NORMAL: - nor->read_opcode = SPINOR_OP_NORM_READ; + nor->read_opcode = SPINOR_OP_READ; break; default: dev_err(dev, "No Read opcode defined\n"); @@ -1041,16 +1041,16 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, /* Dedicated 4-byte command set */ switch (nor->flash_read) { case SPI_NOR_QUAD: - nor->read_opcode = SPINOR_OP_QUAD_READ_4B; + nor->read_opcode = SPINOR_OP_READ4_1_1_4; break; case SPI_NOR_DUAL: - nor->read_opcode = SPINOR_OP_DUAL_READ_4B; + nor->read_opcode = SPINOR_OP_READ4_1_1_2; break; case SPI_NOR_FAST: - nor->read_opcode = SPINOR_OP_FAST_READ_4B; + nor->read_opcode = SPINOR_OP_READ4_FAST; break; case SPI_NOR_NORMAL: - nor->read_opcode = SPINOR_OP_NORM_READ_4B; + nor->read_opcode = SPINOR_OP_READ4; break; } nor->program_opcode = SPINOR_OP_PP_4B; diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index f1fe1a6659a3..53241842a7ab 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -10,14 +10,22 @@ #ifndef __LINUX_MTD_SPI_NOR_H #define __LINUX_MTD_SPI_NOR_H +/* + * Note on opcode nomenclature: some opcodes have a format like + * SPINOR_OP_FUNCTION{4,}_x_y_z. 
The numbers x, y, and z stand for the number + * of I/O lines used for the opcode, address, and data (respectively). The + * FUNCTION has an optional suffix of '4', to represent an opcode which + * requires a 4-byte (32-bit) address. + */ + /* Flash opcodes. */ #define SPINOR_OP_WREN 0x06 /* Write enable */ #define SPINOR_OP_RDSR 0x05 /* Read status register */ #define SPINOR_OP_WRSR 0x01 /* Write status register 1 byte */ -#define SPINOR_OP_NORM_READ 0x03 /* Read data bytes (low frequency) */ -#define SPINOR_OP_FAST_READ 0x0b /* Read data bytes (high frequency) */ -#define SPINOR_OP_DUAL_READ 0x3b /* Read data bytes (Dual SPI) */ -#define SPINOR_OP_QUAD_READ 0x6b /* Read data bytes (Quad SPI) */ +#define SPINOR_OP_READ 0x03 /* Read data bytes (low frequency) */ +#define SPINOR_OP_READ_FAST 0x0b /* Read data bytes (high frequency) */ +#define SPINOR_OP_READ_1_1_2 0x3b /* Read data bytes (Dual SPI) */ +#define SPINOR_OP_READ_1_1_4 0x6b /* Read data bytes (Quad SPI) */ #define SPINOR_OP_PP 0x02 /* Page program (up to 256 bytes) */ #define SPINOR_OP_BE_4K 0x20 /* Erase 4KiB block */ #define SPINOR_OP_BE_4K_PMC 0xd7 /* Erase 4KiB block on PMC chips */ @@ -28,10 +36,10 @@ #define SPINOR_OP_RDCR 0x35 /* Read configuration register */ /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */ -#define SPINOR_OP_NORM_READ_4B 0x13 /* Read data bytes (low frequency) */ -#define SPINOR_OP_FAST_READ_4B 0x0c /* Read data bytes (high frequency) */ -#define SPINOR_OP_DUAL_READ_4B 0x3c /* Read data bytes (Dual SPI) */ -#define SPINOR_OP_QUAD_READ_4B 0x6c /* Read data bytes (Quad SPI) */ +#define SPINOR_OP_READ4 0x13 /* Read data bytes (low frequency) */ +#define SPINOR_OP_READ4_FAST 0x0c /* Read data bytes (high frequency) */ +#define SPINOR_OP_READ4_1_1_2 0x3c /* Read data bytes (Dual SPI) */ +#define SPINOR_OP_READ4_1_1_4 0x6c /* Read data bytes (Quad SPI) */ #define SPINOR_OP_PP_4B 0x12 /* Page program (up to 256 bytes) */ #define SPINOR_OP_SE_4B 0xdc /* Sector erase (usually 64KiB) */ -- cgit v1.2.3-71-gd317 From fdb9c293decf7e06795f7d9ae409df907c7ae1b6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 15 Apr 2014 12:39:14 -0500 Subject: percpu: Replace __get_cpu_var with this_cpu_ptr __this_cpu_ptr is being phased out. Use raw_cpu_ptr instead which was introduced in 3.15-rc1. One case of using __get_cpu_var in the get_cpu_var macro for address calculation was remaining in include/linux/percpu.h. tj: Updated patch description. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e7a0b95ed527..539b3caa5748 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -29,7 +29,7 @@ */ #define get_cpu_var(var) (*({ \ preempt_disable(); \ - &__get_cpu_var(var); })) + this_cpu_ptr(&var); })) /* * The weird & is necessary because sparse considers (void)(var) to be -- cgit v1.2.3-71-gd317 From 79f7ae7c45a6ccf04e2908337461dee615f6afb0 Mon Sep 17 00:00:00 2001 From: Seungwon Jeon Date: Fri, 14 Mar 2014 21:11:56 +0900 Subject: mmc: clarify DDR timing mode between SD-UHS and eMMC This change distinguishes DDR timing mode of current mixed usage to clarify device type. 
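To see why the split matters, consider a hypothetical host controller callback that must program different bus settings for the two DDR modes that previously shared MMC_TIMING_UHS_DDR50. The register encodings below are invented for illustration; only the MMC_TIMING_* constants come from <linux/mmc/host.h>.

/* Map the ios timing to an (invented) host timing-register encoding.
 * With the new constant, SD UHS-I DDR50 and eMMC DDR52 can be told
 * apart even though both are double-data-rate modes. */
static u32 example_timing_bits(const struct mmc_ios *ios)
{
	switch (ios->timing) {
	case MMC_TIMING_UHS_DDR50:
		return 0x4;	/* SD UHS-I DDR50: 1.8V signaling only */
	case MMC_TIMING_MMC_DDR52:
		return 0x5;	/* eMMC DDR at up to 52 MHz: 1.8V or 3.3V I/O */
	default:
		return 0x0;	/* single data rate */
	}
}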
Signed-off-by: Seungwon Jeon Acked-by: Jaehoon Chung Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/debugfs.c | 3 +++ drivers/mmc/core/mmc.c | 2 +- include/linux/mmc/host.h | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 54829c0ed000..509229b48b55 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -135,6 +135,9 @@ static int mmc_ios_show(struct seq_file *s, void *data) case MMC_TIMING_UHS_DDR50: str = "sd uhs DDR50"; break; + case MMC_TIMING_MMC_DDR52: + str = "mmc DDR52"; + break; case MMC_TIMING_MMC_HS200: str = "mmc high-speed SDR200"; break; diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 1ab5f3a0af5b..e22d8515ff97 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1264,7 +1264,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, goto err; } mmc_card_set_ddr_mode(card); - mmc_set_timing(card->host, MMC_TIMING_UHS_DDR50); + mmc_set_timing(card->host, MMC_TIMING_MMC_DDR52); mmc_set_bus_width(card->host, bus_width); } } diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index cb61ea4d6945..35354207e71f 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -58,7 +58,8 @@ struct mmc_ios { #define MMC_TIMING_UHS_SDR50 5 #define MMC_TIMING_UHS_SDR104 6 #define MMC_TIMING_UHS_DDR50 7 -#define MMC_TIMING_MMC_HS200 8 +#define MMC_TIMING_MMC_DDR52 8 +#define MMC_TIMING_MMC_HS200 9 #define MMC_SDR_MODE 0 #define MMC_1_2V_DDR_MODE 1 -- cgit v1.2.3-71-gd317 From af6b6967d6e17fe070c0fd1be364c34cbd31a523 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 16 Apr 2014 17:19:12 +0200 Subject: net: phy: export genphy_config_init() This enables other drivers to call this generic implementation, and then only do specific details on top of it. Signed-off-by: Daniel Mack Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 3 ++- include/linux/phy.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 0ce606624296..466ae3e06322 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1067,7 +1067,7 @@ int genphy_soft_reset(struct phy_device *phydev) } EXPORT_SYMBOL(genphy_soft_reset); -static int genphy_config_init(struct phy_device *phydev) +int genphy_config_init(struct phy_device *phydev) { int val; u32 features; @@ -1118,6 +1118,7 @@ static int gen10g_soft_reset(struct phy_device *phydev) /* Do nothing for now */ return 0; } +EXPORT_SYMBOL(genphy_config_init); static int gen10g_config_init(struct phy_device *phydev) { diff --git a/include/linux/phy.h b/include/linux/phy.h index 4d0221fd0688..51d15f684e7e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -666,6 +666,7 @@ static inline int phy_read_status(struct phy_device *phydev) return phydev->drv->read_status(phydev); } +int genphy_config_init(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); int genphy_config_aneg(struct phy_device *phydev); -- cgit v1.2.3-71-gd317 From a0265d28b3a5877b5b8edd14eb12a2ccb60ab1f3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 17 Apr 2014 13:45:03 +0800 Subject: net: Add __dev_forward_skb This patch adds the helper __dev_forward_skb which is identical to dev_forward_skb except that it doesn't actually inject the skb into the stack. This is useful where we wish to have finer control over how the packet is injected, e.g., via netif_rx_ni or netif_receive_skb. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 42 ++++++++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ed3a3aa6604..a803d792df1e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2633,6 +2633,7 @@ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index d2c8a06b3a98..11d70e3afefa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1661,6 +1661,29 @@ bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(is_skb_forwardable); +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->protocol = eth_type_trans(skb, dev); + + return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); + /** * dev_forward_skb - loopback an skb to another netif * @@ -1681,24 +1704,7 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & 
SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - - skb_scrub_packet(skb, true); - skb->protocol = eth_type_trans(skb, dev); - - return netif_rx_internal(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); -- cgit v1.2.3-71-gd317 From 4104d326b670c2b66f575d2004daa28b2d1b4c8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 17:01:58 -0500 Subject: ftrace: Remove global function list and call function directly Instead of having a list of global functions that are called, as only one global function is allow to be enabled at a time, there's no reason to have a list. Instead, simply have all the users of the global ops, use the global ops directly, instead of registering their own ftrace_ops. Just switch what function is used before enabling the function tracer. This removes a lot of code as well as the complexity involved with it. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 20 ++--- kernel/trace/ftrace.c | 152 +++++++++++--------------------------- kernel/trace/trace.c | 2 + kernel/trace/trace.h | 19 +++-- kernel/trace/trace_functions.c | 55 +++++--------- kernel/trace/trace_irqsoff.c | 33 ++++----- kernel/trace/trace_sched_wakeup.c | 40 +++++----- kernel/trace/trace_selftest.c | 33 +++++---- 8 files changed, 133 insertions(+), 221 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9212b017bc72..f0ff2c2453e7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -62,9 +62,6 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * set in the flags member. * * ENABLED - set/unset when ftrace_ops is registered/unregistered - * GLOBAL - set manualy by ftrace_ops user to denote the ftrace_ops - * is part of the global tracers sharing the same filter - * via set_ftrace_* debugfs files. 
* DYNAMIC - set when ftrace_ops is registered to denote dynamically * allocated ftrace_ops which need special care * CONTROL - set manualy by ftrace_ops user to denote the ftrace_ops @@ -96,15 +93,14 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, - FTRACE_OPS_FL_GLOBAL = 1 << 1, - FTRACE_OPS_FL_DYNAMIC = 1 << 2, - FTRACE_OPS_FL_CONTROL = 1 << 3, - FTRACE_OPS_FL_SAVE_REGS = 1 << 4, - FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, - FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, - FTRACE_OPS_FL_STUB = 1 << 7, - FTRACE_OPS_FL_INITIALIZED = 1 << 8, - FTRACE_OPS_FL_DELETED = 1 << 9, + FTRACE_OPS_FL_DYNAMIC = 1 << 1, + FTRACE_OPS_FL_CONTROL = 1 << 2, + FTRACE_OPS_FL_SAVE_REGS = 1 << 3, + FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 4, + FTRACE_OPS_FL_RECURSION_SAFE = 1 << 5, + FTRACE_OPS_FL_STUB = 1 << 6, + FTRACE_OPS_FL_INITIALIZED = 1 << 7, + FTRACE_OPS_FL_DELETED = 1 << 8, }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1fd4b9479210..8f61ef70a297 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,7 +62,7 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_REGEX_LOCK(opsname) \ @@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; @@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -static void -ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - int bit; - - bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); - if (bit < 0) - return; - - do_for_each_ftrace_op(op, ftrace_global_list) { - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - - trace_clear_recursion(bit); -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void update_global_ops(void) -{ - ftrace_func_t func = ftrace_global_list_func; - void *private = NULL; - - /* The list has its own recursion protection. */ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - - /* - * If there's only one function registered, then call that - * function directly. Otherwise, we need to iterate over the - * registered callers. - */ - if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) { - func = ftrace_global_list->func; - private = ftrace_global_list->private; - /* - * As we are calling the function directly. - * If it does not have recursion protection, - * the function_trace_op needs to be updated - * accordingly. 
- */ - if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } - - /* If we filter on pids, update to use the pid function */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - global_ops.func = func; - global_ops.private = private; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -301,8 +246,6 @@ static void update_ftrace_function(void) { ftrace_func_t func; - update_global_ops(); - /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, @@ -314,10 +257,7 @@ static void update_ftrace_function(void) (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ - if (ftrace_ops_list == &global_ops) - set_function_trace_op = ftrace_global_list; - else - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ @@ -434,16 +374,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; - /* We don't support both control and global flags set. */ - if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) - return -EINVAL; - #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used @@ -461,10 +394,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); - ops->flags |= FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); @@ -484,15 +414,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_list_ops(&ftrace_global_list, - &global_ops, ops); - if (!ret) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else @@ -2128,15 +2050,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - /* ops marked global share the filter hashes */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - /* Don't update hash if global is already set */ - if (global_start_up) - hash_enable = false; - global_start_up++; - } - ops->flags |= FTRACE_OPS_FL_ENABLED; if (hash_enable) ftrace_hash_rec_enable(ops, 1); @@ -2166,21 +2079,10 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - global_start_up--; - WARN_ON_ONCE(global_start_up < 0); - /* Don't update hash if global still has users */ - if (global_start_up) { - WARN_ON_ONCE(!ftrace_start_up); - hash_disable = false; - } - } - if 
(hash_disable) ftrace_hash_rec_disable(ops, 1); - if (ops != &global_ops || !global_start_up) + if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -3524,10 +3426,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, struct ftrace_hash *hash; int ret; - /* All global ops uses the global ops filters */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) - ops = &global_ops; - if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4462,6 +4360,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + /* Only the top level instance does pid tracing */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + static void ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -4520,9 +4446,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) + if (ftrace_ops_test(op, ip, regs)) { + if (WARN_ON(!op->func)) { + function_trace_stop = 1; + printk("op=%p %pS\n", op, op); + goto out; + } op->func(ip, parent_ip, op, regs); + } } while_for_each_ftrace_op(op); +out: preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5076,8 +5009,7 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, /* Just a place holder for function graph */ static struct ftrace_ops fgraph_ops __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | - FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_RECURSION_SAFE, }; static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 737b0efa1a62..fdd33aacdf05 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6629,6 +6629,8 @@ __init static int tracer_alloc_buffers(void) */ global_trace.current_trace = &nop_trace; + ftrace_init_global_array_ops(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2e29d7ba5a52..df5256be64cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -416,13 +416,7 @@ enum { TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, - /* GLOBAL_BITs must be greater than FTRACE_BITs */ - TRACE_GLOBAL_BIT, - TRACE_GLOBAL_NMI_BIT, - TRACE_GLOBAL_IRQ_BIT, - TRACE_GLOBAL_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + /* INTERNAL_BITs must be greater than FTRACE_BITs */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, @@ -449,9 +443,6 @@ enum { #define TRACE_FTRACE_START TRACE_FTRACE_BIT #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT -#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) 
- 1) - #define TRACE_LIST_START TRACE_INTERNAL_BIT #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) @@ -823,6 +814,9 @@ extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); #else static inline int ftrace_trace_task(struct task_struct *task) { @@ -836,6 +830,11 @@ ftrace_create_function_files(struct trace_array *tr, return 0; } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index ffd56351b521..2d9482b8f26a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs); -static struct ftrace_ops trace_ops; -static struct ftrace_ops trace_stack_ops; static struct tracer_flags func_flags; /* Our option */ @@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { - struct ftrace_ops *ops; - - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { - /* There's only one global tr */ - if (!trace_ops.private) { - trace_ops.private = tr; - trace_stack_ops.private = tr; - } + ftrace_func_t func; - if (func_flags.val & TRACE_FUNC_OPT_STACK) - ops = &trace_stack_ops; - else - ops = &trace_ops; - tr->ops = ops; - } else if (!tr->ops) { - /* - * Instance trace_arrays get their ops allocated - * at instance creation. Unless it failed - * the allocation. - */ + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. 
+ */ + if (!tr->ops) return -ENOMEM; - } + + /* Currently only the global instance can do stack tracing */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && + func_flags.val & TRACE_FUNC_OPT_STACK) + func = function_stack_trace_call; + else + func = function_trace_call; + + ftrace_init_array_ops(tr, func); tr->trace_buffer.cpu = get_cpu(); put_cpu(); @@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr) { tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); } static void function_trace_start(struct trace_array *tr) @@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ - .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) unregister_ftrace_function(tr->ops); if (set) { - tr->ops = &trace_stack_ops; + tr->ops->func = function_stack_trace_call; register_ftrace_function(tr->ops); } else { - tr->ops = &trace_ops; + tr->ops->func = function_trace_call; register_ftrace_function(tr->ops); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8ff02cbb892f..b5cb047df3e9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int register_irqsoff_function(int graph, int set) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; @@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set) return ret; } -static void unregister_irqsoff_function(int graph) +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void irqsoff_function_set(int set) +static void irqsoff_function_set(struct trace_array *tr, int set) { if (set) - register_irqsoff_function(is_graph(), 1); + register_irqsoff_function(tr, is_graph(), 1); else - unregister_irqsoff_function(is_graph()); + unregister_irqsoff_function(tr, is_graph()); } static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer 
= tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(set); + irqsoff_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } @@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_irqsoff_function(graph, 0); + ret = register_irqsoff_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -600,7 +594,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_irqsoff_function(graph); + unregister_irqsoff_function(tr, graph); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -617,7 +611,11 @@ static void __irqsoff_tracer_init(struct trace_array *tr) smp_wmb(); tracing_reset_online_cpus(&tr->trace_buffer); - if (start_irqsoff_tracer(tr, is_graph())) + ftrace_init_array_ops(tr, irqsoff_tracer_call); + + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); } @@ -630,6 +628,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e14da5e97a69..4dd986defa60 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ -static int register_wakeup_function(int graph, int set) +static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; @@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set) return ret; } -static void unregister_wakeup_function(int graph) +static void unregister_wakeup_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void wakeup_function_set(int set) +static void wakeup_function_set(struct trace_array *tr, int set) { if (set) - register_wakeup_function(is_graph(), 1); + register_wakeup_function(tr, is_graph(), 1); else - unregister_wakeup_function(is_graph()); + unregister_wakeup_function(tr, is_graph()); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(set); + wakeup_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } -static int start_func_tracer(int graph) +static int start_func_tracer(struct trace_array *tr, 
int graph) { int ret; - ret = register_wakeup_function(graph, 0); + ret = register_wakeup_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -203,11 +197,11 @@ static int start_func_tracer(int graph) return ret; } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_wakeup_function(graph); + unregister_wakeup_function(tr, graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!(is_graph() ^ set)) return 0; - stop_func_tracer(!set); + stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); tracing_max_latency = 0; - return start_func_tracer(set); + return start_func_tracer(tr, set); } static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(is_graph())) + if (start_func_tracer(tr, is_graph())) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -600,7 +594,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(is_graph()); + stop_func_tracer(tr, is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -617,6 +611,7 @@ static int __wakeup_tracer_init(struct trace_array *tr) tracing_max_latency = 0; wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); return 0; } @@ -653,6 +648,7 @@ static void wakeup_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); } static void wakeup_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974f..519d04affe38 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = { .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; -static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static void print_counts(void) { printk("(%d %d %d %d %d) ", @@ -185,7 +180,7 @@ static void reset_counts(void) trace_selftest_test_dyn_cnt = 0; } -static int trace_selftest_ops(int cnt) +static int trace_selftest_ops(struct trace_array *tr, int cnt) { int save_ftrace_enabled = ftrace_enabled; struct ftrace_ops *dyn_ops; @@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt) register_ftrace_function(&test_probe1); register_ftrace_function(&test_probe2); register_ftrace_function(&test_probe3); - register_ftrace_function(&test_global); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } DYN_FTRACE_TEST_NAME(); @@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt) goto out; if (trace_selftest_test_probe3_cnt != 1) goto out; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } DYN_FTRACE_TEST_NAME2(); @@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt) goto out_free; if (trace_selftest_test_probe3_cnt != 3) goto out_free; - if (trace_selftest_test_global_cnt 
== 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } if (trace_selftest_test_dyn_cnt == 0) goto out_free; @@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt) unregister_ftrace_function(&test_probe1); unregister_ftrace_function(&test_probe2); unregister_ftrace_function(&test_probe3); - unregister_ftrace_function(&test_global); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); /* Make sure everything is off */ reset_counts(); @@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, } /* Test the ops with global tracing running */ - ret = trace_selftest_ops(1); + ret = trace_selftest_ops(tr, 1); trace->reset(tr); out: @@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* Test the ops with global tracing off */ if (!ret) - ret = trace_selftest_ops(2); + ret = trace_selftest_ops(tr, 2); return ret; } -- cgit v1.2.3-71-gd317
From 68957303f44a501af5cf37913208a2acaa6bcdf1 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 25 Mar 2014 06:51:47 +0100 Subject: NFC: ST21NFCA: Add driver for STMicroelectronics ST21NFCA NFC Chip
Add driver for the STMicroelectronics ST21NFCA NFC controller. ST21NFCA uses the HCI protocol, SHDLC as the LLC layer and I2C as the communication protocol. Add support for Reader/Writer mode with tag types 1/2/3/4 A & B. It uses proprietary gate 15 for ISO14443-3, such as type 1 & type 2 tags. It uses proprietary gate 14 for type F tags. ST21NFCA_DEVICE_MGNT_GATE gives access to proprietary CLF configuration. The standard gates for ISO14443-4 A (13) & B (11) are also used.
ST21NFCA specific mechanism: one particular point to note for data handling is that frames do not contain any length value. Therefore the i2c part of this driver manages reception with a read-length sequence until the end of frame (0x7e) is reached. To avoid conflicts between SOF and EOF, a mechanism called byte stuffing inserts an escape byte (0x7d) before each special byte (0x7e, 0x7d); the special byte is then XORed with 0x20. In this driver, when data are available in the CLF, the interrupt gpio is driven to the active state and triggers an interrupt. Once i2c_master_recv() starts, the interrupt gpio is driven to the idle state until the transfer is complete. If the frame is incomplete or data are still available, interrupts will be triggered again.
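To make the framing concrete, here is a minimal sketch of the unstuffing step described above. The names and buffer handling are illustrative assumptions, not the driver's actual symbols (assumes <linux/types.h>).

#define EXAMPLE_SOF_EOF	0x7e	/* frame delimiter */
#define EXAMPLE_ESCAPE	0x7d	/* escape marker */

/* Recover the original payload from a stuffed frame: an escape byte
 * means "XOR the next byte with 0x20"; the 0x7e delimiter itself
 * carries no payload. Returns the number of payload bytes written. */
static size_t example_unstuff(const u8 *in, size_t len, u8 *out)
{
	size_t i, n = 0;

	for (i = 0; i < len; i++) {
		if (in[i] == EXAMPLE_ESCAPE && i + 1 < len)
			out[n++] = in[++i] ^ 0x20;
		else if (in[i] != EXAMPLE_SOF_EOF)
			out[n++] = in[i];
	}
	return n;
}

Under this scheme a payload byte 0x7e travels on the wire as the pair 0x7d 0x5e, and 0x7d as 0x7d 0x5d, so the receiver never mistakes payload for the end-of-frame marker.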
Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- drivers/nfc/Kconfig | 1 + drivers/nfc/Makefile | 1 + drivers/nfc/st21nfca/Kconfig | 23 ++ drivers/nfc/st21nfca/Makefile | 8 + drivers/nfc/st21nfca/i2c.c | 595 +++++++++++++++++++++++++++++++++ drivers/nfc/st21nfca/st21nfca.c | 506 ++++++++++++++++++++++++++++ drivers/nfc/st21nfca/st21nfca.h | 87 +++++ include/linux/platform_data/st21nfca.h | 32 ++ 8 files changed, 1253 insertions(+) create mode 100644 drivers/nfc/st21nfca/Kconfig create mode 100644 drivers/nfc/st21nfca/Makefile create mode 100644 drivers/nfc/st21nfca/i2c.c create mode 100644 drivers/nfc/st21nfca/st21nfca.c create mode 100644 drivers/nfc/st21nfca/st21nfca.h create mode 100644 include/linux/platform_data/st21nfca.h (limited to 'include/linux') diff --git a/drivers/nfc/Kconfig b/drivers/nfc/Kconfig index 65d4ca19d132..26c66a126551 100644 --- a/drivers/nfc/Kconfig +++ b/drivers/nfc/Kconfig @@ -71,5 +71,6 @@ config NFC_PORT100 source "drivers/nfc/pn544/Kconfig" source "drivers/nfc/microread/Kconfig" source "drivers/nfc/nfcmrvl/Kconfig" +source "drivers/nfc/st21nfca/Kconfig" endmenu diff --git a/drivers/nfc/Makefile b/drivers/nfc/Makefile index ae42a3fa60c9..23225b0287fd 100644 --- a/drivers/nfc/Makefile +++ b/drivers/nfc/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_NFC_SIM) += nfcsim.o obj-$(CONFIG_NFC_PORT100) += port100.o obj-$(CONFIG_NFC_MRVL) += nfcmrvl/ obj-$(CONFIG_NFC_TRF7970A) += trf7970a.o +obj-$(CONFIG_NFC_ST21NFCA) += st21nfca/ ccflags-$(CONFIG_NFC_DEBUG) := -DDEBUG diff --git a/drivers/nfc/st21nfca/Kconfig b/drivers/nfc/st21nfca/Kconfig new file mode 100644 index 000000000000..ee459f066ade --- /dev/null +++ b/drivers/nfc/st21nfca/Kconfig @@ -0,0 +1,23 @@ +config NFC_ST21NFCA + tristate "STMicroelectronics ST21NFCA NFC driver" + depends on NFC_HCI + select CRC_CCITT + default n + ---help--- + STMicroelectronics ST21NFCA core driver. It implements the chipset + HCI logic and hooks into the NFC kernel APIs. Physical layers will + register against it. + + To compile this driver as a module, choose m here. The module will + be called st21nfca. + Say N if unsure. + +config NFC_ST21NFCA_I2C + tristate "NFC ST21NFCA i2c support" + depends on NFC_ST21NFCA && I2C && NFC_SHDLC + ---help--- + This module adds support for the STMicroelectronics st21nfca i2c interface. + Select this if your platform is using the i2c bus. + + If you choose to build a module, it'll be called st21nfca_i2c. + Say N if unsure. diff --git a/drivers/nfc/st21nfca/Makefile b/drivers/nfc/st21nfca/Makefile new file mode 100644 index 000000000000..038ed093a119 --- /dev/null +++ b/drivers/nfc/st21nfca/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for ST21NFCA HCI based NFC driver +# + +st21nfca_i2c-objs = i2c.o + +obj-$(CONFIG_NFC_ST21NFCA) += st21nfca.o +obj-$(CONFIG_NFC_ST21NFCA_I2C) += st21nfca_i2c.o diff --git a/drivers/nfc/st21nfca/i2c.c b/drivers/nfc/st21nfca/i2c.c new file mode 100644 index 000000000000..3b0fd0f76d1c --- /dev/null +++ b/drivers/nfc/st21nfca/i2c.c @@ -0,0 +1,595 @@ +/* + * I2C Link Layer for ST21NFCA HCI based Driver + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "st21nfca.h" + +/* + * Every frame starts with ST21NFCA_SOF_EOF and ends with ST21NFCA_SOF_EOF. + * Because ST21NFCA_SOF_EOF is a possible data value, there is a mecanism + * called byte stuffing has been introduced. + * + * if byte == ST21NFCA_SOF_EOF or ST21NFCA_ESCAPE_BYTE_STUFFING + * - insert ST21NFCA_ESCAPE_BYTE_STUFFING (escape byte) + * - xor byte with ST21NFCA_BYTE_STUFFING_MASK + */ +#define ST21NFCA_SOF_EOF 0x7e +#define ST21NFCA_BYTE_STUFFING_MASK 0x20 +#define ST21NFCA_ESCAPE_BYTE_STUFFING 0x7d + +/* SOF + 00 fill size */ +#define ST21NFCA_FRAME_HEADROOM 2 + +/* 4 bytes crc (worst case byte stuffing) + EOF */ +#define ST21NFCA_FRAME_TAILROOM 5 + +#define ST21NFCA_HCI_I2C_DRIVER_NAME "st21nfca_hci_i2c" + +static struct i2c_device_id st21nfca_hci_i2c_id_table[] = { + {ST21NFCA_HCI_DRIVER_NAME, 0}, + {} +}; + +MODULE_DEVICE_TABLE(i2c, st21nfca_hci_i2c_id_table); + +struct st21nfca_i2c_phy { + struct i2c_client *i2c_dev; + struct nfc_hci_dev *hdev; + + unsigned int gpio_ena; + unsigned int gpio_irq; + unsigned int irq_polarity; + + struct sk_buff *pending_skb; + int current_read_len; + /* + * crc might have fail because i2c macro + * is disable due to other interface activity + */ + int crc_trials; + + int powered; + int run_mode; + + /* + * < 0 if hardware error occured (e.g. i2c err) + * and prevents normal operation. + */ + int hard_fault; +}; +static u8 len_seq[] = { 13, 24, 15, 29 }; +static u16 wait_tab[] = { 2, 3, 5, 15, 20, 40}; + +#define I2C_DUMP_SKB(info, skb) \ +do { \ + pr_debug("%s:\n", info); \ + print_hex_dump(KERN_DEBUG, "i2c: ", DUMP_PREFIX_OFFSET, \ + 16, 1, (skb)->data, (skb)->len, 0); \ +} while (0) + +static void st21nfca_hci_platform_init(struct st21nfca_i2c_phy *phy) +{ + u16 wait_tab[] = { 50, 300, 1000 }; + char reboot_cmd[] = { 0x7E, 0x66, 0x48, 0xF6, 0x7E }; + u8 tmp[ST21NFCA_HCI_LLC_MAX_SIZE]; + int i, r = -1; + + for (i = 0; i < ARRAY_SIZE(wait_tab) && r < 0; i++) + r = i2c_master_recv(phy->i2c_dev, tmp, + ST21NFCA_HCI_LLC_MAX_SIZE); + + r = -1; + for (i = 0; i < ARRAY_SIZE(wait_tab) && r < 0; i++) + r = i2c_master_send(phy->i2c_dev, reboot_cmd, + sizeof(reboot_cmd)); + usleep_range(1000, 1500); + +} + +static int st21nfca_hci_i2c_enable(void *phy_id) +{ + struct st21nfca_i2c_phy *phy = phy_id; + + gpio_set_value(phy->gpio_ena, 1); + phy->powered = 1; + phy->run_mode = ST21NFCA_HCI_MODE; + + usleep_range(10000, 15000); + + return 0; +} + +static void st21nfca_hci_i2c_disable(void *phy_id) +{ + struct st21nfca_i2c_phy *phy = phy_id; + + pr_info("\n"); + gpio_set_value(phy->gpio_ena, 0); + + phy->powered = 0; +} + +static int st21nfca_hci_add_len_crc(struct sk_buff *skb) +{ + int ret = 2; + u16 crc; + u8 tmp; + + *skb_push(skb, 1) = 0; + + crc = crc_ccitt(0xffff, skb->data, skb->len); + crc = ~crc; + + tmp = crc & 0x00ff; + *skb_put(skb, 1) = tmp; + + tmp = (crc >> 8) & 0x00ff; + *skb_put(skb, 1) = tmp; + + return ret; +} + +static void st21nfca_hci_remove_len_crc(struct sk_buff *skb, int crc_len) +{ + skb_pull(skb, ST21NFCA_FRAME_HEADROOM); + skb_trim(skb, crc_len); +} + +/* + * Writing a frame must not return the number of written bytes. 
+ * It must return either zero for success, or <0 for error. + * In addition, it must not alter the skb + */ +static int st21nfca_hci_i2c_write(void *phy_id, struct sk_buff *skb) +{ + int r = -1, i, j, len; + struct st21nfca_i2c_phy *phy = phy_id; + struct i2c_client *client = phy->i2c_dev; + u16 wait_tab[] = { 2, 3, 5, 15, 20, 40}; + u8 tmp[ST21NFCA_HCI_LLC_MAX_SIZE * 2]; + + I2C_DUMP_SKB("st21nfca_hci_i2c_write", skb); + + + if (phy->hard_fault != 0) + return phy->hard_fault; + + /* + * Compute CRC before byte stuffing computation on frame + * Note st21nfca_hci_add_len_crc is doing a byte stuffing + * on its own value + */ + len = st21nfca_hci_add_len_crc(skb); + + /* add ST21NFCA_SOF_EOF on tail */ + *skb_put(skb, 1) = ST21NFCA_SOF_EOF; + /* add ST21NFCA_SOF_EOF on head */ + *skb_push(skb, 1) = ST21NFCA_SOF_EOF; + + /* + * Compute byte stuffing + * if byte == ST21NFCA_SOF_EOF or ST21NFCA_ESCAPE_BYTE_STUFFING + * insert ST21NFCA_ESCAPE_BYTE_STUFFING (escape byte) + * xor byte with ST21NFCA_BYTE_STUFFING_MASK + */ + tmp[0] = skb->data[0]; + for (i = 1, j = 1; i < skb->len - 1; i++, j++) { + if (skb->data[i] == ST21NFCA_SOF_EOF + || skb->data[i] == ST21NFCA_ESCAPE_BYTE_STUFFING) { + tmp[j] = ST21NFCA_ESCAPE_BYTE_STUFFING; + j++; + tmp[j] = skb->data[i] ^ ST21NFCA_BYTE_STUFFING_MASK; + } else { + tmp[j] = skb->data[i]; + } + } + tmp[j] = skb->data[i]; + j++; + + /* + * Manage sleep mode + * Try 3 times to send data with delay between each + */ + for (i = 0; i < ARRAY_SIZE(wait_tab) && r < 0; i++) { + r = i2c_master_send(client, tmp, j); + if (r < 0) + msleep(wait_tab[i]); + } + + if (r >= 0) { + if (r != j) + r = -EREMOTEIO; + else + r = 0; + } + + st21nfca_hci_remove_len_crc(skb, len); + + return r; +} + +static int get_frame_size(u8 *buf, int buflen) +{ + int len = 0; + if (buf[len + 1] == ST21NFCA_SOF_EOF) + return 0; + + for (len = 1; len < buflen && buf[len] != ST21NFCA_SOF_EOF; len++) + ; + + return len; +} + +static int check_crc(u8 *buf, int buflen) +{ + u16 crc; + + crc = crc_ccitt(0xffff, buf, buflen - 2); + crc = ~crc; + + if (buf[buflen - 2] != (crc & 0xff) || buf[buflen - 1] != (crc >> 8)) { + pr_err(ST21NFCA_HCI_DRIVER_NAME + ": CRC error 0x%x != 0x%x 0x%x\n", crc, buf[buflen - 1], + buf[buflen - 2]); + + pr_info(DRIVER_DESC ": %s : BAD CRC\n", __func__); + print_hex_dump(KERN_DEBUG, "crc: ", DUMP_PREFIX_NONE, + 16, 2, buf, buflen, false); + return -EPERM; + } + return 0; +} + +/* + * Prepare received data for upper layer. + * Received data include byte stuffing, crc and sof/eof + * which is not usable by hci part. + * returns: + * frame size without sof/eof, header and byte stuffing + * -EBADMSG : frame was incorrect and discarded + */ +static int st21nfca_hci_i2c_repack(struct sk_buff *skb) +{ + int i, j, r, size; + if (skb->len < 1 || (skb->len > 1 && skb->data[1] != 0)) + return -EBADMSG; + + size = get_frame_size(skb->data, skb->len); + if (size > 0) { + skb_trim(skb, size); + /* remove ST21NFCA byte stuffing for upper layer */ + for (i = 1, j = 0; i < skb->len; i++) { + if (skb->data[i] == + (u8) ST21NFCA_ESCAPE_BYTE_STUFFING) { + skb->data[i] = + skb->data[i + + 1] | ST21NFCA_BYTE_STUFFING_MASK; + i++; + j++; + } + skb->data[i] = skb->data[i + j]; + } + /* remove byte stuffing useless byte */ + skb_trim(skb, i - j); + /* remove ST21NFCA_SOF_EOF from head */ + skb_pull(skb, 1); + + r = check_crc(skb->data, skb->len); + if (r != 0) { + i = 0; + return -EBADMSG; + } + + /* remove headbyte */ + skb_pull(skb, 1); + /* remove crc. 
Byte Stuffing is already removed here */ + skb_trim(skb, skb->len - 2); + return skb->len; + } + return 0; +} + +/* + * Reads an shdlc frame and returns it in a newly allocated sk_buff. Guarantees + * that i2c bus will be flushed and that next read will start on a new frame. + * returned skb contains only LLC header and payload. + * returns: + * frame size : if received frame is complete (find ST21NFCA_SOF_EOF at + * end of read) + * -EAGAIN : if received frame is incomplete (not find ST21NFCA_SOF_EOF + * at end of read) + * -EREMOTEIO : i2c read error (fatal) + * -EBADMSG : frame was incorrect and discarded + * (value returned from st21nfca_hci_i2c_repack) + * -EIO : if no ST21NFCA_SOF_EOF is found after reaching + * the read length end sequence + */ +static int st21nfca_hci_i2c_read(struct st21nfca_i2c_phy *phy, + struct sk_buff *skb) +{ + int r, i; + u8 len; + struct i2c_client *client = phy->i2c_dev; + + if (phy->current_read_len < ARRAY_SIZE(len_seq)) { + len = len_seq[phy->current_read_len]; + + /* + * Add retry mecanism + * Operation on I2C interface may fail in case of operation on + * RF or SWP interface + */ + r = 0; + for (i = 0; i < ARRAY_SIZE(wait_tab) && r <= 0; i++) { + r = i2c_master_recv(client, skb_put(skb, len), len); + if (r < 0) + msleep(wait_tab[i]); + } + + if (r != len) { + phy->current_read_len = 0; + return -EREMOTEIO; + } + + if (memchr(skb->data + 2, ST21NFCA_SOF_EOF, + skb->len - 2) != NULL) { + phy->current_read_len = 0; + return st21nfca_hci_i2c_repack(skb); + } + phy->current_read_len++; + return -EAGAIN; + } + return -EIO; +} + +/* + * Reads an shdlc frame from the chip. This is not as straightforward as it + * seems. The frame format is data-crc, and corruption can occur anywhere + * while transiting on i2c bus, such that we could read an invalid data. + * The tricky case is when we read a corrupted data or crc. We must detect + * this here in order to determine that data can be transmitted to the hci + * core. This is the reason why we check the crc here. + * The CLF will repeat a frame until we send a RR on that frame. + * + * On ST21NFCA, IRQ goes in idle when read starts. As no size information are + * available in the incoming data, other IRQ might come. Every IRQ will trigger + * a read sequence with different length and will fill the current frame. + * The reception is complete once we reach a ST21NFCA_SOF_EOF. + */ +static irqreturn_t st21nfca_hci_irq_thread_fn(int irq, void *phy_id) +{ + struct st21nfca_i2c_phy *phy = phy_id; + struct i2c_client *client; + + int r; + + if (!phy || irq != phy->i2c_dev->irq) { + WARN_ON_ONCE(1); + return IRQ_NONE; + } + + client = phy->i2c_dev; + dev_dbg(&client->dev, "IRQ\n"); + + if (phy->hard_fault != 0) + return IRQ_HANDLED; + + r = st21nfca_hci_i2c_read(phy, phy->pending_skb); + if (r == -EREMOTEIO) { + phy->hard_fault = r; + + nfc_hci_recv_frame(phy->hdev, NULL); + + return IRQ_HANDLED; + } else if (r == -EAGAIN || r == -EIO) { + return IRQ_HANDLED; + } else if (r == -EBADMSG && phy->crc_trials < ARRAY_SIZE(wait_tab)) { + /* + * With ST21NFCA, only one interface (I2C, RF or SWP) + * may be active at a time. + * Having incorrect crc is usually due to i2c macrocell + * deactivation in the middle of a transmission. + * It may generate corrupted data on i2c. + * We give sometime to get i2c back. + * The complete frame will be repeated. 
+ */ + msleep(wait_tab[phy->crc_trials]); + phy->crc_trials++; + phy->current_read_len = 0; + } else if (r > 0) { + /* + * We succeeded to read data from the CLF and + * data is valid. + * Reset counter. + */ + nfc_hci_recv_frame(phy->hdev, phy->pending_skb); + phy->crc_trials = 0; + } + + phy->pending_skb = alloc_skb(ST21NFCA_HCI_LLC_MAX_SIZE * 2, GFP_KERNEL); + if (phy->pending_skb == NULL) { + phy->hard_fault = -ENOMEM; + nfc_hci_recv_frame(phy->hdev, NULL); + } + + return IRQ_HANDLED; +} + +static struct nfc_phy_ops i2c_phy_ops = { + .write = st21nfca_hci_i2c_write, + .enable = st21nfca_hci_i2c_enable, + .disable = st21nfca_hci_i2c_disable, +}; + +static int st21nfca_request_resources(struct st21nfca_i2c_phy *phy, + struct i2c_client *client) +{ + struct st21nfca_nfc_platform_data *pdata; + int r; + + pdata = client->dev.platform_data; + if (pdata == NULL) { + nfc_err(&client->dev, "No platform data\n"); + return -EINVAL; + } + + /* store for later use */ + phy->gpio_irq = pdata->gpio_irq; + phy->gpio_ena = pdata->gpio_ena; + phy->irq_polarity = pdata->irq_polarity; + phy->i2c_dev = client; + + r = devm_gpio_request(&client->dev, phy->gpio_irq, "wake_up"); + if (r) { + pr_err("%s : gpio_request failed\n", __FILE__); + return -ENODEV; + } + + r = gpio_direction_input(phy->gpio_irq); + if (r) { + pr_err("%s : gpio_direction_input failed\n", __FILE__); + return -ENODEV; + } + + if (phy->gpio_ena != 0) { + r = devm_gpio_request(&client->dev, + phy->gpio_ena, "clf_enable"); + if (r) { + pr_err("%s : ena gpio_request failed\n", __FILE__); + return -ENODEV; + } + r = gpio_direction_output(phy->gpio_ena, 1); + + if (r) { + pr_err("%s : ena gpio_direction_output failed\n", + __FILE__); + return -ENODEV; + } + } + + phy->pending_skb = alloc_skb(ST21NFCA_HCI_LLC_MAX_SIZE * 2, GFP_KERNEL); + if (phy->pending_skb == NULL) + return -ENOMEM; + + phy->current_read_len = 0; + phy->crc_trials = 0; + return r; +} + +static int st21nfca_hci_i2c_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct st21nfca_i2c_phy *phy; + struct st21nfca_nfc_platform_data *pdata; + int r = 0; + + dev_dbg(&client->dev, "%s\n", __func__); + dev_dbg(&client->dev, "IRQ: %d\n", client->irq); + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + nfc_err(&client->dev, "Need I2C_FUNC_I2C\n"); + return -ENODEV; + } + + phy = devm_kzalloc(&client->dev, sizeof(struct st21nfca_i2c_phy), + GFP_KERNEL); + if (!phy) { + nfc_err(&client->dev, + "Cannot allocate memory for st21nfca i2c phy.\n"); + return -ENOMEM; + } + + phy->i2c_dev = client; + + i2c_set_clientdata(client, phy); + + pdata = client->dev.platform_data; + if (pdata == NULL) { + nfc_err(&client->dev, "No platform data\n"); + return -EINVAL; + } + + r = st21nfca_request_resources(phy, client); + if (r) { + nfc_err(&client->dev, "Cannot get platform resources\n"); + return r; + } + + st21nfca_hci_platform_init(phy); + r = devm_request_threaded_irq(&client->dev, client->irq, NULL, + st21nfca_hci_irq_thread_fn, + phy->irq_polarity | IRQF_ONESHOT, + ST21NFCA_HCI_DRIVER_NAME, phy); + if (r < 0) { + nfc_err(&client->dev, "Unable to register IRQ handler\n"); + return r; + } + + r = st21nfca_hci_probe(phy, &i2c_phy_ops, LLC_SHDLC_NAME, + ST21NFCA_FRAME_HEADROOM, ST21NFCA_FRAME_TAILROOM, + ST21NFCA_HCI_LLC_MAX_PAYLOAD, &phy->hdev); + + if (r < 0) + return r; + + return 0; +} + +static int st21nfca_hci_i2c_remove(struct i2c_client *client) +{ + struct st21nfca_i2c_phy *phy = i2c_get_clientdata(client); + + dev_dbg(&client->dev, "%s\n", __func__); 
+ + st21nfca_hci_remove(phy->hdev); + + if (phy->powered) + st21nfca_hci_i2c_disable(phy); + + return 0; +} + +static struct i2c_driver st21nfca_hci_i2c_driver = { + .driver = { + .name = ST21NFCA_HCI_I2C_DRIVER_NAME, + }, + .probe = st21nfca_hci_i2c_probe, + .id_table = st21nfca_hci_i2c_id_table, + .remove = st21nfca_hci_i2c_remove, +}; + +module_i2c_driver(st21nfca_hci_i2c_driver); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/nfc/st21nfca/st21nfca.c b/drivers/nfc/st21nfca/st21nfca.c new file mode 100644 index 000000000000..69213f37b7ba --- /dev/null +++ b/drivers/nfc/st21nfca/st21nfca.c @@ -0,0 +1,506 @@ +/* + * HCI based Driver for STMicroelectronics NFC Chip + * + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "st21nfca.h" +#include + +#define DRIVER_DESC "HCI NFC driver for ST21NFCA" + +#define FULL_VERSION_LEN 3 + +/* Proprietary gates, events, commands and registers */ + +/* Commands that apply to all RF readers */ +#define ST21NFCA_RF_READER_CMD_PRESENCE_CHECK 0x30 + +#define ST21NFCA_RF_READER_ISO15693_GATE 0x12 + +/* + * Reader gate for communication with contact-less cards using Type A + * protocol ISO14443-3 but not compliant with ISO14443-4 + */ +#define ST21NFCA_RF_READER_14443_3_A_GATE 0x15 +#define ST21NFCA_RF_READER_14443_3_A_UID 0x02 +#define ST21NFCA_RF_READER_14443_3_A_ATQA 0x03 +#define ST21NFCA_RF_READER_14443_3_A_SAK 0x04 + +#define ST21NFCA_DEVICE_MGNT_GATE 0x01 +#define ST21NFCA_DEVICE_MGNT_PIPE 0x02 +#define ST21NFCA_NFC_MODE 0x03 /* NFC_MODE parameter*/ + + +static DECLARE_BITMAP(dev_mask, ST21NFCA_NUM_DEVICES); + +static struct nfc_hci_gate st21nfca_gates[] = { + {NFC_HCI_ADMIN_GATE, NFC_HCI_INVALID_PIPE}, + {NFC_HCI_LOOPBACK_GATE, NFC_HCI_INVALID_PIPE}, + {NFC_HCI_ID_MGMT_GATE, NFC_HCI_INVALID_PIPE}, + {NFC_HCI_LINK_MGMT_GATE, NFC_HCI_INVALID_PIPE}, + {NFC_HCI_RF_READER_B_GATE, NFC_HCI_INVALID_PIPE}, + {NFC_HCI_RF_READER_A_GATE, NFC_HCI_INVALID_PIPE}, + {ST21NFCA_DEVICE_MGNT_GATE, ST21NFCA_DEVICE_MGNT_PIPE}, + {ST21NFCA_RF_READER_F_GATE, NFC_HCI_INVALID_PIPE}, + {ST21NFCA_RF_READER_14443_3_A_GATE, NFC_HCI_INVALID_PIPE}, +}; +/* Largest headroom needed for outgoing custom commands */ +#define ST21NFCA_CMDS_HEADROOM 7 + +static int st21nfca_hci_open(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + int r; + + mutex_lock(&info->info_lock); + + if (info->state != ST21NFCA_ST_COLD) { + r = -EBUSY; + goto out; + } + + r = info->phy_ops->enable(info->phy_id); + + if (r == 0) + info->state = ST21NFCA_ST_READY; + +out: + mutex_unlock(&info->info_lock); + return r; +} + +static void st21nfca_hci_close(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + mutex_lock(&info->info_lock); + + if (info->state == 
ST21NFCA_ST_COLD) + goto out; + + info->phy_ops->disable(info->phy_id); + info->state = ST21NFCA_ST_COLD; + +out: + mutex_unlock(&info->info_lock); +} + +static int st21nfca_hci_ready(struct nfc_hci_dev *hdev) +{ + struct sk_buff *skb; + + u8 param; + int r; + + param = NFC_HCI_UICC_HOST_ID; + r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE, + NFC_HCI_ADMIN_WHITELIST, ¶m, 1); + if (r < 0) + return r; + + /* Set NFC_MODE in device management gate to enable */ + r = nfc_hci_get_param(hdev, ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_NFC_MODE, &skb); + if (r < 0) + return r; + + if (skb->data[0] == 0) { + kfree_skb(skb); + param = 1; + + r = nfc_hci_set_param(hdev, ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_NFC_MODE, ¶m, 1); + if (r < 0) + return r; + } + + r = nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE, + NFC_HCI_EVT_END_OPERATION, NULL, 0); + if (r < 0) + return r; + + r = nfc_hci_get_param(hdev, NFC_HCI_ID_MGMT_GATE, + NFC_HCI_ID_MGMT_VERSION_SW, &skb); + if (r < 0) + return r; + + if (skb->len != FULL_VERSION_LEN) { + kfree_skb(skb); + return -EINVAL; + } + + print_hex_dump(KERN_DEBUG, "FULL VERSION SOFTWARE INFO: ", + DUMP_PREFIX_NONE, 16, 1, + skb->data, FULL_VERSION_LEN, false); + + kfree_skb(skb); + + return 0; +} + +static int st21nfca_hci_xmit(struct nfc_hci_dev *hdev, struct sk_buff *skb) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + return info->phy_ops->write(info->phy_id, skb); +} + +static int st21nfca_hci_start_poll(struct nfc_hci_dev *hdev, + u32 im_protocols, u32 tm_protocols) +{ + int r; + + pr_info(DRIVER_DESC ": %s protocols 0x%x 0x%x\n", + __func__, im_protocols, tm_protocols); + + r = nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE, + NFC_HCI_EVT_END_OPERATION, NULL, 0); + if (r < 0) + return r; + if (im_protocols) { + /* + * enable polling according to im_protocols & tm_protocols + * - CLOSE pipe according to im_protocols & tm_protocols + */ + if ((NFC_HCI_RF_READER_B_GATE & im_protocols) == 0) { + r = nfc_hci_disconnect_gate(hdev, + NFC_HCI_RF_READER_B_GATE); + if (r < 0) + return r; + } + + if ((NFC_HCI_RF_READER_A_GATE & im_protocols) == 0) { + r = nfc_hci_disconnect_gate(hdev, + NFC_HCI_RF_READER_A_GATE); + if (r < 0) + return r; + } + + if ((ST21NFCA_RF_READER_F_GATE & im_protocols) == 0) { + r = nfc_hci_disconnect_gate(hdev, + ST21NFCA_RF_READER_F_GATE); + if (r < 0) + return r; + } + + if ((ST21NFCA_RF_READER_14443_3_A_GATE & im_protocols) == 0) { + r = nfc_hci_disconnect_gate(hdev, + ST21NFCA_RF_READER_14443_3_A_GATE); + if (r < 0) + return r; + } + + if ((ST21NFCA_RF_READER_ISO15693_GATE & im_protocols) == 0) { + r = nfc_hci_disconnect_gate(hdev, + ST21NFCA_RF_READER_ISO15693_GATE); + if (r < 0) + return r; + } + + r = nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE, + NFC_HCI_EVT_READER_REQUESTED, NULL, 0); + if (r < 0) + nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE, + NFC_HCI_EVT_END_OPERATION, NULL, 0); + } + return r; +} + +static int st21nfca_get_iso14443_3_atqa(struct nfc_hci_dev *hdev, u16 *atqa) +{ + int r; + struct sk_buff *atqa_skb = NULL; + + r = nfc_hci_get_param(hdev, ST21NFCA_RF_READER_14443_3_A_GATE, + ST21NFCA_RF_READER_14443_3_A_ATQA, &atqa_skb); + if (r < 0) + goto exit; + + if (atqa_skb->len != 2) { + r = -EPROTO; + goto exit; + } + + *atqa = be16_to_cpu(*(u16 *) atqa_skb->data); + +exit: + kfree_skb(atqa_skb); + return r; +} + +static int st21nfca_get_iso14443_3_sak(struct nfc_hci_dev *hdev, u8 *sak) +{ + int r; + struct sk_buff *sak_skb = NULL; + + r = nfc_hci_get_param(hdev, ST21NFCA_RF_READER_14443_3_A_GATE, 
+ ST21NFCA_RF_READER_14443_3_A_SAK, &sak_skb); + if (r < 0) + goto exit; + + if (sak_skb->len != 1) { + r = -EPROTO; + goto exit; + } + + *sak = sak_skb->data[0]; + +exit: + kfree_skb(sak_skb); + return r; +} + +static int st21nfca_get_iso14443_3_uid(struct nfc_hci_dev *hdev, u8 *gate, + int *len) +{ + int r; + struct sk_buff *uid_skb = NULL; + + r = nfc_hci_get_param(hdev, ST21NFCA_RF_READER_14443_3_A_GATE, + ST21NFCA_RF_READER_14443_3_A_UID, &uid_skb); + if (r < 0) + goto exit; + + if (uid_skb->len == 0 || uid_skb->len > NFC_NFCID1_MAXSIZE) { + r = -EPROTO; + goto exit; + } + + gate = uid_skb->data; + *len = uid_skb->len; +exit: + kfree_skb(uid_skb); + return r; +} + +static int st21nfca_hci_target_from_gate(struct nfc_hci_dev *hdev, u8 gate, + struct nfc_target *target) +{ + int r, len; + u16 atqa; + u8 sak; + u8 uid[NFC_NFCID1_MAXSIZE]; + + switch (gate) { + case ST21NFCA_RF_READER_F_GATE: + target->supported_protocols = NFC_PROTO_FELICA_MASK; + break; + case ST21NFCA_RF_READER_14443_3_A_GATE: + /* ISO14443-3 type 1 or 2 tags */ + r = st21nfca_get_iso14443_3_atqa(hdev, &atqa); + if (r < 0) + return r; + if (atqa == 0x000c) { + target->supported_protocols = NFC_PROTO_JEWEL_MASK; + target->sens_res = 0x0c00; + } else { + r = st21nfca_get_iso14443_3_sak(hdev, &sak); + if (r < 0) + return r; + + r = st21nfca_get_iso14443_3_uid(hdev, uid, &len); + if (r < 0) + return r; + + target->supported_protocols = + nfc_hci_sak_to_protocol(sak); + if (target->supported_protocols == 0xffffffff) + return -EPROTO; + + target->sens_res = atqa; + target->sel_res = sak; + memcpy(target->nfcid1, uid, len); + target->nfcid1_len = len; + } + + break; + default: + return -EPROTO; + } + + return 0; +} + +/* + * Returns: + * <= 0: driver handled the data exchange + * 1: driver doesn't especially handle, please do standard processing + */ +static int st21nfca_hci_im_transceive(struct nfc_hci_dev *hdev, + struct nfc_target *target, + struct sk_buff *skb, + data_exchange_cb_t cb, void *cb_context) +{ + pr_info(DRIVER_DESC ": %s for gate=%d len=%d\n", __func__, + target->hci_reader_gate, skb->len); + + switch (target->hci_reader_gate) { + case ST21NFCA_RF_READER_F_GATE: + *skb_push(skb, 1) = 0x1a; + return nfc_hci_send_cmd_async(hdev, target->hci_reader_gate, + ST21NFCA_WR_XCHG_DATA, skb->data, + skb->len, cb, cb_context); + case ST21NFCA_RF_READER_14443_3_A_GATE: + *skb_push(skb, 1) = 0x1a; /* CTR, see spec:10.2.2.1 */ + + return nfc_hci_send_cmd_async(hdev, target->hci_reader_gate, + ST21NFCA_WR_XCHG_DATA, skb->data, + skb->len, cb, cb_context); + default: + return 1; + } +} + +static int st21nfca_hci_check_presence(struct nfc_hci_dev *hdev, + struct nfc_target *target) +{ + u8 fwi = 0x11; + switch (target->hci_reader_gate) { + case NFC_HCI_RF_READER_A_GATE: + case NFC_HCI_RF_READER_B_GATE: + /* + * PRESENCE_CHECK on those gates is available + * However, the answer to this command is taking 3 * fwi + * if the card is no present. + * Instead, we send an empty I-Frame with a very short + * configurable fwi ~604µs. 
+ */ + return nfc_hci_send_cmd(hdev, target->hci_reader_gate, + ST21NFCA_WR_XCHG_DATA, &fwi, 1, NULL); + case ST21NFCA_RF_READER_14443_3_A_GATE: + return nfc_hci_send_cmd(hdev, target->hci_reader_gate, + ST21NFCA_RF_READER_CMD_PRESENCE_CHECK, + NULL, 0, NULL); + default: + return -EOPNOTSUPP; + } +} + +static struct nfc_hci_ops st21nfca_hci_ops = { + .open = st21nfca_hci_open, + .close = st21nfca_hci_close, + .hci_ready = st21nfca_hci_ready, + .xmit = st21nfca_hci_xmit, + .start_poll = st21nfca_hci_start_poll, + .target_from_gate = st21nfca_hci_target_from_gate, + .im_transceive = st21nfca_hci_im_transceive, + .check_presence = st21nfca_hci_check_presence, +}; + +int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, + char *llc_name, int phy_headroom, int phy_tailroom, + int phy_payload, struct nfc_hci_dev **hdev) +{ + struct st21nfca_hci_info *info; + int r = 0; + int dev_num; + u32 protocols; + struct nfc_hci_init_data init_data; + unsigned long quirks = 0; + + info = kzalloc(sizeof(struct st21nfca_hci_info), GFP_KERNEL); + if (!info) { + r = -ENOMEM; + goto err_alloc_hdev; + } + + info->phy_ops = phy_ops; + info->phy_id = phy_id; + info->state = ST21NFCA_ST_COLD; + mutex_init(&info->info_lock); + + init_data.gate_count = ARRAY_SIZE(st21nfca_gates); + + memcpy(init_data.gates, st21nfca_gates, sizeof(st21nfca_gates)); + + /* + * Session id must include the driver name + i2c bus addr + * persistent info to discriminate 2 identical chips + */ + dev_num = find_first_zero_bit(dev_mask, ST21NFCA_NUM_DEVICES); + if (dev_num >= ST21NFCA_NUM_DEVICES) + goto err_alloc_hdev; + + scnprintf(init_data.session_id, sizeof(init_data.session_id), "%s%2x", + "ST21AH", dev_num); + + protocols = NFC_PROTO_JEWEL_MASK | + NFC_PROTO_MIFARE_MASK | + NFC_PROTO_FELICA_MASK | + NFC_PROTO_ISO14443_MASK | + NFC_PROTO_ISO14443_B_MASK; + + set_bit(NFC_HCI_QUIRK_SHORT_CLEAR, &quirks); + + info->hdev = + nfc_hci_allocate_device(&st21nfca_hci_ops, &init_data, quirks, + protocols, llc_name, + phy_headroom + ST21NFCA_CMDS_HEADROOM, + phy_tailroom, phy_payload); + + if (!info->hdev) { + pr_err("Cannot allocate nfc hdev.\n"); + r = -ENOMEM; + goto err_alloc_hdev; + } + + nfc_hci_set_clientdata(info->hdev, info); + + r = nfc_hci_register_device(info->hdev); + if (r) + goto err_regdev; + + *hdev = info->hdev; + + return 0; + +err_regdev: + nfc_hci_free_device(info->hdev); + +err_alloc_hdev: + kfree(info); + + return r; +} +EXPORT_SYMBOL(st21nfca_hci_probe); + +void st21nfca_hci_remove(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + nfc_hci_unregister_device(hdev); + nfc_hci_free_device(hdev); + kfree(info); +} +EXPORT_SYMBOL(st21nfca_hci_remove); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/nfc/st21nfca/st21nfca.h b/drivers/nfc/st21nfca/st21nfca.h new file mode 100644 index 000000000000..334cd90bcc8c --- /dev/null +++ b/drivers/nfc/st21nfca/st21nfca.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef __LOCAL_ST21NFCA_H_ +#define __LOCAL_ST21NFCA_H_ + +#include + +#define HCI_MODE 0 + +/* framing in HCI mode */ +#define ST21NFCA_SOF_EOF_LEN 2 + +/* Almost every time value is 0 */ +#define ST21NFCA_HCI_LLC_LEN 1 + +/* Size in worst case : + * In normal case CRC len = 2 but byte stuffing + * may appear in case one CRC byte = ST21NFCA_SOF_EOF + */ +#define ST21NFCA_HCI_LLC_CRC 4 + +#define ST21NFCA_HCI_LLC_LEN_CRC (ST21NFCA_SOF_EOF_LEN + \ + ST21NFCA_HCI_LLC_LEN + \ + ST21NFCA_HCI_LLC_CRC) +#define ST21NFCA_HCI_LLC_MIN_SIZE (1 + ST21NFCA_HCI_LLC_LEN_CRC) + +/* Worst case when adding byte stuffing between each byte */ +#define ST21NFCA_HCI_LLC_MAX_PAYLOAD 29 +#define ST21NFCA_HCI_LLC_MAX_SIZE (ST21NFCA_HCI_LLC_LEN_CRC + 1 + \ + ST21NFCA_HCI_LLC_MAX_PAYLOAD) + +#define DRIVER_DESC "HCI NFC driver for ST21NFCA" + +#define ST21NFCA_HCI_MODE 0 + +#define ST21NFCA_NUM_DEVICES 256 + +int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, + char *llc_name, int phy_headroom, int phy_tailroom, + int phy_payload, struct nfc_hci_dev **hdev); +void st21nfca_hci_remove(struct nfc_hci_dev *hdev); + +enum st21nfca_state { + ST21NFCA_ST_COLD, + ST21NFCA_ST_READY, +}; + +struct st21nfca_hci_info { + struct nfc_phy_ops *phy_ops; + void *phy_id; + + struct nfc_hci_dev *hdev; + + enum st21nfca_state state; + + struct mutex info_lock; + + int async_cb_type; + data_exchange_cb_t async_cb; + void *async_cb_context; + +} __packed; + +/* Reader RF commands */ +#define ST21NFCA_WR_XCHG_DATA 0x10 + +#define ST21NFCA_RF_READER_F_GATE 0x14 +#define ST21NFCA_RF_READER_F_DATARATE 0x01 +#define ST21NFCA_RF_READER_F_DATARATE_106 0x01 +#define ST21NFCA_RF_READER_F_DATARATE_212 0x02 +#define ST21NFCA_RF_READER_F_DATARATE_424 0x04 + +#endif /* __LOCAL_ST21NFCA_H_ */ diff --git a/include/linux/platform_data/st21nfca.h b/include/linux/platform_data/st21nfca.h new file mode 100644 index 000000000000..1730312398ff --- /dev/null +++ b/include/linux/platform_data/st21nfca.h @@ -0,0 +1,32 @@ +/* + * Driver include for the ST21NFCA NFC chip. + * + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _ST21NFCA_HCI_H_ +#define _ST21NFCA_HCI_H_ + +#include + +#define ST21NFCA_HCI_DRIVER_NAME "st21nfca_hci" + +struct st21nfca_nfc_platform_data { + unsigned int gpio_irq; + unsigned int gpio_ena; + unsigned int irq_polarity; +}; + +#endif /* _ST21NFCA_HCI_H_ */ -- cgit v1.2.3-71-gd317 From fa372a51cb5f93800f711473e5a36e0e0c9a8f00 Mon Sep 17 00:00:00 2001 From: Markus Mayer Date: Tue, 8 Apr 2014 15:19:43 -0700 Subject: mmc: Delay the card_event callback into the mmc_rescan worker This change removes the callback from atomic context which it doesn't need to be in, and puts it in line with the debounced rescan. 
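The resulting split is the usual deferral pattern: the card-detect IRQ handler only records the event, and the sleepable mmc_rescan() worker invokes the callback. A simplified sketch of the two halves after this patch (elided to the relevant lines; the full hunks follow):

static irqreturn_t mmc_gpio_cd_irqt(int irq, void *dev_id)
{
        struct mmc_host *host = dev_id;

        /* atomic context: just flag the event and kick the worker */
        host->trigger_card_event = true;
        mmc_detect_change(host, msecs_to_jiffies(200));
        return IRQ_HANDLED;
}

void mmc_rescan(struct work_struct *work)
{
        struct mmc_host *host =
                container_of(work, struct mmc_host, detect.work);

        /* process context: safe to run the host's card_event callback */
        if (host->trigger_card_event && host->ops->card_event) {
                host->ops->card_event(host);
                host->trigger_card_event = false;
        }
        /* ... rescan proper continues ... */
}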
This code is based on these e-mail threads with Christian Daudt: https://lkml.org/lkml/2013/8/19/539 https://lkml.org/lkml/2014/3/19/79 Signed-off-by: Markus Mayer Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/core.c | 5 +++++ drivers/mmc/core/slot-gpio.c | 4 +--- include/linux/mmc/host.h | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index acbc3f2aaaf9..f396d1bb4ac4 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2403,6 +2403,11 @@ void mmc_rescan(struct work_struct *work) container_of(work, struct mmc_host, detect.work); int i; + if (host->trigger_card_event && host->ops->card_event) { + host->ops->card_event(host); + host->trigger_card_event = false; + } + if (host->rescan_disable) return; diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index f7650b899e3d..5f89cb83d5f0 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -32,9 +32,7 @@ static irqreturn_t mmc_gpio_cd_irqt(int irq, void *dev_id) /* Schedule a card detection after a debounce timeout */ struct mmc_host *host = dev_id; - if (host->ops->card_event) - host->ops->card_event(host); - + host->trigger_card_event = true; mmc_detect_change(host, msecs_to_jiffies(200)); return IRQ_HANDLED; } diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 35354207e71f..0cf705c83998 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -319,6 +319,8 @@ struct mmc_host { int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ + bool trigger_card_event; /* card_event necessary */ + struct mmc_card *card; /* device attached to this host */ wait_queue_head_t wq; -- cgit v1.2.3-71-gd317 From 297d40560bc8f474adbb43178e3118321fa702ea Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 4 Apr 2014 22:42:48 -0300 Subject: mmc: card.h: Use NULL instead of 0 for END_FIXUP Fix the following sparse warnings: drivers/mmc/card/block.c:2421:9: warning: Using plain integer as NULL pointer drivers/mmc/core/quirks.c:69:9: warning: Using plain integer as NULL pointer Signed-off-by: Fabio Estevam Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- include/linux/mmc/card.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index b73027298b3a..aa7e57f60fb2 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -353,7 +353,7 @@ struct mmc_fixup { #define CID_OEMID_ANY ((unsigned short) -1) #define CID_NAME_ANY (NULL) -#define END_FIXUP { 0 } +#define END_FIXUP { NULL } #define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end, \ _cis_vendor, _cis_device, \ -- cgit v1.2.3-71-gd317 From 3de0b592394d17b2c41a261a6a493a521213f299 Mon Sep 17 00:00:00 2001 From: Venkata Duvvuru Date: Mon, 21 Apr 2014 15:37:59 +0530 Subject: ethtool: Support for configurable RSS hash key This ethtool patch primarily copies the ioctl command data structures from/to user space and invokes the driver hook. Signed-off-by: Venkat Duvvuru Signed-off-by: David S.
Miller --- include/linux/ethtool.h | 13 +++ include/uapi/linux/ethtool.h | 32 +++++++ net/core/ethtool.c | 221 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 252 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 0a114d05f68d..212f537fc686 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -154,13 +154,23 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * @reset: Reset (part of) the device, as specified by a bitmask of * flags from &enum ethtool_reset_flags. Returns a negative * error code or zero. + * @get_rxfh_key_size: Get the size of the RX flow hash key. + * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table. * Returns zero if not supported for this specific device. * @get_rxfh_indir: Get the contents of the RX flow hash indirection table. * Will not be called if @get_rxfh_indir_size returns zero. + * @get_rxfh: Get the contents of the RX flow hash indirection table and hash + * key. + * Will not be called if @get_rxfh_indir_size and @get_rxfh_key_size + * returns zero. * Returns a negative error code or zero. * @set_rxfh_indir: Set the contents of the RX flow hash indirection table. * Will not be called if @get_rxfh_indir_size returns zero. + * @set_rxfh: Set the contents of the RX flow hash indirection table and + * hash key. + * Will not be called if @get_rxfh_indir_size and @get_rxfh_key_size + * returns zero. * Returns a negative error code or zero. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or @@ -232,7 +242,10 @@ struct ethtool_ops { int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); int (*flash_device)(struct net_device *, struct ethtool_flash *); int (*reset)(struct net_device *, u32 *); + u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); + int (*get_rxfh)(struct net_device *, u32 *, u8 *); + int (*set_rxfh)(struct net_device *, u32 *, u8 *); int (*get_rxfh_indir)(struct net_device *, u32 *); int (*set_rxfh_indir)(struct net_device *, const u32 *); void (*get_channels)(struct net_device *, struct ethtool_channels *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index fd161e91b6d7..d47d31d6fa0e 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -846,6 +846,35 @@ struct ethtool_rxfh_indir { __u32 ring_index[0]; }; +/** + * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. + * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH + * @rss_context: RSS context identifier. + * @indir_size: On entry, the array size of the user buffer, which may be zero. + * On return from %ETHTOOL_GRSSH, the array size of the hardware + * indirection table. + * @key_size: On entry, the array size of the user buffer in bytes, + * which may be zero. + * On return from %ETHTOOL_GRSSH, the size of the RSS hash key. + * @rsvd: Reserved for future extensions. + * @rss_config: RX ring/queue index for each hash value i.e., indirection table + * of size @indir_size followed by hash key of size @key_size. + * + * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the + * size should be returned. 
For %ETHTOOL_SRSSH, a @indir_size of 0xDEADBEEF + * means that indir table setting is not requested and a @indir_size of zero + * means the indir table should be reset to default values. This last feature + * is not supported by the original implementations. + */ +struct ethtool_rxfh { + __u32 cmd; + __u32 rss_context; + __u32 indir_size; + __u32 key_size; + __u32 rsvd[2]; + __u32 rss_config[0]; +}; + /** * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW @@ -1118,6 +1147,9 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ #define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ +#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ +#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ + /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET #define SPARC_ETH_SSET ETHTOOL_SSET diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 640ba0e5831c..1d72786ef866 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -557,6 +557,23 @@ err_out: return ret; } +static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, + struct ethtool_rxnfc *rx_rings, + u32 size) +{ + int ret = 0, i; + + if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) + ret = -EFAULT; + + /* Validate ring indices */ + for (i = 0; i < size; i++) { + if (indir[i] >= rx_rings->data) + ret = -EINVAL; + } + return ret; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -613,6 +630,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, u32 *indir; const struct ethtool_ops *ops = dev->ethtool_ops; int ret; + u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir || !ops->get_rxnfc) @@ -643,28 +661,196 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, for (i = 0; i < dev_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { - if (copy_from_user(indir, - useraddr + - offsetof(struct ethtool_rxfh_indir, - ring_index[0]), - dev_size * sizeof(indir[0]))) { + ret = ethtool_copy_validate_indir(indir, + useraddr + ringidx_offset, + &rx_rings, + dev_size); + if (ret) + goto out; + } + + ret = ops->set_rxfh_indir(dev, indir); + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + u32 user_indir_size = 0, user_key_size = 0; + u32 dev_indir_size = 0, dev_key_size = 0; + u32 total_size; + u32 indir_offset, indir_bytes; + u32 key_offset; + u32 *indir = NULL; + u8 *hkey = NULL; + u8 *rss_config; + + if (!(dev->ethtool_ops->get_rxfh_indir_size || + dev->ethtool_ops->get_rxfh_key_size) || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + + if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + + indir_offset = offsetof(struct ethtool_rxfh, indir_size); + + if (copy_from_user(&user_indir_size, + useraddr + indir_offset, + sizeof(user_indir_size))) + return -EFAULT; + + if (copy_to_user(useraddr + indir_offset, + &dev_indir_size, sizeof(dev_indir_size))) + return -EFAULT; + + if (ops->get_rxfh_key_size) + dev_key_size = ops->get_rxfh_key_size(dev); + + if ((dev_key_size + dev_indir_size) == 0) + return -EOPNOTSUPP; + + key_offset = offsetof(struct ethtool_rxfh, 
key_size); + + if (copy_from_user(&user_key_size, + useraddr + key_offset, + sizeof(user_key_size))) + return -EFAULT; + + if (copy_to_user(useraddr + key_offset, + &dev_key_size, sizeof(dev_key_size))) + return -EFAULT; + + /* If the user buffer size is 0, this is just a query for the + * device table size and key size. Otherwise, if the User size is + * not equal to device table size or key size it's an error. + */ + if (!user_indir_size && !user_key_size) + return 0; + + if ((user_indir_size && (user_indir_size != dev_indir_size)) || + (user_key_size && (user_key_size != dev_key_size))) + return -EINVAL; + + indir_bytes = user_indir_size * sizeof(indir[0]); + total_size = indir_bytes + user_key_size; + rss_config = kzalloc(total_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + if (user_indir_size) + indir = (u32 *)rss_config; + + if (user_key_size) + hkey = rss_config + indir_bytes; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); + if (!ret) { + if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) ret = -EFAULT; + } + + kfree(rss_config); + + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings; + u32 user_indir_size = 0, dev_indir_size = 0, i; + u32 user_key_size = 0, dev_key_size = 0; + u32 *indir = NULL, indir_bytes = 0; + u8 *hkey = NULL; + u8 *rss_config; + u32 indir_offset, key_offset; + u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + + if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || + !ops->get_rxnfc || !ops->set_rxfh) + return -EOPNOTSUPP; + + if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + + indir_offset = offsetof(struct ethtool_rxfh, indir_size); + if (copy_from_user(&user_indir_size, + useraddr + indir_offset, + sizeof(user_indir_size))) + return -EFAULT; + + if (ops->get_rxfh_key_size) + dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); + + if ((dev_key_size + dev_indir_size) == 0) + return -EOPNOTSUPP; + + key_offset = offsetof(struct ethtool_rxfh, key_size); + if (copy_from_user(&user_key_size, + useraddr + key_offset, + sizeof(user_key_size))) + return -EFAULT; + + /* If either indir or hash key is valid, proceed further. + */ + if ((user_indir_size && ((user_indir_size != 0xDEADBEEF) && + user_indir_size != dev_indir_size)) || + (user_key_size && (user_key_size != dev_key_size))) + return -EINVAL; + + if (user_indir_size != 0xDEADBEEF) + indir_bytes = dev_indir_size * sizeof(indir[0]); + + rss_config = kzalloc(indir_bytes + user_key_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret) + goto out; + + /* user_indir_size == 0 means reset the indir table to default. + * user_indir_size == 0xDEADBEEF means indir setting is not requested. 
+ */ + if (user_indir_size && user_indir_size != 0xDEADBEEF) { + indir = (u32 *)rss_config; + ret = ethtool_copy_validate_indir(indir, + useraddr + rss_cfg_offset, + &rx_rings, + user_indir_size); + if (ret) goto out; - } + } else if (user_indir_size == 0) { + indir = (u32 *)rss_config; + for (i = 0; i < dev_indir_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } - /* Validate ring indices */ - for (i = 0; i < dev_size; i++) { - if (indir[i] >= rx_rings.data) { - ret = -EINVAL; - goto out; - } + if (user_key_size) { + hkey = rss_config + indir_bytes; + if (copy_from_user(hkey, + useraddr + rss_cfg_offset + indir_bytes, + user_key_size)) { + ret = -EFAULT; + goto out; } } - ret = ops->set_rxfh_indir(dev, indir); + ret = ops->set_rxfh(dev, indir, hkey); out: - kfree(indir); + kfree(rss_config); return ret; } @@ -1491,6 +1677,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: case ETHTOOL_GRXFHINDIR: + case ETHTOOL_GRSSH: case ETHTOOL_GFEATURES: case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: @@ -1628,6 +1815,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; + case ETHTOOL_GRSSH: + rc = ethtool_get_rxfh(dev, useraddr); + break; + case ETHTOOL_SRSSH: + rc = ethtool_set_rxfh(dev, useraddr); + break; case ETHTOOL_GFEATURES: rc = ethtool_get_features(dev, useraddr); break; -- cgit v1.2.3-71-gd317 From 4cd3675ebf74d7f559038ded6aa8088e4099a83d Mon Sep 17 00:00:00 2001 From: Chema Gonzalez Date: Mon, 21 Apr 2014 09:21:24 -0700 Subject: filter: added BPF random opcode Added a new ancillary load (bpf call in eBPF parlance) that produces a 32-bit random number. We are implementing it as an ancillary load (instead of an ISA opcode) because (a) it is simpler, (b) allows easy JITing, and (c) seems more in line with generic ISAs that do not have "get a random number" as an instruction, but as an OS call. The main use for this ancillary load is to perform random packet sampling. Signed-off-by: Chema Gonzalez Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 13 +++++++++++++ include/linux/filter.h | 1 + include/uapi/linux/filter.h | 3 ++- net/core/filter.c | 12 ++++++++++++ tools/net/bpf_exp.l | 1 + tools/net/bpf_exp.y | 11 ++++++++++- 6 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 81f940f4e884..82e1cb0b3da8 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -281,6 +281,7 @@ Possible BPF extensions are shown in the following table: cpu raw_smp_processor_id() vlan_tci vlan_tx_tag_get(skb) vlan_pr vlan_tx_tag_present(skb) + rand prandom_u32() These extensions can also be prefixed with '#'.
Examples for low-level BPF: @@ -308,6 +309,18 @@ Examples for low-level BPF: ret #-1 drop: ret #0 +** icmp random packet sampling, 1 in 4 + ldh [12] + jne #0x800, drop + ldb [23] + jneq #1, drop + # get a random uint32 number + ld rand + mod #4 + jneq #1, drop + ret #-1 + drop: ret #0 + ** SECCOMP filter example: ld [4] /* offsetof(struct seccomp_data, arch) */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 024fd03e5d18..759abf78dd61 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -223,6 +223,7 @@ enum { BPF_S_ANC_VLAN_TAG, BPF_S_ANC_VLAN_TAG_PRESENT, BPF_S_ANC_PAY_OFFSET, + BPF_S_ANC_RANDOM, }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h index 8eb9ccaa5b48..253b4d42cf2b 100644 --- a/include/uapi/linux/filter.h +++ b/include/uapi/linux/filter.h @@ -130,7 +130,8 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ #define SKF_AD_VLAN_TAG 44 #define SKF_AD_VLAN_TAG_PRESENT 48 #define SKF_AD_PAY_OFFSET 52 -#define SKF_AD_MAX 56 +#define SKF_AD_RANDOM 56 +#define SKF_AD_MAX 60 #define SKF_NET_OFF (-0x100000) #define SKF_LL_OFF (-0x200000) diff --git a/net/core/filter.c b/net/core/filter.c index cd58614660cf..78a636e60a0b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -643,6 +643,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) return raw_smp_processor_id(); } +/* note that this only generates 32-bit random numbers */ +static u64 __get_random_u32(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +{ + return (u64)prandom_u32(); +} + /* Register mappings for user programs. */ #define A_REG 0 #define X_REG 7 @@ -779,6 +785,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: + case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = ctx */ insn->code = BPF_ALU64 | BPF_MOV | BPF_X; insn->a_reg = ARG1_REG; @@ -812,6 +819,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_CPU: insn->imm = __get_raw_cpu_id - __bpf_call_base; break; + case SKF_AD_OFF + SKF_AD_RANDOM: + insn->imm = __get_random_u32 - __bpf_call_base; + break; } break; @@ -1362,6 +1372,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) ANCILLARY(VLAN_TAG); ANCILLARY(VLAN_TAG_PRESENT); ANCILLARY(PAY_OFFSET); + ANCILLARY(RANDOM); } /* ancillary operation unknown or unsupported */ @@ -1746,6 +1757,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_RANDOM] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l index bf7be77ddd62..833a96611da6 100644 --- a/tools/net/bpf_exp.l +++ b/tools/net/bpf_exp.l @@ -92,6 +92,7 @@ extern void yyerror(const char *str); "#"?("cpu") { return K_CPU; } "#"?("vlan_tci") { return K_VLANT; } "#"?("vlan_pr") { return K_VLANP; } +"#"?("rand") { return K_RAND; } ":" { return ':'; } "," { return ','; } diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y index d15efc989ef5..e6306c51c26f 100644 --- a/tools/net/bpf_exp.y +++ b/tools/net/bpf_exp.y @@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type); %token OP_LDXI %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE 
K_HATYPE -%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF +%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%' @@ -164,6 +164,9 @@ ldb | OP_LDB K_POFF { bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_PAY_OFFSET); } + | OP_LDB K_RAND { + bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_RANDOM); } ; ldh @@ -212,6 +215,9 @@ ldh | OP_LDH K_POFF { bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_PAY_OFFSET); } + | OP_LDH K_RAND { + bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_RANDOM); } ; ldi @@ -265,6 +271,9 @@ ld | OP_LD K_POFF { bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_PAY_OFFSET); } + | OP_LD K_RAND { + bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0, + SKF_AD_OFF + SKF_AD_RANDOM); } | OP_LD 'M' '[' number ']' { bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); } | OP_LD '[' 'x' '+' number ']' { -- cgit v1.2.3-71-gd317 From 4f520900522fd596e336c07e9aafd5b7a9564235 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Apr 2014 21:31:54 -0400 Subject: netlink: have netlink per-protocol bind function return an error code. Have the netlink per-protocol optional bind function return an int error code rather than void to signal a failure. This will enable netlink protocols to perform extra checks including capabilities and permissions verifications when updating memberships in multicast groups. In netlink_bind() and netlink_setsockopt() the call to the per-protocol bind function was moved above the multicast group update to prevent any access to the multicast socket groups before checking with the per-protocol bind function. This will enable the per-protocol bind function to be used to check permissions which could be denied before making them available, and to avoid the messy job of undoing the addition should the per-protocol bind function fail. The netfilter subsystem seems to be the only one currently using the per-protocol bind function. Signed-off-by: Richard Guy Briggs Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 3 ++- net/netfilter/nfnetlink.c | 3 ++- net/netlink/af_netlink.c | 68 +++++++++++++++++++++++++++++++++-------------- net/netlink/af_netlink.h | 6 +++-- 4 files changed, 56 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index aad8eeaf416d..5146ce066498 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -45,7 +45,8 @@ struct netlink_kernel_cfg { unsigned int flags; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; - void (*bind)(int group); + int (*bind)(int group); + void (*unbind)(int group); bool (*compare)(struct net *net, struct sock *sk); }; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 0df800a454ec..6e42dcfad40a 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -400,7 +400,7 @@ static void nfnetlink_rcv(struct sk_buff *skb) } #ifdef CONFIG_MODULES -static void nfnetlink_bind(int group) +static int nfnetlink_bind(int group) { const struct nfnetlink_subsystem *ss; int type = nfnl_group2type[group]; @@ -410,6 +410,7 @@ static void nfnetlink_bind(int group) rcu_read_unlock(); if (!ss) request_module("nfnetlink-subsys-%d", type); + return 0; } #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 894cda0206bb..7e8d229bc010 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1206,7 +1206,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; - void (*bind)(int group); + int (*bind)(int group); + void (*unbind)(int group); int err = 0; sock->state = SS_UNCONNECTED; @@ -1232,6 +1233,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; + unbind = nl_table[protocol].unbind; netlink_unlock_table(); if (err < 0) @@ -1248,6 +1250,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; + nlk->netlink_unbind = unbind; out: return err; @@ -1301,6 +1304,7 @@ static int netlink_release(struct socket *sock) kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } @@ -1411,6 +1415,19 @@ static int netlink_realloc_groups(struct sock *sk) return err; } +static void netlink_unbind(int group, long unsigned int groups, + struct netlink_sock *nlk) +{ + int undo; + + if (!nlk->netlink_unbind) + return; + + for (undo = 0; undo < group; undo++) + if (test_bit(group, &groups)) + nlk->netlink_unbind(undo); +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -1419,6 +1436,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; + long unsigned int groups = nladdr->nl_groups; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1427,7 +1445,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; /* Only superuser is allowed to listen multicasts */ - if (nladdr->nl_groups) { + if (groups) { if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); @@ 
-1435,37 +1453,45 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return err; } - if (nlk->portid) { + if (nlk->portid) if (nladdr->nl_pid != nlk->portid) return -EINVAL; - } else { + + if (nlk->netlink_bind && groups) { + int group; + + for (group = 0; group < nlk->ngroups; group++) { + if (!test_bit(group, &groups)) + continue; + err = nlk->netlink_bind(group); + if (!err) + continue; + netlink_unbind(group, groups, nlk); + return err; + } + } + + if (!nlk->portid) { err = nladdr->nl_pid ? netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); - if (err) + if (err) { + netlink_unbind(nlk->ngroups - 1, groups, nlk); return err; + } } - if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) return 0; netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + - hweight32(nladdr->nl_groups) - + hweight32(groups) - hweight32(nlk->groups[0])); - nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); - if (nlk->netlink_bind && nlk->groups[0]) { - int i; - - for (i = 0; i < nlk->ngroups; i++) { - if (test_bit(i, nlk->groups)) - nlk->netlink_bind(i); - } - } - return 0; } @@ -2103,14 +2129,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; + if (nlk->netlink_bind) { + err = nlk->netlink_bind(val); + if (err) + return err; + } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); - if (nlk->netlink_bind) - nlk->netlink_bind(val); - err = 0; break; } diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index ed13a790b00e..0b59d441f5b6 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -38,7 +38,8 @@ struct netlink_sock { struct mutex *cb_mutex; struct mutex cb_def_mutex; void (*netlink_rcv)(struct sk_buff *skb); - void (*netlink_bind)(int group); + int (*netlink_bind)(int group); + void (*netlink_unbind)(int group); struct module *module; #ifdef CONFIG_NETLINK_MMAP struct mutex pg_vec_lock; @@ -74,7 +75,8 @@ struct netlink_table { unsigned int groups; struct mutex *cb_mutex; struct module *module; - void (*bind)(int group); + int (*bind)(int group); + void (*unbind)(int group); bool (*compare)(struct net *net, struct sock *sock); int registered; }; -- cgit v1.2.3-71-gd317 From f392e51cd6ae6f6ee5b9b6d611cdc282b4c1711e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: update cgroup->subsys_mask to ->child_subsys_mask and restore cgroup_root->subsys_mask 944196278d3d ("cgroup: move ->subsys_mask from cgroupfs_root to cgroup") moved ->subsys_mask from cgroup_root to cgroup to prepare for the unified hierarhcy; however, it turns out that carrying the subsys_mask of the children in the parent, instead of itself, is a lot more natural. This patch restores cgroup_root->subsys_mask and morphs cgroup->subsys_mask into cgroup->child_subsys_mask. * Uses of root->cgrp.subsys_mask are restored to root->subsys_mask. * Remove automatic setting and clearing of cgrp->subsys_mask and instead just inherit ->child_subsys_mask from the parent during cgroup creation. Note that this doesn't affect any current behaviors. * Undo __kill_css() separation. 
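To make the new split concrete, a sketch in terms of the names above (not a verbatim excerpt from the diff): hierarchy-wide binding stays on the root, while per-child enabling lives on each cgroup.

	/* which controllers are bound to the hierarchy as a whole */
	if (root->subsys_mask & (1 << ssid))
		/* subsystem ssid is attached to this hierarchy */;

	/* which controllers a new child enables; inherited at creation */
	if (parent->child_subsys_mask & (1 << ssid))
		create_css(cgrp, ss);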
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 7 ++++-- kernel/cgroup.c | 64 +++++++++++++++++++++----------------------------- 2 files changed, 32 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c2515851c1aa..1b5b2fe1b228 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -173,8 +173,8 @@ struct cgroup { */ u64 serial_nr; - /* The bitmask of subsystems attached to this cgroup */ - unsigned long subsys_mask; + /* the bitmask of subsystems enabled on the child cgroups */ + unsigned long child_subsys_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -282,6 +282,9 @@ enum { struct cgroup_root { struct kernfs_root *kf_root; + /* The bitmask of subsystems attached to this hierarchy */ + unsigned long subsys_mask; + /* Unique id for this hierarchy. */ int hierarchy_id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a6894272353b..f944619077f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -529,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->cgrp.subsys_mask & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -742,7 +742,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1050,8 +1050,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; - src_root->cgrp.subsys_mask &= ~(1 << ssid); - dst_root->cgrp.subsys_mask |= 1 << ssid; + src_root->subsys_mask &= ~(1 << ssid); + src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + + dst_root->subsys_mask |= 1 << ssid; + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1069,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1273,12 +1276,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; - removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1535,7 +1538,7 @@ retry: * subsystems) then they must match. 
*/ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->cgrp.subsys_mask)) { + (opts.subsys_mask != root->subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3658,8 +3661,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); - cgrp->subsys_mask |= 1 << ss->id; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -3780,13 +3781,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->cgrp.subsys_mask & (1 << ssid)) { + if (parent->child_subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; } } + cgrp->child_subsys_mask = parent->child_subsys_mask; + kernfs_activate(kn); mutex_unlock(&cgroup_mutex); @@ -3882,7 +3885,16 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void __kill_css(struct cgroup_subsys_state *css) +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); @@ -3911,28 +3923,6 @@ static void __kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. - */ -static void kill_css(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - - lockdep_assert_held(&cgroup_tree_mutex); - - /* if already killed, noop */ - if (cgrp->subsys_mask & (1 << css->ss->id)) { - cgrp->subsys_mask &= ~(1 << css->ss->id); - __kill_css(css); - } -} - /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4145,7 +4135,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4302,7 +4292,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit v1.2.3-71-gd317 From 2d8f243a5e6efa57fb7c46fe83fafa45b33d0ec2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: implement cgroup->e_csets[] On the default unified hierarchy, a cgroup may be associated with csses of its ancestors, which means that a css of a given cgroup may be associated with css_sets of descendant cgroups. 
This means that we can't walk all tasks associated with a css by iterating the css_sets associated with the cgroup as there are css_sets which are pointing to the css but linked on the descendants. This patch adds per-subsystem list heads cgroup->e_csets[]. Any css_set which is pointing to a css is linked to css->cgroup->e_csets[$SUBSYS_ID] through css_set->e_cset_node[$SUBSYS_ID]. The lists are protected by css_set_rwsem and will allow us to walk all css_sets associated with a given css so that we can find out all associated tasks. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 18 ++++++++++++++++++ kernel/cgroup.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1b5b2fe1b228..33a0043ef454 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -187,6 +187,15 @@ struct cgroup { */ struct list_head cset_links; + /* + * On the default hierarchy, a css_set for a cgroup with some + * susbsys disabled will point to css's which are associated with + * the closest ancestor which has the subsys enabled. The + * following lists all css_sets which point to this cgroup's css + * for the given subsystem. + */ + struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. Protected by @@ -369,6 +378,15 @@ struct css_set { struct cgroup *mg_src_cgrp; struct css_set *mg_dst_cset; + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + /* For RCU-protected deletion */ struct rcu_head rcu_head; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4eb2dd1bb5b1..37d966289978 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -425,6 +425,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_rwsem); @@ -432,6 +434,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) return; /* This css_set is dead. 
unlink it and release cgroup refcounts */ + for_each_subsys(ss, ssid) + list_del(&cset->e_cset_node[ssid]); hash_del(&cset->hlist); css_set_count--; @@ -673,7 +677,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; + struct cgroup_subsys *ss; unsigned long key; + int ssid; lockdep_assert_held(&cgroup_mutex); @@ -724,10 +730,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_set_count++; - /* Add this cgroup group to the hash table */ + /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); + for_each_subsys(ss, ssid) + list_add_tail(&cset->e_cset_node[ssid], + &cset->subsys[ssid]->cgroup->e_csets[ssid]); + up_write(&css_set_rwsem); return cset; @@ -1028,7 +1038,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { struct cgroup_subsys *ss; - int ssid, ret; + int ssid, i, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -1081,6 +1091,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, for_each_subsys(ss, ssid) { struct cgroup_root *src_root; struct cgroup_subsys_state *css; + struct css_set *cset; if (!(ss_mask & (1 << ssid))) continue; @@ -1095,6 +1106,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dst_root->cgrp.e_csets[ss->id]); + up_write(&css_set_rwsem); + src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); @@ -1417,6 +1434,9 @@ out_unlock: static void init_cgroup_housekeeping(struct cgroup *cgrp) { + struct cgroup_subsys *ss; + int ssid; + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); @@ -1425,6 +1445,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; + + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); } static void init_cgroup_root(struct cgroup_root *root, @@ -4249,6 +4272,9 @@ int __init cgroup_init(void) if (!ss->early_init) cgroup_init_subsys(ss); + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); + /* * cftype registration needs kmalloc and can't be done * during early_init. Register base cftypes separately. -- cgit v1.2.3-71-gd317 From 0f0a2b4fa6210147131082999f1f16d7fb79abf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: reorganize css_task_iter This patch reorganizes css_task_iter so that adding effective css support is easier. * s/->cset_link/->cset_pos/ and s/->task/->task_pos/ for consistency * ->origin_css is used to determine whether the iteration reached the last css_set. Replace it with explicit ->cset_head so that css_advance_task_iter() doesn't have to know the termination condition directly. * css_task_iter_next() currently assumes that it's walking list of cgrp_cset_link and reaches into the current cset through the current link to determine the termination conditions for task walking. As this won't always be true for effective css walking, add ->tasks_head and ->mg_tasks_head and use them to control task walking so that css_task_iter_next() doesn't have to know how css_sets are being walked. 
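For orientation, the iterator's user-facing contract is a plain start/next/end loop; a minimal usage sketch, with css_task_iter_end() assumed from the existing interface:

	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it))) {
		/* visit every task associated with @css */
	}
	css_task_iter_end(&it);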
This patch doesn't make any behavior changes. The iteration logic stays unchanged after the patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 9 ++++++--- kernel/cgroup.c | 33 +++++++++++++++++---------------- 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 33a0043ef454..bee390586120 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -842,9 +842,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { - struct cgroup_subsys_state *origin_css; - struct list_head *cset_link; - struct list_head *task; + struct list_head *cset_pos; + struct list_head *cset_head; + + struct list_head *task_pos; + struct list_head *tasks_head; + struct list_head *mg_tasks_head; }; void css_task_iter_start(struct cgroup_subsys_state *css, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0edc186cd545..d48163b26196 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2857,27 +2857,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, */ static void css_advance_task_iter(struct css_task_iter *it) { - struct list_head *l = it->cset_link; + struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_css->cgroup->cset_links) { - it->cset_link = NULL; + if (l == it->cset_head) { + it->cset_pos = NULL; return; } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); - it->cset_link = l; + it->cset_pos = l; if (!list_empty(&cset->tasks)) - it->task = cset->tasks.next; + it->task_pos = cset->tasks.next; else - it->task = cset->mg_tasks.next; + it->task_pos = cset->mg_tasks.next; + + it->tasks_head = &cset->tasks; + it->mg_tasks_head = &cset->mg_tasks; } /** @@ -2903,8 +2906,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->origin_css = css; - it->cset_link = &css->cgroup->cset_links; + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); } @@ -2920,12 +2923,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct list_head *l = it->task; - struct cgrp_cset_link *link = list_entry(it->cset_link, - struct cgrp_cset_link, cset_link); + struct list_head *l = it->task_pos; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_link) + if (!it->cset_pos) return NULL; res = list_entry(l, struct task_struct, cg_list); @@ -2936,13 +2937,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ l = l->next; - if (l == &link->cset->tasks) - l = link->cset->mg_tasks.next; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - if (l == &link->cset->mg_tasks) + if (l == it->mg_tasks_head) css_advance_task_iter(it); else - it->task = l; + it->task_pos = l; return res; } -- cgit v1.2.3-71-gd317 From 3ebb2b6ef38875b866ec0118bfae7bc52afd0166 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: teach css_task_iter about effective csses Currently, css_task_iter iterates tasks associated with a css by visiting each css_set associated with the owning cgroup and walking tasks of each of them. 
This works fine for !unified hierarchies as each cgroup has its own css for each associated subsystem on the hierarchy; however, on the planned unified hierarchy, a cgroup may not have csses associated and its tasks would be considered associated with the matching css of the nearest ancestor which has the subsystem enabled. This means that on the default unified hierarchy, just walking all tasks associated with a cgroup isn't enough to walk all tasks which are associated with the specified css. If any of its children doesn't have the matching css enabled, task iteration should also include all tasks from the subtree. We already added cgroup->e_csets[] to list all css_sets effectively associated with a given css and walk css_sets on that list instead to achieve such iteration. This patch updates css_task_iter iteration such that it walks css_sets on cgroup->e_csets[] instead of cgroup->cset_links if iteration is requested on an non-dummy css. Thanks to the previous iteration update, this change can be achieved with the addition of css_task_iter->ss and minimal updates to css_advance_task_iter() and css_task_iter_start(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bee390586120..18fcae39e63e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -842,6 +842,8 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { + struct cgroup_subsys *ss; + struct list_head *cset_pos; struct list_head *cset_head; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d48163b26196..ad28866ed44c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,8 +2868,14 @@ static void css_advance_task_iter(struct css_task_iter *it) it->cset_pos = NULL; return; } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; + + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); it->cset_pos = l; @@ -2906,7 +2912,13 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->cset_pos = &css->cgroup->cset_links; + it->ss = css->ss; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); -- cgit v1.2.3-71-gd317 From 6803c006282768ec850760766a6e4eb1a6ff87df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: add css_set->dfl_cgrp To implement the unified hierarchy behavior, we'll need to be able to determine the associated cgroup on the default hierarchy from css_set. Let's add css_set->dfl_cgrp so that it can be accessed conveniently and efficiently. 
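As an illustration, a hedged sketch of the kind of lookup this enables; task_css_set() is the existing accessor, and the helper name here is hypothetical:

	/* default-hierarchy cgroup of @task; caller must hold rcu_read_lock() */
	static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
	{
		return task_css_set(task)->dfl_cgrp;
	}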
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 +++ kernel/cgroup.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 18fcae39e63e..c49d161a71cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -354,6 +354,9 @@ struct css_set { */ struct list_head cgrp_links; + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; + /* * Set of subsystem states, one for each subsystem. This array is * immutable after creation apart from the init_css_set during diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a4f88db3205..c66bfc8ee8a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -651,6 +651,10 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; -- cgit v1.2.3-71-gd317 From f8f22e53a262ebee37fc98004f16b066cf5bc125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: implement dynamic subtree controller enable/disable on the default hierarchy cgroup is switching away from multiple hierarchies and will use one unified default hierarchy where controllers can be dynamically enabled and disabled per subtree. The default hierarchy will serve as the unified hierarchy to which all controllers are attached and a css on the default hierarchy would need to also serve the tasks of descendant cgroups which don't have the controller enabled - ie. the tree may be collapsed from leaf towards root when viewed from specific controllers. This has been implemented through effective css in the previous patches. This patch finally implements dynamic subtree controller enable/disable on the default hierarchy via a new knob - "cgroup.subtree_control" which controls which controllers are enabled on the child cgroups. Let's assume a hierarchy like the following. root - A - B - C \ D root's "cgroup.subtree_control" determines which controllers are enabled on A. A's on B. B's on C and D. This coincides with the fact that controllers on the immediate sub-level are used to distribute the resources of the parent. In fact, it's natural to assume that resource control knobs of a child belong to its parent. Enabling a controller in "cgroup.subtree_control" declares that distribution of the respective resources of the cgroup will be controlled. Note that this means that controller enable states are shared among siblings. The default hierarchy has an extra restriction - only cgroups which don't contain any task may have controllers enabled in "cgroup.subtree_control". Combined with the other properties of the default hierarchy, this guarantees that, from the view point of controllers, tasks are only on the leaf cgroups. In other words, only leaf csses may contain tasks. This rules out situations where child cgroups compete against internal tasks of the parent, which is a competition between two different types of entities without any clear way to determine resource distribution between the two. Different controllers handle it differently and all the implemented behaviors are ambiguous, ad-hoc, cumbersome and/or just wrong. 
Having these structural constraints imposed by cgroup core removes the burden from controller implementations and enables showing one consistent behavior across all controllers.

When a controller is enabled or disabled, css associations for the controller in the subtrees of each child should be updated. After enabling, the whole subtree of a child should point to the new css of the child. After disabling, the whole subtree of a child should point to the cgroup's css. This is implemented by first updating cgroup states such that the cgroup_e_css() result points to the appropriate css and then invoking cgroup_update_dfl_csses() which migrates all tasks in the affected subtrees to the self cgroup on the default hierarchy.

* When read, "cgroup.subtree_control" lists all the currently enabled controllers on the children of the cgroup.

* A white-space-separated list of controller names, each prefixed with either '+' or '-', can be written to "cgroup.subtree_control". Controllers prefixed with '+' are enabled and those prefixed with '-' are disabled.

* A controller can be enabled iff the parent's "cgroup.subtree_control" enables it and disabled iff no child's "cgroup.subtree_control" has it enabled.

* If a cgroup has tasks, no controller can be enabled via "cgroup.subtree_control". Likewise, if "cgroup.subtree_control" has some controllers enabled, tasks can't be migrated into the cgroup.

* All controllers which aren't bound on other hierarchies are automatically associated with the root cgroup of the default hierarchy. All the controllers which are bound to the default hierarchy are listed in the read-only file "cgroup.controllers" in the root directory.

* "cgroup.controllers" in all non-root cgroups is a read-only file whose content is equal to that of "cgroup.subtree_control" of the parent. This indicates which controllers can be used in the cgroup's "cgroup.subtree_control".

This is still experimental and there are some holes, one of which is that ->can_attach() failure during cgroup_update_dfl_csses() may leave the cgroups in an undefined state. The issues will be addressed by future patches.

v2: Non-root cgroups now also have "cgroup.controllers". 
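As a usage illustration, a minimal userspace sketch that enables memory and disables cpu for A's children. The mount point and the controller names are assumptions, and error handling is mostly elided:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* assumed mount point of the default hierarchy */
		int fd = open("/sys/fs/cgroup/A/cgroup.subtree_control", O_WRONLY);
		const char *req = "+memory -cpu";

		if (fd < 0)
			return 1;
		/* fails (errno == EBUSY) if A has tasks, or if a child of A
		 * still has cpu enabled in its own subtree_control */
		if (write(fd, req, strlen(req)) < 0)
			return 1;
		close(fd);
		return 0;
	}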
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 5 + kernel/cgroup.c | 367 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 370 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c49d161a71cd..ada239253ec7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -164,6 +165,7 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ /* * Monotonically increasing unique serial number which defines a @@ -216,6 +218,9 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; + + /* used to wait for offlining of csses */ + wait_queue_head_t offline_waitq; }; #define MAX_CGROUP_ROOT_NAMELEN 64 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c2835a9e192..809dd903ceb8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -182,6 +182,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); @@ -338,6 +340,14 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->children, sibling) \ + if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1450,6 +1460,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + + init_waitqueue_head(&cgrp->offline_waitq); } static void init_cgroup_root(struct cgroup_root *root, @@ -1938,6 +1950,14 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. 
+ */ + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + dst_cgrp->child_subsys_mask) + return -EBUSY; + /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; @@ -2303,6 +2323,326 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; + + for_each_subsys(ss, ssid) { + if (ss_mask & (1 << ssid)) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } + } + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + return 0; +} + +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + return 0; +} + +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly. This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. + */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ + LIST_HEAD(preloaded_csets); + struct cgroup_subsys_state *css; + struct css_set *src_cset; + int ret; + + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); + + /* look up all csses currently attached to @cgrp's subtree */ + down_read(&css_set_rwsem); + css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + struct cgrp_cset_link *link; + + /* self is not affected by child_subsys_mask change */ + if (css->cgroup == cgrp) + continue; + + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, cgrp, + &preloaded_csets); + } + up_read(&css_set_rwsem); + + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + if (ret) + goto out_finish; + + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *last_task = NULL, *task; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + + /* + * All tasks in src_cset need to be migrated to the + * matching dst_cset. Empty it process by process. We + * walk tasks but migrate processes. The leader might even + * belong to a different cset but such src_cset would also + * be among the target src_csets because the default + * hierarchy enforces per-process membership. 
+ */ + while (true) { + down_read(&css_set_rwsem); + task = list_first_entry_or_null(&src_cset->tasks, + struct task_struct, cg_list); + if (task) { + task = task->group_leader; + WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); + get_task_struct(task); + } + up_read(&css_set_rwsem); + + if (!task) + break; + + /* guard against possible infinite loop */ + if (WARN(last_task == task, + "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) + goto out_finish; + last_task = task; + + threadgroup_lock(task); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } + + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + + threadgroup_unlock(task); + put_task_struct(task); + + if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) + goto out_finish; + } + } + +out_finish: + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, char *buffer) +{ + unsigned long enable_req = 0, disable_req = 0, enable, disable; + struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup_subsys *ss; + char *tok, *p; + int ssid, ret; + + /* + * Parse input - white space separated list of subsystem names + * prefixed with either + or -. + */ + p = buffer; + while ((tok = strsep(&p, " \t\n"))) { + for_each_subsys(ss, ssid) { + if (ss->disabled || strcmp(tok + 1, ss->name)) + continue; + + if (*tok == '+') { + enable_req |= 1 << ssid; + disable_req &= ~(1 << ssid); + } else if (*tok == '-') { + disable_req |= 1 << ssid; + enable_req &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup_lock_live_group() already provides enough + * protection. Ensure @cgrp stays accessible and break the + * active_ref protection. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(cgrp->control_kn); +retry: + enable = enable_req; + disable = disable_req; + + mutex_lock(&cgroup_tree_mutex); + + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->child_subsys_mask & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } + + /* + * Because css offlining is asynchronous, userland + * might try to re-enable the same controller while + * the previous instance is still around. In such + * cases, wait till it's gone using offline_waitq. + */ + cgroup_for_each_live_child(child, cgrp) { + wait_queue_t wait; + + if (!cgroup_css(child, ss)) + continue; + + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_tree_mutex); + schedule(); + finish_wait(&child->offline_waitq, &wait); + goto retry; + } + + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock_tree; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->child_subsys_mask & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } + + /* a child has it enabled? 
*/ + cgroup_for_each_live_child(child, cgrp) { + if (child->child_subsys_mask & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock_tree; + } + } + } + } + + if (!enable && !disable) { + ret = 0; + goto out_unlock_tree; + } + + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } + + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Create csses for enables and update child_subsys_mask. This + * changes cgroup_e_css() results which in turn makes the + * subsequent cgroup_update_dfl_csses() associate all tasks in the + * subtree to the updated csses. + */ + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + ret = create_css(child, ss); + if (ret) + goto err_undo_css; + } + } + + cgrp->child_subsys_mask |= enable; + cgrp->child_subsys_mask &= ~disable; + + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + goto err_undo_css; + + /* all tasks are now migrated away from the old csses, kill them */ + for_each_subsys(ss, ssid) { + if (!(disable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) + kill_css(cgroup_css(child, ss)); + } + + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + mutex_unlock(&cgroup_mutex); +out_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(cgrp->control_kn); + cgroup_put(cgrp); + return ret; + +err_undo_css: + cgrp->child_subsys_mask &= ~enable; + cgrp->child_subsys_mask |= disable; + + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + if (css) + kill_css(css); + } + } + goto out_unlock; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2462,9 +2802,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); - if (ret) + if (ret) { kernfs_remove(kn); - return ret; + return ret; + } + + if (cft->seq_show == cgroup_subtree_control_show) + cgrp->control_kn = kn; + return 0; } /** @@ -3557,6 +3902,22 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_root_controllers_show, + }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .flags = CFTYPE_ONLY_ON_DFL, + .seq_show = cgroup_subtree_control_show, + .write_string = cgroup_subtree_control_write, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and @@ -3725,6 +4086,8 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); } /** -- cgit v1.2.3-71-gd317 From be8f274323c26ddc7e6fd6c44254b7abcdbe6389 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:16:58 +0900 Subject: kprobes: Prohibit probing on .entry.text code .entry.text is a code area which is used for interrupt/syscall entries, which includes many sensitive code. 
Thus, it is better to prohibit probing on all of such code instead of a part of that. Since some symbols are already registered on kprobe blacklist, this also removes them from the blacklist. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Borislav Petkov Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jan Kiszka Cc: Jiri Kosina Cc: Jonathan Lebon Cc: Seiji Aguchi Link: http://lkml.kernel.org/r/20140417081658.26341.57354.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 33 --------------------------------- arch/x86/kernel/entry_64.S | 20 -------------------- arch/x86/kernel/kprobes/core.c | 8 ++++++++ include/linux/kprobes.h | 1 + kernel/kprobes.c | 13 ++++++++----- 5 files changed, 17 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a2a4f4697889..0ca5bf1697bb 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -314,10 +314,6 @@ ENTRY(ret_from_kernel_thread) CFI_ENDPROC ENDPROC(ret_from_kernel_thread) -/* - * Interrupt exit functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -372,10 +368,6 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC -/* - * End of kprobes section - */ - .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -495,10 +487,6 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) -/* - * syscall stub including irq exit should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -691,10 +679,6 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC -/* - * End of kprobes section - */ - .popsection .macro FIXUP_ESPFIX_STACK /* @@ -781,10 +765,6 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC -/* - * Irq entries should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -961,10 +941,6 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) -/* - * End of kprobes section - */ - .popsection #ifdef CONFIG_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter @@ -1239,11 +1215,6 @@ return_to_handler: jmp *%ecx #endif -/* - * Some functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" - #ifdef CONFIG_TRACING ENTRY(trace_page_fault) RING0_EC_FRAME @@ -1453,7 +1424,3 @@ ENTRY(async_page_fault) END(async_page_fault) #endif -/* - * End of kprobes section - */ - .popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..43bb38951660 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -487,8 +487,6 @@ ENDPROC(native_usergs_sysret64) TRACE_IRQS_OFF .endm -/* save complete stack frame */ - .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -517,7 +515,6 @@ ENTRY(save_paranoid) 1: ret CFI_ENDPROC END(save_paranoid) - .popsection /* * A newly forked process directly context switches into this address. 
@@ -975,10 +972,6 @@ END(interrupt) call \func .endm -/* - * Interrupt entry/exit should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -1113,10 +1106,6 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) -/* - * End of kprobes section - */ - .popsection /* * APIC interrupts. @@ -1477,11 +1466,6 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -/* - * Some functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" - paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment @@ -1898,7 +1882,3 @@ ENTRY(ignore_sysret) CFI_ENDPROC END(ignore_sysret) -/* - * End of kprobes section - */ - .popsection diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index da7bdaa3ce15..7751b3dee53a 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1065,6 +1065,14 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } +bool arch_within_kprobe_blacklist(unsigned long addr) +{ + return (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) || + (addr >= (unsigned long)__entry_text_start && + addr < (unsigned long)__entry_text_end); +} + int __init arch_init_kprobes(void) { return 0; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 925eaf28fca9..cdf9251f8249 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -265,6 +265,7 @@ extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); +extern bool arch_within_kprobe_blacklist(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ceeadfcabb76..5b5ac76671e7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -96,9 +96,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, {"native_get_debugreg",}, - {"irq_entries_start",}, - {"common_interrupt",}, - {"mcount",}, /* mcount can be called from everywhere */ {NULL} /* Terminator */ }; @@ -1324,12 +1321,18 @@ out: return ret; } +bool __weak arch_within_kprobe_blacklist(unsigned long addr) +{ + /* The __kprobes marked functions and entry code must not be probed */ + return addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end; +} + static int __kprobes in_kprobes_functions(unsigned long addr) { struct kprobe_blackpoint *kb; - if (addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) + if (arch_within_kprobe_blacklist(addr)) return -EINVAL; /* * If there exists a kprobe_blacklist, verify and -- cgit v1.2.3-71-gd317 From 376e242429bf8539ef39a080ac113c8799840b13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:17:05 +0900 Subject: kprobes: Introduce NOKPROBE_SYMBOL() macro to maintain kprobes blacklist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce NOKPROBE_SYMBOL() macro which builds a kprobes blacklist at kernel build time. 
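Concretely, a hedged sketch of the intended pattern (the function here is hypothetical); the macro placement mirrors EXPORT_SYMBOL(), as described next:

	static int my_trap_sensitive_func(struct pt_regs *regs)
	{
		/* code that must never hit a kprobe breakpoint */
		return 0;
	}
	NOKPROBE_SYMBOL(my_trap_sensitive_func);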
The usage of this macro is similar to EXPORT_SYMBOL(), placed after the function definition: NOKPROBE_SYMBOL(function); Since this macro will inhibit inlining of static/inline functions, this patch also introduces a nokprobe_inline macro for static/inline functions. In this case, we must use NOKPROBE_SYMBOL() for the inline function caller. When CONFIG_KPROBES=y, the macro stores the given function address in the "_kprobe_blacklist" section. Since the data structures are not fully initialized by the macro (because there is no "size" information), those are re-initialized at boot time by using kallsyms. Signed-off-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20140417081705.26341.96719.stgit@ltc230.yrl.intra.hitachi.co.jp Cc: Alok Kataria Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: Christopher Li Cc: Chris Wright Cc: David S. Miller Cc: Jan-Simon Möller Cc: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Randy Dunlap Cc: Rusty Russell Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-sparse@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Ingo Molnar --- Documentation/kprobes.txt | 16 +++++- arch/x86/include/asm/asm.h | 7 +++ arch/x86/kernel/paravirt.c | 4 ++ include/asm-generic/vmlinux.lds.h | 9 ++++ include/linux/compiler.h | 2 + include/linux/kprobes.h | 20 ++++++-- kernel/kprobes.c | 100 ++++++++++++++++++++------------------ kernel/sched/core.c | 1 + 8 files changed, 107 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 0cfb00fd86ff..4bbeca8483ed 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -22,8 +22,9 @@ Appendix B: The kprobes sysctl interface Kprobes enables you to dynamically break into any kernel routine and collect debugging and performance information non-disruptively. You -can trap at almost any kernel code address, specifying a handler +can trap at almost any kernel code address(*), specifying a handler routine to be invoked when the breakpoint is hit. +(*: some parts of the kernel code can not be trapped, see 1.5 Blacklist) There are currently three types of probes: kprobes, jprobes, and kretprobes (also called return probes). A kprobe can be inserted @@ -273,6 +274,19 @@ using one of the following techniques: or - Execute 'sysctl -w debug.kprobes_optimization=n' +1.5 Blacklist + +Kprobes can probe most of the kernel except itself. This means +that there are some functions where kprobes cannot probe. Probing +(trapping) such functions can cause a recursive trap (e.g. double +fault) or the nested probe handler may never be called. +Kprobes manages such functions as a blacklist. +If you want to add a function into the blacklist, you just need +to (1) include linux/kprobes.h and (2) use NOKPROBE_SYMBOL() macro +to specify a blacklisted function. +Kprobes checks the given probe address against the blacklist and +rejects registering it, if the given address is in the blacklist. + 2. Architectures Supported Kprobes, jprobes, and return probes are implemented on the following diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 4582e8e1cd1a..7730c1c5c83a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -57,6 +57,12 @@ .long (from) - . ; \ .long (to) - . 
+ 0x7ffffff0 ; \ .popsection + +# define _ASM_NOKPROBE(entry) \ + .pushsection "_kprobe_blacklist","aw" ; \ + _ASM_ALIGN ; \ + _ASM_PTR (entry); \ + .popsection #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ @@ -71,6 +77,7 @@ " .long (" #from ") - .\n" \ " .long (" #to ") - . + 0x7ffffff0\n" \ " .popsection\n" +/* For C file, we already have NOKPROBE_SYMBOL macro */ #endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b10af835c31..e136869ae42e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -389,6 +390,9 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .end_context_switch = paravirt_nop, }; +/* At this point, native_get_debugreg has a real function entry */ +NOKPROBE_SYMBOL(native_get_debugreg); + struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .startup_ipi_hook = paravirt_nop, diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 146e4fffd710..40ceb3ceba79 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -109,6 +109,14 @@ #define BRANCH_PROFILE() #endif +#ifdef CONFIG_KPROBES +#define KPROBE_BLACKLIST() VMLINUX_SYMBOL(__start_kprobe_blacklist) = .; \ + *(_kprobe_blacklist) \ + VMLINUX_SYMBOL(__stop_kprobe_blacklist) = .; +#else +#define KPROBE_BLACKLIST() +#endif + #ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ @@ -507,6 +515,7 @@ *(.init.rodata) \ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ + KPROBE_BLACKLIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ee7239ea1583..0300c0f5c88b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -374,7 +374,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ #ifdef CONFIG_KPROBES # define __kprobes __attribute__((__section__(".kprobes.text"))) +# define nokprobe_inline __always_inline #else # define __kprobes +# define nokprobe_inline inline #endif #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index cdf9251f8249..e059507c465d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -205,10 +205,10 @@ struct kretprobe_blackpoint { void *addr; }; -struct kprobe_blackpoint { - const char *name; +struct kprobe_blacklist_entry { + struct list_head list; unsigned long start_addr; - unsigned long range; + unsigned long end_addr; }; #ifdef CONFIG_KPROBES @@ -477,4 +477,18 @@ static inline int enable_jprobe(struct jprobe *jp) return enable_kprobe(&jp->kp); } +#ifdef CONFIG_KPROBES +/* + * Blacklist ganerating macro. Specify functions which is not probed + * by using this macro. 
+ */ +#define __NOKPROBE_SYMBOL(fname) \ +static unsigned long __used \ + __attribute__((section("_kprobe_blacklist"))) \ + _kbl_addr_##fname = (unsigned long)fname; +#define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) +#else +#define NOKPROBE_SYMBOL(fname) +#endif + #endif /* _LINUX_KPROBES_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5b5ac76671e7..5ffc6875d2a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -86,18 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* - * Normally, functions that we'd want to prohibit kprobes in, are marked - * __kprobes. But, there are cases where such functions already belong to - * a different section (__sched for preempt_schedule) - * - * For such cases, we now have a blacklist - */ -static struct kprobe_blackpoint kprobe_blacklist[] = { - {"preempt_schedule",}, - {"native_get_debugreg",}, - {NULL} /* Terminator */ -}; +/* Blacklist -- list of struct kprobe_blacklist_entry */ +static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* @@ -1328,24 +1318,22 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static int __kprobes in_kprobes_functions(unsigned long addr) +static bool __kprobes within_kprobe_blacklist(unsigned long addr) { - struct kprobe_blackpoint *kb; + struct kprobe_blacklist_entry *ent; if (arch_within_kprobe_blacklist(addr)) - return -EINVAL; + return true; /* * If there exists a kprobe_blacklist, verify and * fail any probe registration in the prohibited area */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - if (kb->start_addr) { - if (addr >= kb->start_addr && - addr < (kb->start_addr + kb->range)) - return -EINVAL; - } + list_for_each_entry(ent, &kprobe_blacklist, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; } - return 0; + + return false; } /* @@ -1436,7 +1424,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr) || + within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; goto out; @@ -2022,6 +2010,38 @@ void __kprobes dump_kprobe(struct kprobe *kp) kp->symbol_name, kp->addr, kp->offset); } +/* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. 
+ */ +static int __init populate_kprobe_blacklist(unsigned long *start, + unsigned long *end) +{ + unsigned long *iter; + struct kprobe_blacklist_entry *ent; + unsigned long offset = 0, size = 0; + + for (iter = start; iter < end; iter++) { + if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { + pr_err("Failed to find blacklist %p\n", (void *)*iter); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->start_addr = *iter; + ent->end_addr = *iter + size; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_blacklist); + } + return 0; +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2065,14 +2085,13 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +/* Markers of _kprobe_blacklist section */ +extern unsigned long __start_kprobe_blacklist[]; +extern unsigned long __stop_kprobe_blacklist[]; + static int __init init_kprobes(void) { int i, err = 0; - unsigned long offset = 0, size = 0; - char *modname, namebuf[KSYM_NAME_LEN]; - const char *symbol_name; - void *addr; - struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -2082,26 +2101,11 @@ static int __init init_kprobes(void) raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } - /* - * Lookup and populate the kprobe_blacklist. - * - * Unlike the kretprobe blacklist, we'll need to determine - * the range of addresses that belong to the said functions, - * since a kprobe need not necessarily be at the beginning - * of a function. - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - kprobe_lookup_name(kb->name, addr); - if (!addr) - continue; - - kb->start_addr = (unsigned long)addr; - symbol_name = kallsyms_lookup(kb->start_addr, - &size, &offset, &modname, namebuf); - if (!symbol_name) - kb->range = 0; - else - kb->range = size; + err = populate_kprobe_blacklist(__start_kprobe_blacklist, + __stop_kprobe_blacklist); + if (err) { + pr_err("kprobes: failed to populate blacklist: %d\n", err); + pr_err("Please take care of using kprobes.\n"); } if (kretprobe_blacklist_size) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..6863631e8cd0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2804,6 +2804,7 @@ asmlinkage void __sched notrace preempt_schedule(void) barrier(); } while (need_resched()); } +NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ -- cgit v1.2.3-71-gd317 From 842b597ee0a7e1aa5a3148164ffdba00ec17f614 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 25 Apr 2014 18:28:02 -0400 Subject: cgroup: implement cgroup.populated for the default hierarchy cgroup users often need a way to determine when a cgroup's subhierarchy becomes empty so that it can be cleaned up. cgroup currently provides release_agent for it; unfortunately, this mechanism is riddled with issues. * It delivers events by forking and execing a userland binary specified as the release_agent. This is a long-deprecated method of notification delivery. It's extremely heavy, slow and cumbersome to integrate with larger infrastructure. * There is a single monitoring point at the root. There's no way to delegate management of a subtree. * The event isn't recursive. It triggers when a cgroup doesn't have any tasks or child cgroups. Events for internal nodes trigger only after all children are removed. 
This again makes it impossible to delegate management of a subtree. * Events are filtered from the kernel side. "notify_on_release" file is used to subscribe to or suppress release event. This is unnecessarily complicated and probably done this way because event delivery itself was expensive. This patch implements interface file "cgroup.populated" which can be used to monitor whether the cgroup's subhierarchy has tasks in it or not. Its value is 0 if there is no task in the cgroup and its descendants; otherwise, 1, and a kernfs_notify() notification is triggered when the value changes, which can be monitored through poll and [di]notify. This is a lot lighter and simpler and trivially allows delegating management of a subhierarchy - subhierarchy monitoring can block further propagation simply by putting itself or another process in the root of the subhierarchy and monitoring events that it's interested in from there without interfering with monitoring higher in the tree. v2: Patch description updated as per Serge. v3: "cgroup.subtree_populated" renamed to "cgroup.populated". The subtree_ prefix was a bit confusing because "cgroup.subtree_control" uses it to denote the tree rooted at the cgroup sans the cgroup itself while the populated state includes the cgroup itself. Signed-off-by: Tejun Heo Acked-by: Serge Hallyn Acked-by: Li Zefan Cc: Lennart Poettering --- include/linux/cgroup.h | 15 ++++++++++++ kernel/cgroup.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ada239253ec7..4b38e2d6110d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -154,6 +154,14 @@ struct cgroup { /* the number of attached css's */ int nr_css; + /* + * If this cgroup contains any tasks, it contributes one to + * populated_cnt. All children with non-zero populated_cnt of + * their own contribute one. The count is zero iff there's no task + * in this cgroup or its subtree. + */ + int populated_cnt; + atomic_t refcnt; /* @@ -166,6 +174,7 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ + struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* * Monotonically increasing unique serial number which defines a @@ -264,6 +273,12 @@ enum { * * - "cgroup.clone_children" is removed. * + * - "cgroup.subtree_populated" is available. Its value is 0 if + * the cgroup and its descendants contain no task; otherwise, 1. + * The file also generates kernfs notification which can be + * monitored through poll and [di]notify when the value of the + * file changes. + * * - If mount is requested with sane_behavior but without any + * subsystem, the default unified hierarchy is mounted. + * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 809dd903ceb8..0f986f7afee4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -411,6 +411,43 @@ static struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +/** + * cgroup_update_populated - update populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly. The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. 
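+ *
+ * For example, in a hierarchy A -> B -> C where only C has a task, A,
+ * B and C all have a non-zero populated_cnt; when that task exits, the
+ * count drops to zero in C, then B, then A, and a notification is
+ * generated at each level whose count reaches zero.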
+ * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_rwsem); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + if (cgrp->populated_kn) + kernfs_notify(cgrp->populated_kn); + cgrp = cgrp->parent; + } while (cgrp); +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into @@ -456,10 +493,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (list_empty(&cgrp->cset_links)) { + cgroup_update_populated(cgrp, false); + if (notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } } kfree(link); @@ -668,7 +708,11 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; + + if (list_empty(&cgrp->cset_links)) + cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); + /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation @@ -2643,6 +2687,12 @@ err_undo_css: goto out_unlock; } +static int cgroup_populated_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + return 0; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2809,6 +2859,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (cft->seq_show == cgroup_subtree_control_show) cgrp->control_kn = kn; + else if (cft->seq_show == cgroup_populated_show) + cgrp->populated_kn = kn; return 0; } @@ -3918,6 +3970,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_subtree_control_show, .write_string = cgroup_subtree_control_write, }, + { + .name = "cgroup.populated", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_populated_show, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and -- cgit v1.2.3-71-gd317 From 4af619ae2ccf47a2b3a108e1926736484721370f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 24 Apr 2014 19:09:05 +0200 Subject: at86rf230: use irq_get_trigger_type This patch removes the platform data for the irq_type. Instead, we use the irq_get_trigger_type() function to get these flags, which should already be configured by the interrupt controller. Signed-off-by: Alexander Aring Signed-off-by: David S. 
Miller --- drivers/net/ieee802154/at86rf230.c | 28 ++++++++-------------------- include/linux/spi/at86rf230.h | 14 -------------- 2 files changed, 8 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index e36f194673a4..17b9c9aea9be 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -970,8 +971,7 @@ static int at86rf230_irq_polarity(struct at86rf230_local *lp, int pol) static int at86rf230_hw_init(struct at86rf230_local *lp) { - struct at86rf230_platform_data *pdata = lp->spi->dev.platform_data; - int rc, irq_pol; + int rc, irq_pol, irq_type; u8 status; u8 csma_seed[2]; @@ -983,8 +983,9 @@ static int at86rf230_hw_init(struct at86rf230_local *lp) if (rc) return rc; + irq_type = irq_get_trigger_type(lp->spi->irq); /* configure irq polarity, defaults to high active */ - if (pdata->irq_type & (IRQF_TRIGGER_FALLING | IRQF_TRIGGER_LOW)) + if (irq_type & (IRQF_TRIGGER_FALLING | IRQF_TRIGGER_LOW)) irq_pol = IRQ_ACTIVE_LOW; else irq_pol = IRQ_ACTIVE_HIGH; @@ -1032,7 +1033,6 @@ static struct at86rf230_platform_data * at86rf230_get_pdata(struct spi_device *spi) { struct at86rf230_platform_data *pdata; - const char *irq_type; if (!IS_ENABLED(CONFIG_OF) || !spi->dev.of_node) return spi->dev.platform_data; @@ -1044,19 +1044,6 @@ at86rf230_get_pdata(struct spi_device *spi) pdata->rstn = of_get_named_gpio(spi->dev.of_node, "reset-gpio", 0); pdata->slp_tr = of_get_named_gpio(spi->dev.of_node, "sleep-gpio", 0); - pdata->irq_type = IRQF_TRIGGER_RISING; - of_property_read_string(spi->dev.of_node, "irq-type", &irq_type); - if (!strcmp(irq_type, "level-high")) - pdata->irq_type = IRQF_TRIGGER_HIGH; - else if (!strcmp(irq_type, "level-low")) - pdata->irq_type = IRQF_TRIGGER_LOW; - else if (!strcmp(irq_type, "edge-rising")) - pdata->irq_type = IRQF_TRIGGER_RISING; - else if (!strcmp(irq_type, "edge-falling")) - pdata->irq_type = IRQF_TRIGGER_FALLING; - else - dev_warn(&spi->dev, "wrong irq-type specified using edge-rising\n"); - spi->dev.platform_data = pdata; done: return pdata; @@ -1071,7 +1058,7 @@ static int at86rf230_probe(struct spi_device *spi) u8 part = 0, version = 0, status; irq_handler_t irq_handler; work_func_t irq_worker; - int rc; + int rc, irq_type; const char *chip; struct ieee802154_ops *ops = NULL; @@ -1176,7 +1163,8 @@ static int at86rf230_probe(struct spi_device *spi) dev->extra_tx_headroom = 0; dev->flags = IEEE802154_HW_OMIT_CKSUM | IEEE802154_HW_AACK; - if (pdata->irq_type & (IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING)) { + irq_type = irq_get_trigger_type(spi->irq); + if (irq_type & (IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING)) { irq_worker = at86rf230_irqwork; irq_handler = at86rf230_isr; } else { @@ -1203,7 +1191,7 @@ static int at86rf230_probe(struct spi_device *spi) goto err_hw_init; rc = request_irq(spi->irq, irq_handler, - IRQF_SHARED | pdata->irq_type, + IRQF_SHARED | irq_type, dev_name(&spi->dev), lp); if (rc) goto err_hw_init; diff --git a/include/linux/spi/at86rf230.h b/include/linux/spi/at86rf230.h index aa327a8105ad..b2b1afbb3202 100644 --- a/include/linux/spi/at86rf230.h +++ b/include/linux/spi/at86rf230.h @@ -26,20 +26,6 @@ struct at86rf230_platform_data { int rstn; int slp_tr; int dig2; - - /* Setting the irq_type will configure the driver to request - * the platform irq trigger type according to the given value - * and configure the interrupt polarity of the 
device to the - * corresponding polarity. - * - * Allowed values are: IRQF_TRIGGER_RISING, IRQF_TRIGGER_FALLING, - * IRQF_TRIGGER_HIGH and IRQF_TRIGGER_LOW - * - * Setting it to 0, the driver does not touch the trigger type - * configuration of the interrupt and sets the interrupt polarity - * of the device to high active (the default value). - */ - int irq_type; }; #endif -- cgit v1.2.3-71-gd317 From e2dcdfe95c0bd67e37db6057edd9c4ee1f1c7b17 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Apr 2014 11:15:08 +0930 Subject: virtio: virtio_break_device() to mark all virtqueues broken. Good for post-apocalyptic scenarios, like S/390 hotplug. Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 15 +++++++++++++++ include/linux/virtio.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 1e443629f76d..4d08f45a9c29 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -865,4 +865,19 @@ bool virtqueue_is_broken(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_is_broken); +/* + * This should prevent the device from being used, allowing drivers to + * recover. You may need to grab appropriate locks to flush. + */ +void virtio_break_device(struct virtio_device *dev) +{ + struct virtqueue *_vq; + + list_for_each_entry(_vq, &dev->vqs, list) { + struct vring_virtqueue *vq = to_vvq(_vq); + vq->broken = true; + } +} +EXPORT_SYMBOL_GPL(virtio_break_device); + MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index e4abb84199be..b46671e28de2 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -106,6 +106,8 @@ static inline struct virtio_device *dev_to_virtio(struct device *_dev) int register_virtio_device(struct virtio_device *dev); void unregister_virtio_device(struct virtio_device *dev); +void virtio_break_device(struct virtio_device *dev); + /** * virtio_driver - operations for a virtio I/O driver * @driver: underlying device driver (populate name and owner). -- cgit v1.2.3-71-gd317 From 51e158c12aca3c9ac63988611a97c05109b14dc9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Apr 2014 11:34:33 +0930 Subject: param: hand arguments after -- straight to init The kernel passes any args it doesn't need through to init, except it assumes anything containing '.' belongs to the kernel (for a module). This change means all users can clearly distinguish which arguments are for init. For example, the kernel uses debug ("dee-bug") to mean log everything to the console, where systemd uses the debug from the Scandinavian "day-boog" meaning "fail to boot". If a future version uses argv[] instead of reading /proc/cmdline, this confusion will be avoided. 
eg: test 'FOO="this is --foo"' -- 'systemd.debug="true true true"' Gives: argv[0] = '/debug-init' argv[1] = 'test' argv[2] = 'systemd.debug=true true true' envp[0] = 'HOME=/' envp[1] = 'TERM=linux' envp[2] = 'FOO=this is --foo' Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 2 +- init/main.c | 33 +++++++++++++++++++++++++++++---- kernel/module.c | 12 +++++++++--- kernel/params.c | 25 ++++++++++++++----------- 4 files changed, 53 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 204a67743804..b1990c5524e1 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -321,7 +321,7 @@ extern bool parameq(const char *name1, const char *name2); extern bool parameqn(const char *name1, const char *name2, size_t n); /* Called on module insert or kernel boot */ -extern int parse_args(const char *name, +extern char *parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, diff --git a/init/main.c b/init/main.c index 9c7fd4c9249f..e9d458b5d77b 100644 --- a/init/main.c +++ b/init/main.c @@ -252,6 +252,27 @@ static int __init repair_env_string(char *param, char *val, const char *unused) return 0; } +/* Anything after -- gets handed straight to init. */ +static int __init set_init_arg(char *param, char *val, const char *unused) +{ + unsigned int i; + + if (panic_later) + return 0; + + repair_env_string(param, val, unused); + + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "init"; + panic_param = param; + return 0; + } + } + argv_init[i] = param; + return 0; +} + /* * Unknown boot options get handed to init, unless they look like * unused parameters (modprobe will find them in /proc/cmdline). @@ -478,7 +499,7 @@ static void __init mm_init(void) asmlinkage void __init start_kernel(void) { - char * command_line; + char * command_line, *after_dashes; extern const struct kernel_param __start___param[], __stop___param[]; /* @@ -519,9 +540,13 @@ asmlinkage void __init start_kernel(void) pr_notice("Kernel command line: %s\n", boot_command_line); parse_early_param(); - parse_args("Booting kernel", static_command_line, __start___param, - __stop___param - __start___param, - -1, -1, &unknown_bootoption); + after_dashes = parse_args("Booting kernel", + static_command_line, __start___param, + __stop___param - __start___param, + -1, -1, &unknown_bootoption); + if (after_dashes) + parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, + set_init_arg); jump_label_init(); diff --git a/kernel/module.c b/kernel/module.c index 11869408f79b..66e4e0d260a9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3193,6 +3193,7 @@ static int load_module(struct load_info *info, const char __user *uargs, { struct module *mod; long err; + char *after_dashes; err = module_sig_check(info); if (err) @@ -3277,10 +3278,15 @@ static int load_module(struct load_info *info, const char __user *uargs, goto ddebug_cleanup; /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, unknown_module_param_cb); - if (err < 0) + after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, unknown_module_param_cb); + if (IS_ERR(after_dashes)) { + err = PTR_ERR(after_dashes); goto bug_cleanup; + } else if (after_dashes) { + pr_warn("%s: parameters '%s' after `--' ignored\n", + mod->name, after_dashes); + } /* Link in to syfs. 
*/ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); diff --git a/kernel/params.c b/kernel/params.c index b00142e7f3ba..1e52ca233fd9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val) } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ -int parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned num, - s16 min_level, - s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned num, + s16 min_level, + s16 max_level, + int (*unknown)(char *param, char *val, const char *doing)) { char *param, *val; @@ -198,6 +198,9 @@ int parse_args(const char *doing, int irq_was_disabled; args = next_arg(args, &param, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) + return args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, unknown); @@ -208,22 +211,22 @@ int parse_args(const char *doing, switch (ret) { case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ret; + return ERR_PTR(ret); case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); } } /* All parsed OK. */ - return 0; + return NULL; } /* Lazy bastard, eh? */ -- cgit v1.2.3-71-gd317 From 683399eddb9fff742b1a14c5a5d03e12bfc0afff Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Sun, 20 Apr 2014 18:57:36 -0600 Subject: netfilter: nfnetlink_acct: Adding quota support to accounting framework nfacct objects already support accounting at the byte and packet level. As such it is a natural extension to add the possibility to define a ceiling limit for both metrics. All the support for quotas itself is added to the nfnetlink accounting framework to stay coherent with current accounting object management. Quota limit checks are implemented in the xt_nfacct filter where statistic collection is already done. Pablo Neira Ayuso has also contributed to this feature. 
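A rough sketch of how the new notifications could be consumed from
userland -- illustrative only, not part of this patch; it assumes the
object was already attached to traffic with something like
"iptables -I FORWARD -m nfacct --nfacct-name myquota":

	#include <stdio.h>
	#include <linux/netlink.h>
	#include <linux/netfilter/nfnetlink.h>
	#include <libmnl/libmnl.h>

	int main(void)
	{
		char buf[MNL_SOCKET_BUFFER_SIZE];
		struct mnl_socket *nl;
		ssize_t len;

		nl = mnl_socket_open(NETLINK_NETFILTER);
		if (!nl)
			return 1;

		/* subscribe to the NFNLGRP_ACCT_QUOTA multicast group */
		if (mnl_socket_bind(nl, 1 << (NFNLGRP_ACCT_QUOTA - 1),
				    MNL_SOCKET_AUTOPID) < 0)
			return 1;

		/* each message is an NFNL_MSG_ACCT_OVERQUOTA event */
		while ((len = mnl_socket_recvfrom(nl, buf, sizeof(buf))) > 0)
			printf("overquota event, %zd bytes of netlink data\n", len);

		mnl_socket_close(nl);
		return 0;
	}
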
Signed-off-by: Mathieu Poirier Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_acct.h | 8 ++- include/uapi/linux/netfilter/nfnetlink.h | 2 + include/uapi/linux/netfilter/nfnetlink_acct.h | 9 +++ net/netfilter/nfnetlink_acct.c | 85 +++++++++++++++++++++++++++ net/netfilter/xt_nfacct.c | 5 +- 5 files changed, 107 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h index b2e85e59f760..6ec975748742 100644 --- a/include/linux/netfilter/nfnetlink_acct.h +++ b/include/linux/netfilter/nfnetlink_acct.h @@ -3,11 +3,17 @@ #include +enum { + NFACCT_NO_QUOTA = -1, + NFACCT_UNDERQUOTA, + NFACCT_OVERQUOTA, +}; struct nf_acct; struct nf_acct *nfnl_acct_find_get(const char *filter_name); void nfnl_acct_put(struct nf_acct *acct); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct); - +extern int nfnl_acct_overquota(const struct sk_buff *skb, + struct nf_acct *nfacct); #endif /* _NFNL_ACCT_H */ diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 596ddd45253c..354a7e5e50f2 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -20,6 +20,8 @@ enum nfnetlink_groups { #define NFNLGRP_CONNTRACK_EXP_DESTROY NFNLGRP_CONNTRACK_EXP_DESTROY NFNLGRP_NFTABLES, #define NFNLGRP_NFTABLES NFNLGRP_NFTABLES + NFNLGRP_ACCT_QUOTA, +#define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink_acct.h b/include/uapi/linux/netfilter/nfnetlink_acct.h index c7b6269e760b..51404ec19022 100644 --- a/include/uapi/linux/netfilter/nfnetlink_acct.h +++ b/include/uapi/linux/netfilter/nfnetlink_acct.h @@ -10,15 +10,24 @@ enum nfnl_acct_msg_types { NFNL_MSG_ACCT_GET, NFNL_MSG_ACCT_GET_CTRZERO, NFNL_MSG_ACCT_DEL, + NFNL_MSG_ACCT_OVERQUOTA, NFNL_MSG_ACCT_MAX }; +enum nfnl_acct_flags { + NFACCT_F_QUOTA_PKTS = (1 << 0), + NFACCT_F_QUOTA_BYTES = (1 << 1), + NFACCT_F_OVERQUOTA = (1 << 2), /* can't be set from userspace */ +}; + enum nfnl_acct_type { NFACCT_UNSPEC, NFACCT_NAME, NFACCT_PKTS, NFACCT_BYTES, NFACCT_USE, + NFACCT_FLAGS, + NFACCT_QUOTA, __NFACCT_MAX }; #define NFACCT_MAX (__NFACCT_MAX - 1) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c7b6d466a662..70e86bbb3637 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -32,18 +32,24 @@ static LIST_HEAD(nfnl_acct_list); struct nf_acct { atomic64_t pkts; atomic64_t bytes; + unsigned long flags; struct list_head head; atomic_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; + char data[0]; }; +#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) + static int nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; char *acct_name; + unsigned int size = 0; + u32 flags = 0; if (!tb[NFACCT_NAME]) return -EINVAL; @@ -68,15 +74,39 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); + smp_mb__before_clear_bit(); + /* reset overquota flag if quota is enabled. 
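+ * (clearing NFACCT_F_OVERQUOTA here means that replacing an + * object re-arms its one-shot overquota notification)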
*/ + if ((matching->flags & NFACCT_F_QUOTA)) + clear_bit(NFACCT_F_OVERQUOTA, &matching->flags); return 0; } return -EBUSY; } nfacct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL); + if (tb[NFACCT_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); + if (flags & ~NFACCT_F_QUOTA) + return -EOPNOTSUPP; + if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) + return -EINVAL; + if (flags & NFACCT_F_OVERQUOTA) + return -EINVAL; + + size += sizeof(u64); + } + + nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); if (nfacct == NULL) return -ENOMEM; + if (flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)nfacct->data; + + *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); + nfacct->flags = flags; + } + strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { @@ -117,6 +147,9 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (type == NFNL_MSG_ACCT_GET_CTRZERO) { pkts = atomic64_xchg(&acct->pkts, 0); bytes = atomic64_xchg(&acct->bytes, 0); + smp_mb__before_clear_bit(); + if (acct->flags & NFACCT_F_QUOTA) + clear_bit(NFACCT_F_OVERQUOTA, &acct->flags); } else { pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); @@ -125,7 +158,13 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) goto nla_put_failure; + if (acct->flags & NFACCT_F_QUOTA) { + u64 *quota = (u64 *)acct->data; + if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || + nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) + goto nla_put_failure; + } nlmsg_end(skb, nlh); return skb->len; @@ -270,6 +309,8 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, [NFACCT_BYTES] = { .type = NLA_U64 }, [NFACCT_PKTS] = { .type = NLA_U64 }, + [NFACCT_FLAGS] = { .type = NLA_U32 }, + [NFACCT_QUOTA] = { .type = NLA_U64 }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { @@ -336,6 +377,50 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_update); +static void nfnl_overquota_report(struct nf_acct *nfacct) +{ + int ret; + struct sk_buff *skb; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + return; + + ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, + nfacct); + if (ret <= 0) { + kfree_skb(skb); + return; + } + netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, + GFP_ATOMIC); +} + +int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) +{ + u64 now; + u64 *quota; + int ret = NFACCT_UNDERQUOTA; + + /* no place here if we don't have a quota */ + if (!(nfacct->flags & NFACCT_F_QUOTA)) + return NFACCT_NO_QUOTA; + + quota = (u64 *)nfacct->data; + now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? 
+ atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); + + ret = now > *quota; + + if (now >= *quota && + !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { + nfnl_overquota_report(nfacct); + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfnl_acct_overquota); + static int __init nfnl_acct_init(void) { int ret; diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index b3be0ef21f19..8c646ed9c921 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -21,11 +21,14 @@ MODULE_ALIAS("ip6t_nfacct"); static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) { + int overquota; const struct xt_nfacct_match_info *info = par->targinfo; nfnl_acct_update(skb, info->nfacct); - return true; + overquota = nfnl_acct_overquota(skb, info->nfacct); + + return overquota == NFACCT_UNDERQUOTA ? false : true; } static int -- cgit v1.2.3-71-gd317 From 9cd4360de6090a6daf7fbe024e34953f2ae60ef2 Mon Sep 17 00:00:00 2001 From: Srikanth Thokala Date: Wed, 23 Apr 2014 20:23:26 +0530 Subject: dma: Add Xilinx AXI Video Direct Memory Access Engine driver support This is the driver for the AXI Video Direct Memory Access (AXI VDMA) core, which is a soft Xilinx IP core that provides high- bandwidth direct memory access between memory and AXI4-Stream type video target peripherals. The core provides efficient two dimensional DMA operations with independent asynchronous read and write channel operation. This module works on Zynq (ARM Based SoC) and Microblaze platforms. Signed-off-by: Srikanth Thokala Acked-by: Jassi Brar Reviewed-by: Levente Kurusa Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 14 + drivers/dma/Makefile | 1 + drivers/dma/xilinx/Makefile | 1 + drivers/dma/xilinx/xilinx_vdma.c | 1379 ++++++++++++++++++++++++++++++++++++++ include/linux/amba/xilinx_dma.h | 47 ++ 5 files changed, 1442 insertions(+) create mode 100644 drivers/dma/xilinx/Makefile create mode 100644 drivers/dma/xilinx/xilinx_vdma.c create mode 100644 include/linux/amba/xilinx_dma.h (limited to 'include/linux') diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 5c5863842de9..b30b7ed89fb2 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -361,6 +361,20 @@ config FSL_EDMA multiplexing capability for DMA request sources(slot). This module can be found on Freescale Vybrid and LS-1 SoCs. +config XILINX_VDMA + tristate "Xilinx AXI VDMA Engine" + depends on (ARCH_ZYNQ || MICROBLAZE) + select DMA_ENGINE + help + Enable support for Xilinx AXI VDMA Soft IP. + + This engine provides high-bandwidth direct memory access + between memory and AXI4-Stream video type target + peripherals including peripherals which support AXI4- + Stream Video Protocol. It has two stream interfaces/ + channels, Memory Mapped to Stream (MM2S) and Stream to + Memory Mapped (S2MM) for the data transfers. 
+ config DMA_ENGINE bool diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 5150c82c9caf..c779e1eb2db2 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_K3_DMA) += k3dma.o obj-$(CONFIG_MOXART_DMA) += moxart-dma.o obj-$(CONFIG_FSL_EDMA) += fsl-edma.o obj-$(CONFIG_QCOM_BAM_DMA) += qcom_bam_dma.o +obj-y += xilinx/ diff --git a/drivers/dma/xilinx/Makefile b/drivers/dma/xilinx/Makefile new file mode 100644 index 000000000000..3c4e9f2fea28 --- /dev/null +++ b/drivers/dma/xilinx/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_XILINX_VDMA) += xilinx_vdma.o diff --git a/drivers/dma/xilinx/xilinx_vdma.c b/drivers/dma/xilinx/xilinx_vdma.c new file mode 100644 index 000000000000..42a13e8d4607 --- /dev/null +++ b/drivers/dma/xilinx/xilinx_vdma.c @@ -0,0 +1,1379 @@ +/* + * DMA driver for Xilinx Video DMA Engine + * + * Copyright (C) 2010-2014 Xilinx, Inc. All rights reserved. + * + * Based on the Freescale DMA driver. + * + * Description: + * The AXI Video Direct Memory Access (AXI VDMA) core is a soft Xilinx IP + * core that provides high-bandwidth direct memory access between memory + * and AXI4-Stream type video target peripherals. The core provides efficient + * two dimensional DMA operations with independent asynchronous read (S2MM) + * and write (MM2S) channel operation. It can be configured to have either + * one channel or two channels. If configured as two channels, one is to + * transmit to the video device (MM2S) and another is to receive from the + * video device (S2MM). Initialization, status, interrupt and management + * registers are accessed through an AXI4-Lite slave interface. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../dmaengine.h" + +/* Register/Descriptor Offsets */ +#define XILINX_VDMA_MM2S_CTRL_OFFSET 0x0000 +#define XILINX_VDMA_S2MM_CTRL_OFFSET 0x0030 +#define XILINX_VDMA_MM2S_DESC_OFFSET 0x0050 +#define XILINX_VDMA_S2MM_DESC_OFFSET 0x00a0 + +/* Control Registers */ +#define XILINX_VDMA_REG_DMACR 0x0000 +#define XILINX_VDMA_DMACR_DELAY_MAX 0xff +#define XILINX_VDMA_DMACR_DELAY_SHIFT 24 +#define XILINX_VDMA_DMACR_FRAME_COUNT_MAX 0xff +#define XILINX_VDMA_DMACR_FRAME_COUNT_SHIFT 16 +#define XILINX_VDMA_DMACR_ERR_IRQ BIT(14) +#define XILINX_VDMA_DMACR_DLY_CNT_IRQ BIT(13) +#define XILINX_VDMA_DMACR_FRM_CNT_IRQ BIT(12) +#define XILINX_VDMA_DMACR_MASTER_SHIFT 8 +#define XILINX_VDMA_DMACR_FSYNCSRC_SHIFT 5 +#define XILINX_VDMA_DMACR_FRAMECNT_EN BIT(4) +#define XILINX_VDMA_DMACR_GENLOCK_EN BIT(3) +#define XILINX_VDMA_DMACR_RESET BIT(2) +#define XILINX_VDMA_DMACR_CIRC_EN BIT(1) +#define XILINX_VDMA_DMACR_RUNSTOP BIT(0) +#define XILINX_VDMA_DMACR_FSYNCSRC_MASK GENMASK(6, 5) + +#define XILINX_VDMA_REG_DMASR 0x0004 +#define XILINX_VDMA_DMASR_EOL_LATE_ERR BIT(15) +#define XILINX_VDMA_DMASR_ERR_IRQ BIT(14) +#define XILINX_VDMA_DMASR_DLY_CNT_IRQ BIT(13) +#define XILINX_VDMA_DMASR_FRM_CNT_IRQ BIT(12) +#define XILINX_VDMA_DMASR_SOF_LATE_ERR BIT(11) +#define XILINX_VDMA_DMASR_SG_DEC_ERR BIT(10) +#define XILINX_VDMA_DMASR_SG_SLV_ERR BIT(9) +#define XILINX_VDMA_DMASR_EOF_EARLY_ERR BIT(8) +#define XILINX_VDMA_DMASR_SOF_EARLY_ERR BIT(7) +#define XILINX_VDMA_DMASR_DMA_DEC_ERR BIT(6) +#define XILINX_VDMA_DMASR_DMA_SLAVE_ERR BIT(5) +#define XILINX_VDMA_DMASR_DMA_INT_ERR BIT(4) +#define XILINX_VDMA_DMASR_IDLE BIT(1) +#define XILINX_VDMA_DMASR_HALTED BIT(0) +#define XILINX_VDMA_DMASR_DELAY_MASK GENMASK(31, 24) +#define XILINX_VDMA_DMASR_FRAME_COUNT_MASK GENMASK(23, 16) + +#define XILINX_VDMA_REG_CURDESC 0x0008 +#define XILINX_VDMA_REG_TAILDESC 0x0010 +#define XILINX_VDMA_REG_REG_INDEX 0x0014 +#define XILINX_VDMA_REG_FRMSTORE 0x0018 +#define XILINX_VDMA_REG_THRESHOLD 0x001c +#define XILINX_VDMA_REG_FRMPTR_STS 0x0024 +#define XILINX_VDMA_REG_PARK_PTR 0x0028 +#define XILINX_VDMA_PARK_PTR_WR_REF_SHIFT 8 +#define XILINX_VDMA_PARK_PTR_RD_REF_SHIFT 0 +#define XILINX_VDMA_REG_VDMA_VERSION 0x002c + +/* Register Direct Mode Registers */ +#define XILINX_VDMA_REG_VSIZE 0x0000 +#define XILINX_VDMA_REG_HSIZE 0x0004 + +#define XILINX_VDMA_REG_FRMDLY_STRIDE 0x0008 +#define XILINX_VDMA_FRMDLY_STRIDE_FRMDLY_SHIFT 24 +#define XILINX_VDMA_FRMDLY_STRIDE_STRIDE_SHIFT 0 + +#define XILINX_VDMA_REG_START_ADDRESS(n) (0x000c + 4 * (n)) + +/* HW specific definitions */ +#define XILINX_VDMA_MAX_CHANS_PER_DEVICE 0x2 + +#define XILINX_VDMA_DMAXR_ALL_IRQ_MASK \ + (XILINX_VDMA_DMASR_FRM_CNT_IRQ | \ + XILINX_VDMA_DMASR_DLY_CNT_IRQ | \ + XILINX_VDMA_DMASR_ERR_IRQ) + +#define XILINX_VDMA_DMASR_ALL_ERR_MASK \ + (XILINX_VDMA_DMASR_EOL_LATE_ERR | \ + XILINX_VDMA_DMASR_SOF_LATE_ERR | \ + XILINX_VDMA_DMASR_SG_DEC_ERR | \ + XILINX_VDMA_DMASR_SG_SLV_ERR | \ + XILINX_VDMA_DMASR_EOF_EARLY_ERR | \ + XILINX_VDMA_DMASR_SOF_EARLY_ERR | \ + XILINX_VDMA_DMASR_DMA_DEC_ERR | \ + XILINX_VDMA_DMASR_DMA_SLAVE_ERR | \ + XILINX_VDMA_DMASR_DMA_INT_ERR) + +/* + * Recoverable errors are DMA Internal error, SOF Early, EOF Early + * and SOF Late. They are only recoverable when C_FLUSH_ON_FSYNC + * is enabled in the h/w system. 
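+ * When flush-on-fsync is disabled, the interrupt handler treats these + * bits like any other error and marks the channel as failed.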
+ */ +#define XILINX_VDMA_DMASR_ERR_RECOVER_MASK \ + (XILINX_VDMA_DMASR_SOF_LATE_ERR | \ + XILINX_VDMA_DMASR_EOF_EARLY_ERR | \ + XILINX_VDMA_DMASR_SOF_EARLY_ERR | \ + XILINX_VDMA_DMASR_DMA_INT_ERR) + +/* Axi VDMA Flush on Fsync bits */ +#define XILINX_VDMA_FLUSH_S2MM 3 +#define XILINX_VDMA_FLUSH_MM2S 2 +#define XILINX_VDMA_FLUSH_BOTH 1 + +/* Delay loop counter to prevent hardware failure */ +#define XILINX_VDMA_LOOP_COUNT 1000000 + +/** + * struct xilinx_vdma_desc_hw - Hardware Descriptor + * @next_desc: Next Descriptor Pointer @0x00 + * @pad1: Reserved @0x04 + * @buf_addr: Buffer address @0x08 + * @pad2: Reserved @0x0C + * @vsize: Vertical Size @0x10 + * @hsize: Horizontal Size @0x14 + * @stride: Number of bytes between the first + * pixels of each horizontal line @0x18 + */ +struct xilinx_vdma_desc_hw { + u32 next_desc; + u32 pad1; + u32 buf_addr; + u32 pad2; + u32 vsize; + u32 hsize; + u32 stride; +} __aligned(64); + +/** + * struct xilinx_vdma_tx_segment - Descriptor segment + * @hw: Hardware descriptor + * @node: Node in the descriptor segments list + * @phys: Physical address of segment + */ +struct xilinx_vdma_tx_segment { + struct xilinx_vdma_desc_hw hw; + struct list_head node; + dma_addr_t phys; +} __aligned(64); + +/** + * struct xilinx_vdma_tx_descriptor - Per Transaction structure + * @async_tx: Async transaction descriptor + * @segments: TX segments list + * @node: Node in the channel descriptors list + */ +struct xilinx_vdma_tx_descriptor { + struct dma_async_tx_descriptor async_tx; + struct list_head segments; + struct list_head node; +}; + +/** + * struct xilinx_vdma_chan - Driver specific VDMA channel structure + * @xdev: Driver specific device structure + * @ctrl_offset: Control registers offset + * @desc_offset: TX descriptor registers offset + * @lock: Descriptor operation lock + * @pending_list: Descriptors waiting + * @active_desc: Active descriptor + * @allocated_desc: Allocated descriptor + * @done_list: Complete descriptors + * @common: DMA common channel + * @desc_pool: Descriptors pool + * @dev: The dma device + * @irq: Channel IRQ + * @id: Channel ID + * @direction: Transfer direction + * @num_frms: Number of frames + * @has_sg: Support scatter transfers + * @genlock: Support genlock mode + * @err: Channel has errors + * @tasklet: Cleanup work after irq + * @config: Device configuration info + * @flush_on_fsync: Flush on Frame sync + */ +struct xilinx_vdma_chan { + struct xilinx_vdma_device *xdev; + u32 ctrl_offset; + u32 desc_offset; + spinlock_t lock; + struct list_head pending_list; + struct xilinx_vdma_tx_descriptor *active_desc; + struct xilinx_vdma_tx_descriptor *allocated_desc; + struct list_head done_list; + struct dma_chan common; + struct dma_pool *desc_pool; + struct device *dev; + int irq; + int id; + enum dma_transfer_direction direction; + int num_frms; + bool has_sg; + bool genlock; + bool err; + struct tasklet_struct tasklet; + struct xilinx_vdma_config config; + bool flush_on_fsync; +}; + +/** + * struct xilinx_vdma_device - VDMA device structure + * @regs: I/O mapped base address + * @dev: Device Structure + * @common: DMA device structure + * @chan: Driver specific VDMA channel + * @has_sg: Specifies whether Scatter-Gather is present or not + * @flush_on_fsync: Flush on frame sync + */ +struct xilinx_vdma_device { + void __iomem *regs; + struct device *dev; + struct dma_device common; + struct xilinx_vdma_chan *chan[XILINX_VDMA_MAX_CHANS_PER_DEVICE]; + bool has_sg; + u32 flush_on_fsync; +}; + +/* Macros */ +#define to_xilinx_chan(chan) \ + 
container_of(chan, struct xilinx_vdma_chan, common) +#define to_vdma_tx_descriptor(tx) \ + container_of(tx, struct xilinx_vdma_tx_descriptor, async_tx) + +/* IO accessors */ +static inline u32 vdma_read(struct xilinx_vdma_chan *chan, u32 reg) +{ + return ioread32(chan->xdev->regs + reg); +} + +static inline void vdma_write(struct xilinx_vdma_chan *chan, u32 reg, u32 value) +{ + iowrite32(value, chan->xdev->regs + reg); +} + +static inline void vdma_desc_write(struct xilinx_vdma_chan *chan, u32 reg, + u32 value) +{ + vdma_write(chan, chan->desc_offset + reg, value); +} + +static inline u32 vdma_ctrl_read(struct xilinx_vdma_chan *chan, u32 reg) +{ + return vdma_read(chan, chan->ctrl_offset + reg); +} + +static inline void vdma_ctrl_write(struct xilinx_vdma_chan *chan, u32 reg, + u32 value) +{ + vdma_write(chan, chan->ctrl_offset + reg, value); +} + +static inline void vdma_ctrl_clr(struct xilinx_vdma_chan *chan, u32 reg, + u32 clr) +{ + vdma_ctrl_write(chan, reg, vdma_ctrl_read(chan, reg) & ~clr); +} + +static inline void vdma_ctrl_set(struct xilinx_vdma_chan *chan, u32 reg, + u32 set) +{ + vdma_ctrl_write(chan, reg, vdma_ctrl_read(chan, reg) | set); +} + +/* ----------------------------------------------------------------------------- + * Descriptors and segments alloc and free + */ + +/** + * xilinx_vdma_alloc_tx_segment - Allocate transaction segment + * @chan: Driver specific VDMA channel + * + * Return: The allocated segment on success and NULL on failure. + */ +static struct xilinx_vdma_tx_segment * +xilinx_vdma_alloc_tx_segment(struct xilinx_vdma_chan *chan) +{ + struct xilinx_vdma_tx_segment *segment; + dma_addr_t phys; + + segment = dma_pool_alloc(chan->desc_pool, GFP_ATOMIC, &phys); + if (!segment) + return NULL; + + memset(segment, 0, sizeof(*segment)); + segment->phys = phys; + + return segment; +} + +/** + * xilinx_vdma_free_tx_segment - Free transaction segment + * @chan: Driver specific VDMA channel + * @segment: VDMA transaction segment + */ +static void xilinx_vdma_free_tx_segment(struct xilinx_vdma_chan *chan, + struct xilinx_vdma_tx_segment *segment) +{ + dma_pool_free(chan->desc_pool, segment, segment->phys); +} + +/** + * xilinx_vdma_tx_descriptor - Allocate transaction descriptor + * @chan: Driver specific VDMA channel + * + * Return: The allocated descriptor on success and NULL on failure. 
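+ * + * Note that only one descriptor is in preparation at any time: if one + * has been allocated but not yet submitted, the cached allocated_desc + * is returned instead of a fresh allocation.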
+ */ +static struct xilinx_vdma_tx_descriptor * +xilinx_vdma_alloc_tx_descriptor(struct xilinx_vdma_chan *chan) +{ + struct xilinx_vdma_tx_descriptor *desc; + unsigned long flags; + + if (chan->allocated_desc) + return chan->allocated_desc; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return NULL; + + spin_lock_irqsave(&chan->lock, flags); + chan->allocated_desc = desc; + spin_unlock_irqrestore(&chan->lock, flags); + + INIT_LIST_HEAD(&desc->segments); + + return desc; +} + +/** + * xilinx_vdma_free_tx_descriptor - Free transaction descriptor + * @chan: Driver specific VDMA channel + * @desc: VDMA transaction descriptor + */ +static void +xilinx_vdma_free_tx_descriptor(struct xilinx_vdma_chan *chan, + struct xilinx_vdma_tx_descriptor *desc) +{ + struct xilinx_vdma_tx_segment *segment, *next; + + if (!desc) + return; + + list_for_each_entry_safe(segment, next, &desc->segments, node) { + list_del(&segment->node); + xilinx_vdma_free_tx_segment(chan, segment); + } + + kfree(desc); +} + +/* Required functions */ + +/** + * xilinx_vdma_free_desc_list - Free descriptors list + * @chan: Driver specific VDMA channel + * @list: List to parse and delete the descriptor + */ +static void xilinx_vdma_free_desc_list(struct xilinx_vdma_chan *chan, + struct list_head *list) +{ + struct xilinx_vdma_tx_descriptor *desc, *next; + + list_for_each_entry_safe(desc, next, list, node) { + list_del(&desc->node); + xilinx_vdma_free_tx_descriptor(chan, desc); + } +} + +/** + * xilinx_vdma_free_descriptors - Free channel descriptors + * @chan: Driver specific VDMA channel + */ +static void xilinx_vdma_free_descriptors(struct xilinx_vdma_chan *chan) +{ + unsigned long flags; + + spin_lock_irqsave(&chan->lock, flags); + + xilinx_vdma_free_desc_list(chan, &chan->pending_list); + xilinx_vdma_free_desc_list(chan, &chan->done_list); + + xilinx_vdma_free_tx_descriptor(chan, chan->active_desc); + chan->active_desc = NULL; + + spin_unlock_irqrestore(&chan->lock, flags); +} + +/** + * xilinx_vdma_free_chan_resources - Free channel resources + * @dchan: DMA channel + */ +static void xilinx_vdma_free_chan_resources(struct dma_chan *dchan) +{ + struct xilinx_vdma_chan *chan = to_xilinx_chan(dchan); + + dev_dbg(chan->dev, "Free all channel resources.\n"); + + xilinx_vdma_free_descriptors(chan); + dma_pool_destroy(chan->desc_pool); + chan->desc_pool = NULL; +} + +/** + * xilinx_vdma_chan_desc_cleanup - Clean channel descriptors + * @chan: Driver specific VDMA channel + */ +static void xilinx_vdma_chan_desc_cleanup(struct xilinx_vdma_chan *chan) +{ + struct xilinx_vdma_tx_descriptor *desc, *next; + unsigned long flags; + + spin_lock_irqsave(&chan->lock, flags); + + list_for_each_entry_safe(desc, next, &chan->done_list, node) { + dma_async_tx_callback callback; + void *callback_param; + + /* Remove from the list of running transactions */ + list_del(&desc->node); + + /* Run the link descriptor callback function */ + callback = desc->async_tx.callback; + callback_param = desc->async_tx.callback_param; + if (callback) { + spin_unlock_irqrestore(&chan->lock, flags); + callback(callback_param); + spin_lock_irqsave(&chan->lock, flags); + } + + /* Run any dependencies, then free the descriptor */ + dma_run_dependencies(&desc->async_tx); + xilinx_vdma_free_tx_descriptor(chan, desc); + } + + spin_unlock_irqrestore(&chan->lock, flags); +} + +/** + * xilinx_vdma_do_tasklet - Schedule completion tasklet + * @data: Pointer to the Xilinx VDMA channel structure + */ +static void xilinx_vdma_do_tasklet(unsigned long data) +{ + struct 
xilinx_vdma_chan *chan = (struct xilinx_vdma_chan *)data; + + xilinx_vdma_chan_desc_cleanup(chan); +} + +/** + * xilinx_vdma_alloc_chan_resources - Allocate channel resources + * @dchan: DMA channel + * + * Return: '0' on success and failure value on error + */ +static int xilinx_vdma_alloc_chan_resources(struct dma_chan *dchan) +{ + struct xilinx_vdma_chan *chan = to_xilinx_chan(dchan); + + /* Has this channel already been allocated? */ + if (chan->desc_pool) + return 0; + + /* + * We need the descriptor to be aligned to 64bytes + * for meeting Xilinx VDMA specification requirement. + */ + chan->desc_pool = dma_pool_create("xilinx_vdma_desc_pool", + chan->dev, + sizeof(struct xilinx_vdma_tx_segment), + __alignof__(struct xilinx_vdma_tx_segment), 0); + if (!chan->desc_pool) { + dev_err(chan->dev, + "unable to allocate channel %d descriptor pool\n", + chan->id); + return -ENOMEM; + } + + dma_cookie_init(dchan); + return 0; +} + +/** + * xilinx_vdma_tx_status - Get VDMA transaction status + * @dchan: DMA channel + * @cookie: Transaction identifier + * @txstate: Transaction state + * + * Return: DMA transaction status + */ +static enum dma_status xilinx_vdma_tx_status(struct dma_chan *dchan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + return dma_cookie_status(dchan, cookie, txstate); +} + +/** + * xilinx_vdma_is_running - Check if VDMA channel is running + * @chan: Driver specific VDMA channel + * + * Return: '1' if running, '0' if not. + */ +static bool xilinx_vdma_is_running(struct xilinx_vdma_chan *chan) +{ + return !(vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR) & + XILINX_VDMA_DMASR_HALTED) && + (vdma_ctrl_read(chan, XILINX_VDMA_REG_DMACR) & + XILINX_VDMA_DMACR_RUNSTOP); +} + +/** + * xilinx_vdma_is_idle - Check if VDMA channel is idle + * @chan: Driver specific VDMA channel + * + * Return: '1' if idle, '0' if not. 
+ */ +static bool xilinx_vdma_is_idle(struct xilinx_vdma_chan *chan) +{ + return vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR) & + XILINX_VDMA_DMASR_IDLE; +} + +/** + * xilinx_vdma_halt - Halt VDMA channel + * @chan: Driver specific VDMA channel + */ +static void xilinx_vdma_halt(struct xilinx_vdma_chan *chan) +{ + int loop = XILINX_VDMA_LOOP_COUNT; + + vdma_ctrl_clr(chan, XILINX_VDMA_REG_DMACR, XILINX_VDMA_DMACR_RUNSTOP); + + /* Wait for the hardware to halt */ + do { + if (vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR) & + XILINX_VDMA_DMASR_HALTED) + break; + } while (loop--); + + if (!loop) { + dev_err(chan->dev, "Cannot stop channel %p: %x\n", + chan, vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR)); + chan->err = true; + } + + return; +} + +/** + * xilinx_vdma_start - Start VDMA channel + * @chan: Driver specific VDMA channel + */ +static void xilinx_vdma_start(struct xilinx_vdma_chan *chan) +{ + int loop = XILINX_VDMA_LOOP_COUNT; + + vdma_ctrl_set(chan, XILINX_VDMA_REG_DMACR, XILINX_VDMA_DMACR_RUNSTOP); + + /* Wait for the hardware to start */ + do { + if (!(vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR) & + XILINX_VDMA_DMASR_HALTED)) + break; + } while (loop--); + + if (!loop) { + dev_err(chan->dev, "Cannot start channel %p: %x\n", + chan, vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR)); + + chan->err = true; + } + + return; +} + +/** + * xilinx_vdma_start_transfer - Starts VDMA transfer + * @chan: Driver specific channel struct pointer + */ +static void xilinx_vdma_start_transfer(struct xilinx_vdma_chan *chan) +{ + struct xilinx_vdma_config *config = &chan->config; + struct xilinx_vdma_tx_descriptor *desc; + unsigned long flags; + u32 reg; + struct xilinx_vdma_tx_segment *head, *tail = NULL; + + if (chan->err) + return; + + spin_lock_irqsave(&chan->lock, flags); + + /* There's already an active descriptor, bail out. */ + if (chan->active_desc) + goto out_unlock; + + if (list_empty(&chan->pending_list)) + goto out_unlock; + + desc = list_first_entry(&chan->pending_list, + struct xilinx_vdma_tx_descriptor, node); + + /* If it is SG mode and hardware is busy, cannot submit */ + if (chan->has_sg && xilinx_vdma_is_running(chan) && + !xilinx_vdma_is_idle(chan)) { + dev_dbg(chan->dev, "DMA controller still busy\n"); + goto out_unlock; + } + + /* + * If hardware is idle, then all descriptors on the running lists are + * done, start new transfers + */ + if (chan->has_sg) { + head = list_first_entry(&desc->segments, + struct xilinx_vdma_tx_segment, node); + tail = list_entry(desc->segments.prev, + struct xilinx_vdma_tx_segment, node); + + vdma_ctrl_write(chan, XILINX_VDMA_REG_CURDESC, head->phys); + } + + /* Configure the hardware using info in the config structure */ + reg = vdma_ctrl_read(chan, XILINX_VDMA_REG_DMACR); + + if (config->frm_cnt_en) + reg |= XILINX_VDMA_DMACR_FRAMECNT_EN; + else + reg &= ~XILINX_VDMA_DMACR_FRAMECNT_EN; + + /* + * With SG, start with circular mode, so that BDs can be fetched. 
+ * In direct register mode, if not parking, enable circular mode + */ + if (chan->has_sg || !config->park) + reg |= XILINX_VDMA_DMACR_CIRC_EN; + + if (config->park) + reg &= ~XILINX_VDMA_DMACR_CIRC_EN; + + vdma_ctrl_write(chan, XILINX_VDMA_REG_DMACR, reg); + + if (config->park && (config->park_frm >= 0) && + (config->park_frm < chan->num_frms)) { + if (chan->direction == DMA_MEM_TO_DEV) + vdma_write(chan, XILINX_VDMA_REG_PARK_PTR, + config->park_frm << + XILINX_VDMA_PARK_PTR_RD_REF_SHIFT); + else + vdma_write(chan, XILINX_VDMA_REG_PARK_PTR, + config->park_frm << + XILINX_VDMA_PARK_PTR_WR_REF_SHIFT); + } + + /* Start the hardware */ + xilinx_vdma_start(chan); + + if (chan->err) + goto out_unlock; + + /* Start the transfer */ + if (chan->has_sg) { + vdma_ctrl_write(chan, XILINX_VDMA_REG_TAILDESC, tail->phys); + } else { + struct xilinx_vdma_tx_segment *segment, *last = NULL; + int i = 0; + + list_for_each_entry(segment, &desc->segments, node) { + vdma_desc_write(chan, + XILINX_VDMA_REG_START_ADDRESS(i++), + segment->hw.buf_addr); + last = segment; + } + + if (!last) + goto out_unlock; + + /* HW expects these parameters to be same for one transaction */ + vdma_desc_write(chan, XILINX_VDMA_REG_HSIZE, last->hw.hsize); + vdma_desc_write(chan, XILINX_VDMA_REG_FRMDLY_STRIDE, + last->hw.stride); + vdma_desc_write(chan, XILINX_VDMA_REG_VSIZE, last->hw.vsize); + } + + list_del(&desc->node); + chan->active_desc = desc; + +out_unlock: + spin_unlock_irqrestore(&chan->lock, flags); +} + +/** + * xilinx_vdma_issue_pending - Issue pending transactions + * @dchan: DMA channel + */ +static void xilinx_vdma_issue_pending(struct dma_chan *dchan) +{ + struct xilinx_vdma_chan *chan = to_xilinx_chan(dchan); + + xilinx_vdma_start_transfer(chan); +} + +/** + * xilinx_vdma_complete_descriptor - Mark the active descriptor as complete + * @chan : xilinx DMA channel + * + * CONTEXT: hardirq + */ +static void xilinx_vdma_complete_descriptor(struct xilinx_vdma_chan *chan) +{ + struct xilinx_vdma_tx_descriptor *desc; + unsigned long flags; + + spin_lock_irqsave(&chan->lock, flags); + + desc = chan->active_desc; + if (!desc) { + dev_dbg(chan->dev, "no running descriptors\n"); + goto out_unlock; + } + + dma_cookie_complete(&desc->async_tx); + list_add_tail(&desc->node, &chan->done_list); + + chan->active_desc = NULL; + +out_unlock: + spin_unlock_irqrestore(&chan->lock, flags); +} + +/** + * xilinx_vdma_reset - Reset VDMA channel + * @chan: Driver specific VDMA channel + * + * Return: '0' on success and failure value on error + */ +static int xilinx_vdma_reset(struct xilinx_vdma_chan *chan) +{ + int loop = XILINX_VDMA_LOOP_COUNT; + u32 tmp; + + vdma_ctrl_set(chan, XILINX_VDMA_REG_DMACR, XILINX_VDMA_DMACR_RESET); + + tmp = vdma_ctrl_read(chan, XILINX_VDMA_REG_DMACR) & + XILINX_VDMA_DMACR_RESET; + + /* Wait for the hardware to finish reset */ + do { + tmp = vdma_ctrl_read(chan, XILINX_VDMA_REG_DMACR) & + XILINX_VDMA_DMACR_RESET; + } while (loop-- && tmp); + + if (!loop) { + dev_err(chan->dev, "reset timeout, cr %x, sr %x\n", + vdma_ctrl_read(chan, XILINX_VDMA_REG_DMACR), + vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR)); + return -ETIMEDOUT; + } + + chan->err = false; + + return 0; +} + +/** + * xilinx_vdma_chan_reset - Reset VDMA channel and enable interrupts + * @chan: Driver specific VDMA channel + * + * Return: '0' on success and failure value on error + */ +static int xilinx_vdma_chan_reset(struct xilinx_vdma_chan *chan) +{ + int err; + + /* Reset VDMA */ + err = xilinx_vdma_reset(chan); + if (err) + return err; + + /* 
Enable interrupts */ + vdma_ctrl_set(chan, XILINX_VDMA_REG_DMACR, + XILINX_VDMA_DMAXR_ALL_IRQ_MASK); + + return 0; +} + +/** + * xilinx_vdma_irq_handler - VDMA Interrupt handler + * @irq: IRQ number + * @data: Pointer to the Xilinx VDMA channel structure + * + * Return: IRQ_HANDLED/IRQ_NONE + */ +static irqreturn_t xilinx_vdma_irq_handler(int irq, void *data) +{ + struct xilinx_vdma_chan *chan = data; + u32 status; + + /* Read the status and ack the interrupts. */ + status = vdma_ctrl_read(chan, XILINX_VDMA_REG_DMASR); + if (!(status & XILINX_VDMA_DMAXR_ALL_IRQ_MASK)) + return IRQ_NONE; + + vdma_ctrl_write(chan, XILINX_VDMA_REG_DMASR, + status & XILINX_VDMA_DMAXR_ALL_IRQ_MASK); + + if (status & XILINX_VDMA_DMASR_ERR_IRQ) { + /* + * An error occurred. If C_FLUSH_ON_FSYNC is enabled and the + * error is recoverable, ignore it. Otherwise flag the error. + * + * Only recoverable errors can be cleared in the DMASR register, + * make sure not to write to other error bits to 1. + */ + u32 errors = status & XILINX_VDMA_DMASR_ALL_ERR_MASK; + vdma_ctrl_write(chan, XILINX_VDMA_REG_DMASR, + errors & XILINX_VDMA_DMASR_ERR_RECOVER_MASK); + + if (!chan->flush_on_fsync || + (errors & ~XILINX_VDMA_DMASR_ERR_RECOVER_MASK)) { + dev_err(chan->dev, + "Channel %p has errors %x, cdr %x tdr %x\n", + chan, errors, + vdma_ctrl_read(chan, XILINX_VDMA_REG_CURDESC), + vdma_ctrl_read(chan, XILINX_VDMA_REG_TAILDESC)); + chan->err = true; + } + } + + if (status & XILINX_VDMA_DMASR_DLY_CNT_IRQ) { + /* + * Device takes too long to do the transfer when user requires + * responsiveness. + */ + dev_dbg(chan->dev, "Inter-packet latency too long\n"); + } + + if (status & XILINX_VDMA_DMASR_FRM_CNT_IRQ) { + xilinx_vdma_complete_descriptor(chan); + xilinx_vdma_start_transfer(chan); + } + + tasklet_schedule(&chan->tasklet); + return IRQ_HANDLED; +} + +/** + * xilinx_vdma_tx_submit - Submit DMA transaction + * @tx: Async transaction descriptor + * + * Return: cookie value on success and failure value on error + */ +static dma_cookie_t xilinx_vdma_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct xilinx_vdma_tx_descriptor *desc = to_vdma_tx_descriptor(tx); + struct xilinx_vdma_chan *chan = to_xilinx_chan(tx->chan); + dma_cookie_t cookie; + unsigned long flags; + int err; + + if (chan->err) { + /* + * If reset fails, need to hard reset the system. + * Channel is no longer functional + */ + err = xilinx_vdma_chan_reset(chan); + if (err < 0) + return err; + } + + spin_lock_irqsave(&chan->lock, flags); + + cookie = dma_cookie_assign(tx); + + /* Append the transaction to the pending transactions queue. 
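+ * (the transfer itself is only started later, from + * xilinx_vdma_issue_pending() -> xilinx_vdma_start_transfer())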
*/ + list_add_tail(&desc->node, &chan->pending_list); + + /* Free the allocated desc */ + chan->allocated_desc = NULL; + + spin_unlock_irqrestore(&chan->lock, flags); + + return cookie; +} + +/** + * xilinx_vdma_dma_prep_interleaved - prepare a descriptor for a + * DMA_SLAVE transaction + * @dchan: DMA channel + * @xt: Interleaved template pointer + * @flags: transfer ack flags + * + * Return: Async transaction descriptor on success and NULL on failure + */ +static struct dma_async_tx_descriptor * +xilinx_vdma_dma_prep_interleaved(struct dma_chan *dchan, + struct dma_interleaved_template *xt, + unsigned long flags) +{ + struct xilinx_vdma_chan *chan = to_xilinx_chan(dchan); + struct xilinx_vdma_tx_descriptor *desc; + struct xilinx_vdma_tx_segment *segment, *prev = NULL; + struct xilinx_vdma_desc_hw *hw; + + if (!is_slave_direction(xt->dir)) + return NULL; + + if (!xt->numf || !xt->sgl[0].size) + return NULL; + + /* Allocate a transaction descriptor. */ + desc = xilinx_vdma_alloc_tx_descriptor(chan); + if (!desc) + return NULL; + + dma_async_tx_descriptor_init(&desc->async_tx, &chan->common); + desc->async_tx.tx_submit = xilinx_vdma_tx_submit; + async_tx_ack(&desc->async_tx); + + /* Allocate the link descriptor from DMA pool */ + segment = xilinx_vdma_alloc_tx_segment(chan); + if (!segment) + goto error; + + /* Fill in the hardware descriptor */ + hw = &segment->hw; + hw->vsize = xt->numf; + hw->hsize = xt->sgl[0].size; + hw->stride = xt->sgl[0].icg << + XILINX_VDMA_FRMDLY_STRIDE_STRIDE_SHIFT; + hw->stride |= chan->config.frm_dly << + XILINX_VDMA_FRMDLY_STRIDE_FRMDLY_SHIFT; + + if (xt->dir != DMA_MEM_TO_DEV) + hw->buf_addr = xt->dst_start; + else + hw->buf_addr = xt->src_start; + + /* Link the previous next descriptor to current */ + prev = list_last_entry(&desc->segments, + struct xilinx_vdma_tx_segment, node); + prev->hw.next_desc = segment->phys; + + /* Insert the segment into the descriptor segments list. */ + list_add_tail(&segment->node, &desc->segments); + + prev = segment; + + /* Link the last hardware descriptor with the first. */ + segment = list_first_entry(&desc->segments, + struct xilinx_vdma_tx_segment, node); + prev->hw.next_desc = segment->phys; + + return &desc->async_tx; + +error: + xilinx_vdma_free_tx_descriptor(chan, desc); + return NULL; +} + +/** + * xilinx_vdma_terminate_all - Halt the channel and free descriptors + * @chan: Driver specific VDMA Channel pointer + */ +static void xilinx_vdma_terminate_all(struct xilinx_vdma_chan *chan) +{ + /* Halt the DMA engine */ + xilinx_vdma_halt(chan); + + /* Remove and free all of the descriptors in the lists */ + xilinx_vdma_free_descriptors(chan); +} + +/** + * xilinx_vdma_channel_set_config - Configure VDMA channel + * Run-time configuration for Axi VDMA, supports: + * . halt the channel + * . configure interrupt coalescing and inter-packet delay threshold + * . start/stop parking + * . 
enable genlock + * + * @dchan: DMA channel + * @cfg: VDMA device configuration pointer + * + * Return: '0' on success and failure value on error + */ +int xilinx_vdma_channel_set_config(struct dma_chan *dchan, + struct xilinx_vdma_config *cfg) +{ + struct xilinx_vdma_chan *chan = to_xilinx_chan(dchan); + u32 dmacr; + + if (cfg->reset) + return xilinx_vdma_chan_reset(chan); + + dmacr = vdma_ctrl_read(chan, XILINX_VDMA_REG_DMACR); + + chan->config.frm_dly = cfg->frm_dly; + chan->config.park = cfg->park; + + /* genlock settings */ + chan->config.gen_lock = cfg->gen_lock; + chan->config.master = cfg->master; + + if (cfg->gen_lock && chan->genlock) { + dmacr |= XILINX_VDMA_DMACR_GENLOCK_EN; + dmacr |= cfg->master << XILINX_VDMA_DMACR_MASTER_SHIFT; + } + + chan->config.frm_cnt_en = cfg->frm_cnt_en; + if (cfg->park) + chan->config.park_frm = cfg->park_frm; + else + chan->config.park_frm = -1; + + chan->config.coalesc = cfg->coalesc; + chan->config.delay = cfg->delay; + + if (cfg->coalesc <= XILINX_VDMA_DMACR_FRAME_COUNT_MAX) { + dmacr |= cfg->coalesc << XILINX_VDMA_DMACR_FRAME_COUNT_SHIFT; + chan->config.coalesc = cfg->coalesc; + } + + if (cfg->delay <= XILINX_VDMA_DMACR_DELAY_MAX) { + dmacr |= cfg->delay << XILINX_VDMA_DMACR_DELAY_SHIFT; + chan->config.delay = cfg->delay; + } + + /* FSync Source selection */ + dmacr &= ~XILINX_VDMA_DMACR_FSYNCSRC_MASK; + dmacr |= cfg->ext_fsync << XILINX_VDMA_DMACR_FSYNCSRC_SHIFT; + + vdma_ctrl_write(chan, XILINX_VDMA_REG_DMACR, dmacr); + + return 0; +} +EXPORT_SYMBOL(xilinx_vdma_channel_set_config); + +/** + * xilinx_vdma_device_control - Configure DMA channel of the device + * @dchan: DMA Channel pointer + * @cmd: DMA control command + * @arg: Channel configuration + * + * Return: '0' on success and failure value on error + */ +static int xilinx_vdma_device_control(struct dma_chan *dchan, + enum dma_ctrl_cmd cmd, unsigned long arg) +{ + struct xilinx_vdma_chan *chan = to_xilinx_chan(dchan); + + if (cmd != DMA_TERMINATE_ALL) + return -ENXIO; + + xilinx_vdma_terminate_all(chan); + + return 0; +} + +/* ----------------------------------------------------------------------------- + * Probe and remove + */ + +/** + * xilinx_vdma_chan_remove - Per Channel remove function + * @chan: Driver specific VDMA channel + */ +static void xilinx_vdma_chan_remove(struct xilinx_vdma_chan *chan) +{ + /* Disable all interrupts */ + vdma_ctrl_clr(chan, XILINX_VDMA_REG_DMACR, + XILINX_VDMA_DMAXR_ALL_IRQ_MASK); + + if (chan->irq > 0) + free_irq(chan->irq, chan); + + tasklet_kill(&chan->tasklet); + + list_del(&chan->common.device_node); +} + +/** + * xilinx_vdma_chan_probe - Per Channel Probing + * It get channel features from the device tree entry and + * initialize special channel handling routines + * + * @xdev: Driver specific device structure + * @node: Device node + * + * Return: '0' on success and failure value on error + */ +static int xilinx_vdma_chan_probe(struct xilinx_vdma_device *xdev, + struct device_node *node) +{ + struct xilinx_vdma_chan *chan; + bool has_dre = false; + u32 value, width; + int err; + + /* Allocate and initialize the channel structure */ + chan = devm_kzalloc(xdev->dev, sizeof(*chan), GFP_KERNEL); + if (!chan) + return -ENOMEM; + + chan->dev = xdev->dev; + chan->xdev = xdev; + chan->has_sg = xdev->has_sg; + + spin_lock_init(&chan->lock); + INIT_LIST_HEAD(&chan->pending_list); + INIT_LIST_HEAD(&chan->done_list); + + /* Retrieve the channel properties from the device tree */ + has_dre = of_property_read_bool(node, "xlnx,include-dre"); + + chan->genlock = 
of_property_read_bool(node, "xlnx,genlock-mode"); + + err = of_property_read_u32(node, "xlnx,datawidth", &value); + if (err) { + dev_err(xdev->dev, "missing xlnx,datawidth property\n"); + return err; + } + width = value >> 3; /* Convert bits to bytes */ + + /* If data width is greater than 8 bytes, DRE is not in hw */ + if (width > 8) + has_dre = false; + + if (!has_dre) + xdev->common.copy_align = fls(width - 1); + + if (of_device_is_compatible(node, "xlnx,axi-vdma-mm2s-channel")) { + chan->direction = DMA_MEM_TO_DEV; + chan->id = 0; + + chan->ctrl_offset = XILINX_VDMA_MM2S_CTRL_OFFSET; + chan->desc_offset = XILINX_VDMA_MM2S_DESC_OFFSET; + + if (xdev->flush_on_fsync == XILINX_VDMA_FLUSH_BOTH || + xdev->flush_on_fsync == XILINX_VDMA_FLUSH_MM2S) + chan->flush_on_fsync = true; + } else if (of_device_is_compatible(node, + "xlnx,axi-vdma-s2mm-channel")) { + chan->direction = DMA_DEV_TO_MEM; + chan->id = 1; + + chan->ctrl_offset = XILINX_VDMA_S2MM_CTRL_OFFSET; + chan->desc_offset = XILINX_VDMA_S2MM_DESC_OFFSET; + + if (xdev->flush_on_fsync == XILINX_VDMA_FLUSH_BOTH || + xdev->flush_on_fsync == XILINX_VDMA_FLUSH_S2MM) + chan->flush_on_fsync = true; + } else { + dev_err(xdev->dev, "Invalid channel compatible node\n"); + return -EINVAL; + } + + /* Request the interrupt */ + chan->irq = irq_of_parse_and_map(node, 0); + err = request_irq(chan->irq, xilinx_vdma_irq_handler, IRQF_SHARED, + "xilinx-vdma-controller", chan); + if (err) { + dev_err(xdev->dev, "unable to request IRQ %d\n", chan->irq); + return err; + } + + /* Initialize the tasklet */ + tasklet_init(&chan->tasklet, xilinx_vdma_do_tasklet, + (unsigned long)chan); + + /* + * Initialize the DMA channel and add it to the DMA engine channels + * list. + */ + chan->common.device = &xdev->common; + + list_add_tail(&chan->common.device_node, &xdev->common.channels); + xdev->chan[chan->id] = chan; + + /* Reset the channel */ + err = xilinx_vdma_chan_reset(chan); + if (err < 0) { + dev_err(xdev->dev, "Reset channel failed\n"); + return err; + } + + return 0; +} + +/** + * of_dma_xilinx_xlate - Translation function + * @dma_spec: Pointer to DMA specifier as found in the device tree + * @ofdma: Pointer to DMA controller data + * + * Return: DMA channel pointer on success and NULL on error + */ +static struct dma_chan *of_dma_xilinx_xlate(struct of_phandle_args *dma_spec, + struct of_dma *ofdma) +{ + struct xilinx_vdma_device *xdev = ofdma->of_dma_data; + int chan_id = dma_spec->args[0]; + + if (chan_id >= XILINX_VDMA_MAX_CHANS_PER_DEVICE) + return NULL; + + return dma_get_slave_channel(&xdev->chan[chan_id]->common); +} + +/** + * xilinx_vdma_probe - Driver probe function + * @pdev: Pointer to the platform_device structure + * + * Return: '0' on success and failure value on error + */ +static int xilinx_vdma_probe(struct platform_device *pdev) +{ + struct device_node *node = pdev->dev.of_node; + struct xilinx_vdma_device *xdev; + struct device_node *child; + struct resource *io; + u32 num_frames; + int i, err; + + /* Allocate and initialize the DMA engine structure */ + xdev = devm_kzalloc(&pdev->dev, sizeof(*xdev), GFP_KERNEL); + if (!xdev) + return -ENOMEM; + + xdev->dev = &pdev->dev; + + /* Request and map I/O memory */ + io = platform_get_resource(pdev, IORESOURCE_MEM, 0); + xdev->regs = devm_ioremap_resource(&pdev->dev, io); + if (IS_ERR(xdev->regs)) + return PTR_ERR(xdev->regs); + + /* Retrieve the DMA engine properties from the device tree */ + xdev->has_sg = of_property_read_bool(node, "xlnx,include-sg"); + + err = of_property_read_u32(node, 
"xlnx,num-fstores", &num_frames); + if (err < 0) { + dev_err(xdev->dev, "missing xlnx,num-fstores property\n"); + return err; + } + + err = of_property_read_u32(node, "xlnx,flush-fsync", + &xdev->flush_on_fsync); + if (err < 0) + dev_warn(xdev->dev, "missing xlnx,flush-fsync property\n"); + + /* Initialize the DMA engine */ + xdev->common.dev = &pdev->dev; + + INIT_LIST_HEAD(&xdev->common.channels); + dma_cap_set(DMA_SLAVE, xdev->common.cap_mask); + dma_cap_set(DMA_PRIVATE, xdev->common.cap_mask); + + xdev->common.device_alloc_chan_resources = + xilinx_vdma_alloc_chan_resources; + xdev->common.device_free_chan_resources = + xilinx_vdma_free_chan_resources; + xdev->common.device_prep_interleaved_dma = + xilinx_vdma_dma_prep_interleaved; + xdev->common.device_control = xilinx_vdma_device_control; + xdev->common.device_tx_status = xilinx_vdma_tx_status; + xdev->common.device_issue_pending = xilinx_vdma_issue_pending; + + platform_set_drvdata(pdev, xdev); + + /* Initialize the channels */ + for_each_child_of_node(node, child) { + err = xilinx_vdma_chan_probe(xdev, child); + if (err < 0) + goto error; + } + + for (i = 0; i < XILINX_VDMA_MAX_CHANS_PER_DEVICE; i++) + if (xdev->chan[i]) + xdev->chan[i]->num_frms = num_frames; + + /* Register the DMA engine with the core */ + dma_async_device_register(&xdev->common); + + err = of_dma_controller_register(node, of_dma_xilinx_xlate, + xdev); + if (err < 0) { + dev_err(&pdev->dev, "Unable to register DMA to DT\n"); + dma_async_device_unregister(&xdev->common); + goto error; + } + + dev_info(&pdev->dev, "Xilinx AXI VDMA Engine Driver Probed!!\n"); + + return 0; + +error: + for (i = 0; i < XILINX_VDMA_MAX_CHANS_PER_DEVICE; i++) + if (xdev->chan[i]) + xilinx_vdma_chan_remove(xdev->chan[i]); + + return err; +} + +/** + * xilinx_vdma_remove - Driver remove function + * @pdev: Pointer to the platform_device structure + * + * Return: Always '0' + */ +static int xilinx_vdma_remove(struct platform_device *pdev) +{ + struct xilinx_vdma_device *xdev = platform_get_drvdata(pdev); + int i; + + of_dma_controller_free(pdev->dev.of_node); + + dma_async_device_unregister(&xdev->common); + + for (i = 0; i < XILINX_VDMA_MAX_CHANS_PER_DEVICE; i++) + if (xdev->chan[i]) + xilinx_vdma_chan_remove(xdev->chan[i]); + + return 0; +} + +static const struct of_device_id xilinx_vdma_of_ids[] = { + { .compatible = "xlnx,axi-vdma-1.00.a",}, + {} +}; + +static struct platform_driver xilinx_vdma_driver = { + .driver = { + .name = "xilinx-vdma", + .owner = THIS_MODULE, + .of_match_table = xilinx_vdma_of_ids, + }, + .probe = xilinx_vdma_probe, + .remove = xilinx_vdma_remove, +}; + +module_platform_driver(xilinx_vdma_driver); + +MODULE_AUTHOR("Xilinx, Inc."); +MODULE_DESCRIPTION("Xilinx VDMA driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/amba/xilinx_dma.h b/include/linux/amba/xilinx_dma.h new file mode 100644 index 000000000000..34b98f276ed0 --- /dev/null +++ b/include/linux/amba/xilinx_dma.h @@ -0,0 +1,47 @@ +/* + * Xilinx DMA Engine drivers support header file + * + * Copyright (C) 2010-2014 Xilinx, Inc. All rights reserved. + * + * This is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __DMA_XILINX_DMA_H +#define __DMA_XILINX_DMA_H + +#include +#include + +/** + * struct xilinx_vdma_config - VDMA Configuration structure + * @frm_dly: Frame delay + * @gen_lock: Whether in gen-lock mode + * @master: Master that it syncs to + * @frm_cnt_en: Enable frame count enable + * @park: Whether wants to park + * @park_frm: Frame to park on + * @coalesc: Interrupt coalescing threshold + * @delay: Delay counter + * @reset: Reset Channel + * @ext_fsync: External Frame Sync source + */ +struct xilinx_vdma_config { + int frm_dly; + int gen_lock; + int master; + int frm_cnt_en; + int park; + int park_frm; + int coalesc; + int delay; + int reset; + int ext_fsync; +}; + +int xilinx_vdma_channel_set_config(struct dma_chan *dchan, + struct xilinx_vdma_config *cfg); + +#endif -- cgit v1.2.3-71-gd317 From dfbb85cab5f0819d0424a3637b03e7892704fa42 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 2 Apr 2014 20:17:00 -0700 Subject: DMA: shdma: add cyclic transfer support This patch adds cyclic transfer support and enables dmaengine_prep_dma_cyclic() Signed-off-by: Kuninori Morimoto [reflowed changelog for readability] Signed-off-by: Vinod Koul --- drivers/dma/sh/shdma-base.c | 72 ++++++++++++++++++++++++++++++++++++++++----- include/linux/shdma-base.h | 1 + 2 files changed, 66 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c index 6786ecbd5ed4..974794cdb6ed 100644 --- a/drivers/dma/sh/shdma-base.c +++ b/drivers/dma/sh/shdma-base.c @@ -304,6 +304,7 @@ static dma_async_tx_callback __ld_cleanup(struct shdma_chan *schan, bool all) dma_async_tx_callback callback = NULL; void *param = NULL; unsigned long flags; + LIST_HEAD(cyclic_list); spin_lock_irqsave(&schan->chan_lock, flags); list_for_each_entry_safe(desc, _desc, &schan->ld_queue, node) { @@ -369,10 +370,16 @@ static dma_async_tx_callback __ld_cleanup(struct shdma_chan *schan, bool all) if (((desc->mark == DESC_COMPLETED || desc->mark == DESC_WAITING) && async_tx_test_ack(&desc->async_tx)) || all) { - /* Remove from ld_queue list */ - desc->mark = DESC_IDLE; - list_move(&desc->node, &schan->ld_free); + if (all || !desc->cyclic) { + /* Remove from ld_queue list */ + desc->mark = DESC_IDLE; + list_move(&desc->node, &schan->ld_free); + } else { + /* reuse as cyclic */ + desc->mark = DESC_SUBMITTED; + list_move_tail(&desc->node, &cyclic_list); + } if (list_empty(&schan->ld_queue)) { dev_dbg(schan->dev, "Bring down channel %d\n", schan->id); @@ -389,6 +396,8 @@ static dma_async_tx_callback __ld_cleanup(struct shdma_chan *schan, bool all) */ schan->dma_chan.completed_cookie = schan->dma_chan.cookie; + list_splice_tail(&cyclic_list, &schan->ld_queue); + spin_unlock_irqrestore(&schan->chan_lock, flags); if (callback) @@ -521,7 +530,7 @@ static struct shdma_desc *shdma_add_desc(struct shdma_chan *schan, */ static struct dma_async_tx_descriptor *shdma_prep_sg(struct shdma_chan *schan, struct scatterlist *sgl, unsigned int sg_len, dma_addr_t *addr, - enum dma_transfer_direction direction, unsigned long flags) + enum dma_transfer_direction direction, unsigned long flags, bool cyclic) { struct scatterlist *sg; struct shdma_desc *first = NULL, *new = NULL /* compiler... 
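(the NULL initializer just silences a spurious "may be used uninitialized" warning; the do/while below always assigns new before it is read)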
*/; @@ -569,7 +578,11 @@ static struct dma_async_tx_descriptor *shdma_prep_sg(struct shdma_chan *schan, if (!new) goto err_get_desc; - new->chunks = chunks--; + new->cyclic = cyclic; + if (cyclic) + new->chunks = 1; + else + new->chunks = chunks--; list_add_tail(&new->node, &tx_list); } while (len); } @@ -612,7 +625,8 @@ static struct dma_async_tx_descriptor *shdma_prep_memcpy( sg_dma_address(&sg) = dma_src; sg_dma_len(&sg) = len; - return shdma_prep_sg(schan, &sg, 1, &dma_dest, DMA_MEM_TO_MEM, flags); + return shdma_prep_sg(schan, &sg, 1, &dma_dest, DMA_MEM_TO_MEM, + flags, false); } static struct dma_async_tx_descriptor *shdma_prep_slave_sg( @@ -640,7 +654,50 @@ static struct dma_async_tx_descriptor *shdma_prep_slave_sg( slave_addr = ops->slave_addr(schan); return shdma_prep_sg(schan, sgl, sg_len, &slave_addr, - direction, flags); + direction, flags, false); +} + +struct dma_async_tx_descriptor *shdma_prep_dma_cyclic( + struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, + size_t period_len, enum dma_transfer_direction direction, + unsigned long flags, void *context) +{ + struct shdma_chan *schan = to_shdma_chan(chan); + struct shdma_dev *sdev = to_shdma_dev(schan->dma_chan.device); + const struct shdma_ops *ops = sdev->ops; + unsigned int sg_len = buf_len / period_len; + int slave_id = schan->slave_id; + dma_addr_t slave_addr; + struct scatterlist sgl[sg_len]; + int i; + + if (!chan) + return NULL; + + BUG_ON(!schan->desc_num); + + /* Someone calling slave DMA on a generic channel? */ + if (slave_id < 0 || (buf_len < period_len)) { + dev_warn(schan->dev, + "%s: bad parameter: buf_len=%d, period_len=%d, id=%d\n", + __func__, buf_len, period_len, slave_id); + return NULL; + } + + slave_addr = ops->slave_addr(schan); + + sg_init_table(sgl, sg_len); + for (i = 0; i < sg_len; i++) { + dma_addr_t src = buf_addr + (period_len * i); + + sg_set_page(&sgl[i], pfn_to_page(PFN_DOWN(src)), period_len, + offset_in_page(src)); + sg_dma_address(&sgl[i]) = src; + sg_dma_len(&sgl[i]) = period_len; + } + + return shdma_prep_sg(schan, sgl, sg_len, &slave_addr, + direction, flags, true); } static int shdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, @@ -915,6 +972,7 @@ int shdma_init(struct device *dev, struct shdma_dev *sdev, /* Compulsory for DMA_SLAVE fields */ dma_dev->device_prep_slave_sg = shdma_prep_slave_sg; + dma_dev->device_prep_dma_cyclic = shdma_prep_dma_cyclic; dma_dev->device_control = shdma_control; dma_dev->dev = dev; diff --git a/include/linux/shdma-base.h b/include/linux/shdma-base.h index f92c0a43c54c..abdf1f229dc3 100644 --- a/include/linux/shdma-base.h +++ b/include/linux/shdma-base.h @@ -54,6 +54,7 @@ struct shdma_desc { dma_cookie_t cookie; int chunks; int mark; + bool cyclic; /* used as cyclic transfer */ }; struct shdma_chan { -- cgit v1.2.3-71-gd317 From 6d48f44b7b2af67b33c1ae5994b8f642685c8bc8 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Wed, 30 Apr 2014 15:23:33 +0300 Subject: mdio_bus: implement devm_mdiobus_alloc/devm_mdiobus_free Add a resource managed devm_mdiobus_alloc[_size]()/devm_mdiobus_free() to automatically clean up MDIO bus alocations made by MDIO drivers, thus leading to simplified MDIO drivers code. Cc: Florian Fainelli Cc: Sergei Shtylyov Acked-and-tested-by: Lad, Prabhakar Signed-off-by: Grygorii Strashko Signed-off-by: David S. 
Miller --- Documentation/driver-model/devres.txt | 5 +++ drivers/net/phy/mdio_bus.c | 67 +++++++++++++++++++++++++++++++++++ include/linux/phy.h | 7 ++++ 3 files changed, 79 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 4f7897e99cba..c74e04494ade 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -308,3 +308,8 @@ SLAVE DMA ENGINE SPI devm_spi_register_master() + +MDIO + devm_mdiobus_alloc() + devm_mdiobus_alloc_size() + devm_mdiobus_free() diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 76f54b32a120..68a9a3867c0f 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -69,6 +69,73 @@ struct mii_bus *mdiobus_alloc_size(size_t size) } EXPORT_SYMBOL(mdiobus_alloc_size); +static void _devm_mdiobus_free(struct device *dev, void *res) +{ + mdiobus_free(*(struct mii_bus **)res); +} + +static int devm_mdiobus_match(struct device *dev, void *res, void *data) +{ + struct mii_bus **r = res; + + if (WARN_ON(!r || !*r)) + return 0; + + return *r == data; +} + +/** + * devm_mdiobus_alloc_size - Resource-managed mdiobus_alloc_size() + * @dev: Device to allocate mii_bus for + * @sizeof_priv: Space to allocate for private structure. + * + * Managed mdiobus_alloc_size. mii_bus allocated with this function is + * automatically freed on driver detach. + * + * If an mii_bus allocated with this function needs to be freed separately, + * devm_mdiobus_free() must be used. + * + * RETURNS: + * Pointer to allocated mii_bus on success, NULL on failure. + */ +struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv) +{ + struct mii_bus **ptr, *bus; + + ptr = devres_alloc(_devm_mdiobus_free, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + /* use raw alloc_dr for kmalloc caller tracing */ + bus = mdiobus_alloc_size(sizeof_priv); + if (bus) { + *ptr = bus; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return bus; +} +EXPORT_SYMBOL_GPL(devm_mdiobus_alloc); + +/** + * devm_mdiobus_free - Resource-managed mdiobus_free() + * @dev: Device this mii_bus belongs to + * @bus: the mii_bus associated with the device + * + * Free mii_bus allocated with devm_mdiobus_alloc_size(). 
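+ *
+ * Most drivers never need to call this: the bus is freed automatically
+ * on driver detach. Use it only to tear the bus down early.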
+ */ +void devm_mdiobus_free(struct device *dev, struct mii_bus *bus) +{ + int rc; + + rc = devres_release(dev, _devm_mdiobus_free, + devm_mdiobus_match, bus); + WARN_ON(rc); +} +EXPORT_SYMBOL_GPL(devm_mdiobus_free); + /** * mdiobus_release - mii_bus device release callback * @d: the target struct device that contains the mii_bus diff --git a/include/linux/phy.h b/include/linux/phy.h index 51d15f684e7e..864ddafad8cc 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -198,6 +198,13 @@ static inline struct mii_bus *mdiobus_alloc(void) int mdiobus_register(struct mii_bus *bus); void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); +struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); +static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) +{ + return devm_mdiobus_alloc_size(dev, 0); +} + +void devm_mdiobus_free(struct device *dev, struct mii_bus *bus); struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); -- cgit v1.2.3-71-gd317 From e114a710aa5058c0ba4aa1dfb105132aefeb5e04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Apr 2014 11:58:13 -0700 Subject: tcp: fix cwnd limited checking to improve congestion control Yuchung discovered tcp_is_cwnd_limited() was returning false in slow start phase even if the application filled the socket write queue. All congestion modules take into account tcp_is_cwnd_limited() before increasing cwnd, so this behavior limits slow start from probing the bandwidth at full speed. The problem is that even if write queue is full (aka we are _not_ application limited), cwnd can be under utilized if TSO should auto defer or TCP Small queues decided to hold packets. So the in_flight can be kept to smaller value, and we can get to the point tcp_is_cwnd_limited() returns false. With TCP Small Queues and FQ/pacing, this issue is more visible. We fix this by having tcp_cwnd_validate(), which is supposed to track such things, take into account unsent_segs, the number of segs that we are not sending at the moment due to TSO or TSQ, but intend to send real soon. Then when we are cwnd-limited, remember this fact while we are processing the window of ACKs that comes back. For example, suppose we have a brand new connection with cwnd=10; we are in slow start, and we send a flight of 9 packets. By the time we have received ACKs for all 9 packets we want our cwnd to be 18. We implement this by setting tp->lsnd_pending to 9, and considering ourselves to be cwnd-limited while cwnd is less than twice tp->lsnd_pending (2*9 -> 18). This makes tcp_is_cwnd_limited() more understandable, by removing the GSO/TSO kludge, that tried to work around the issue. Note the in_flight parameter can be removed in a followup cleanup patch. Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 22 +++++++++++++++++++++- net/ipv4/tcp_cong.c | 20 -------------------- net/ipv4/tcp_output.c | 21 ++++++++++++++------- 4 files changed, 36 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 239946868142..4e37c71ecd74 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -230,6 +230,7 @@ struct tcp_sock { u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; + u32 lsnd_pending; /* packets inflight or unsent since last xmit */ u32 prior_cwnd; /* Congestion window at start of Recovery. */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 163d2b467d78..a9fe7bc4f4bb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -974,7 +974,27 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight); + +/* We follow the spirit of RFC2861 to validate cwnd but implement a more + * flexible approach. The RFC suggests cwnd should not be raised unless + * it was fully used previously. But we allow cwnd to grow as long as the + * application has used half the cwnd. + * Example : + * cwnd is 10 (IW10), but application sends 9 frames. + * We allow cwnd to reach 18 when all frames are ACKed. + * This check is safe because it's as aggressive as slow start which already + * risks 100% overshoot. The advantage is that we discourage application to + * either send more filler packets or data to artificially blow up the cwnd + * usage, and allow application-limited process to probe bw more aggressively. + * + * TODO: remove in_flight once we can fix all callers, and their callers... + */ +static inline bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->snd_cwnd < 2 * tp->lsnd_pending; +} static inline void tcp_check_probe_timer(struct sock *sk) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2b9464c93b88..a93b41ba05ff 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer - */ -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 left; - - if (in_flight >= tp->snd_cwnd) - return true; - - left = tp->snd_cwnd - in_flight; - if (sk_can_gso(sk) && - left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left < tp->xmit_size_goal_segs) - return true; - return left <= tcp_max_tso_deferred_mss(tp); -} -EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); - /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20847de991ea..f9181a133462 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1402,12 +1402,13 @@ static void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -/* Congestion window validation. 
(RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out >= tp->snd_cwnd) { + tp->lsnd_pending = tp->packets_out + unsent_segs; + + if (tcp_is_cwnd_limited(sk, 0)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1880,7 +1881,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, sent_pkts; + unsigned int tso_segs, sent_pkts, unsent_segs = 0; int cwnd_quota; int result; @@ -1924,7 +1925,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } else { if (!push_one && tcp_tso_should_defer(sk, skb)) - break; + goto compute_unsent_segs; } /* TCP Small Queues : @@ -1949,8 +1950,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * there is no smp_mb__after_set_bit() yet */ smp_mb__after_clear_bit(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) + if (atomic_read(&sk->sk_wmem_alloc) > limit) { + u32 unsent_bytes; + +compute_unsent_segs: + unsent_bytes = tp->write_seq - tp->snd_nxt; + unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now); break; + } } limit = mss_now; @@ -1990,7 +1997,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); - tcp_cwnd_validate(sk); + tcp_cwnd_validate(sk, unsent_segs); return false; } return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); -- cgit v1.2.3-71-gd317 From 69dfa00ccb72a37f3810687ca110e5a8154c6eed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: make flags and subsys_masks unsigned int There's no reason to use atomic bitops for cgroup_subsys_state->flags, cgroup_root->flags and various subsys_masks. This patch updates those to use bitwise and/or operations instead and converts them form unsigned long to unsigned int. This makes the fields occupy (marginally) smaller space and makes it clear that they don't require atomicity. This patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 ++++---- kernel/cgroup.c | 37 ++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4b38e2d6110d..c6c703f2486b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,7 +62,7 @@ struct cgroup_subsys_state { /* the parent css */ struct cgroup_subsys_state *parent; - unsigned long flags; + unsigned int flags; /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; @@ -185,7 +185,7 @@ struct cgroup { u64 serial_nr; /* the bitmask of subsystems enabled on the child cgroups */ - unsigned long child_subsys_mask; + unsigned int child_subsys_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -312,7 +312,7 @@ struct cgroup_root { struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ - unsigned long subsys_mask; + unsigned int subsys_mask; /* Unique id for this hierarchy. 
*/ int hierarchy_id; @@ -327,7 +327,7 @@ struct cgroup_root { struct list_head root_list; /* Hierarchy-specific flags */ - unsigned long flags; + unsigned int flags; /* IDs for cgroups in this hierarchy */ struct idr cgroup_idr; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3873267c9ee3..21667f396a1e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -181,7 +181,7 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); + unsigned int ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -963,7 +963,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -1079,7 +1079,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i; @@ -1087,15 +1087,14 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) cgroup_addrm_files(cgrp, cfts, false); } } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; int ssid, i, ret; @@ -1128,7 +1127,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
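* (the files can be recreated later; see the recovery hint printed below)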
*/ if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } @@ -1214,8 +1213,8 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; - unsigned long flags; + unsigned int subsys_mask; + unsigned int flags; char *release_agent; bool cpuset_clone_children; char *name; @@ -1227,12 +1226,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = (unsigned long)-1; + unsigned int mask = -1U; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_cgrp_id); + mask = ~(1U << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1313,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); one_ss = true; break; @@ -1342,7 +1341,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) if (!ss->disabled) - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); /* * We either have to specify by name or by subsystems. (So @@ -1373,7 +1372,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + unsigned int added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("sane_behavior: remount is not allowed\n"); @@ -1398,7 +1397,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1522,7 +1521,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2507,7 +2506,7 @@ out_finish: static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, struct cftype *cft, char *buffer) { - unsigned long enable_req = 0, disable_req = 0, enable, disable; + unsigned int enable_req = 0, disable_req = 0, enable, disable; struct cgroup *cgrp = dummy_css->cgroup, *child; struct cgroup_subsys *ss; char *tok, *p; @@ -3998,7 +3997,7 @@ static struct cftype cgroup_base_files[] = { * * On failure, no file is added. 
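* Only the subsystems enabled in @subsys_mask have their files created.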
*/ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i, ret = 0; @@ -4007,7 +4006,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) { -- cgit v1.2.3-71-gd317 From 7d699ddb2b181a2c76e5ea18b1bdf102c4bebe4b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup, memcg: allocate cgroup ID from 1 Currently, cgroup->id is allocated from 0, which is always assigned to the root cgroup; unfortunately, memcg wants to use ID 0 to indicate invalid IDs and ends up incrementing all IDs by one. It's reasonable to reserve 0 for special purposes. This patch updates cgroup core so that ID 0 is not used and the root cgroups get ID 1. The ID incrementing is removed from memcg. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 4 ++-- mm/memcontrol.c | 8 ++------ 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c6c703f2486b..793f70a48820 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -144,8 +144,8 @@ struct cgroup { /* * idr allocated in-hierarchy ID. * - * The ID of the root cgroup is always 0, and a new cgroup - * will be assigned with a smallest available ID. + * ID 0 is not used, the ID of the root cgroup is always 1, and a + * new cgroup will be assigned with a smallest available ID. * * Allocating/Removing ID must be protected by cgroup_mutex. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21667f396a1e..3fa0463e74bb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1531,7 +1531,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4225,7 +4225,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { err = -ENOMEM; goto err_unlock; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29501f040568..1d0b29715b73 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -527,18 +527,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - /* - * The ID of the root cgroup is 0, but memcg treat 0 as an - * invalid ID, so we return (cgroup_id + 1). 
- */ - return memcg->css.cgroup->id + 1; + return memcg->css.cgroup->id; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &memory_cgrp_subsys); + css = css_from_id(id, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } -- cgit v1.2.3-71-gd317 From 15a4c835e4ed3e60dd68727cd1907e3dd89563f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup, memcg: implement css->id and convert css_from_id() to use it Until now, cgroup->id has been used to identify all the associated csses and css_from_id() takes cgroup ID and returns the matching css by looking up the cgroup and then dereferencing the css associated with it; however, now that the lifetimes of cgroup and css are separate, this is incorrect and breaks on the unified hierarchy when a controller is disabled and enabled back again before the previous instance is released. This patch adds css->id which is a subsystem-unique ID and converts css_from_id() to look up by the new css->id instead. memcg is the only user of css_from_id() and also converted to use css->id instead. For traditional hierarchies, this shouldn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jianyu Zhan Acked-by: Li Zefan --- include/linux/cgroup.h | 9 ++++++++ kernel/cgroup.c | 59 ++++++++++++++++++++++++++++++++------------------ mm/memcontrol.c | 4 ++-- 3 files changed, 49 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 793f70a48820..2dfabb3b749a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,6 +62,12 @@ struct cgroup_subsys_state { /* the parent css */ struct cgroup_subsys_state *parent; + /* + * Subsys-unique ID. 0 is unused and root is always 1. The + * matching css can be looked up using css_from_id(). + */ + int id; + unsigned int flags; /* percpu_ref killing and RCU release */ @@ -655,6 +661,9 @@ struct cgroup_subsys { /* link to parent, protected by cgroup_lock() */ struct cgroup_root *root; + /* idr for css->id */ + struct idr css_idr; + /* * List of cftypes. Each entry is the first entry of an array * terminated by zero length name. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1c98c527b2d..a1a20e8c973a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -100,8 +100,8 @@ static DECLARE_RWSEM(css_set_rwsem); #endif /* - * Protects cgroup_idr so that IDs can be released without grabbing - * cgroup_mutex. + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); @@ -1089,12 +1089,6 @@ static void cgroup_put(struct cgroup *cgrp) if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; - /* - * XXX: cgrp->id is only used to look up css's. As cgroup and - * css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. 
- */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4104,8 +4098,11 @@ static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); + struct cgroup_subsys *ss = css->ss; + + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + cgroup_idr_remove(&ss->css_idr, css->id); - RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4195,9 +4192,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free_css; + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + if (err < 0) + goto err_free_percpu_ref; + css->id = err; + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto err_free_percpu_ref; + goto err_free_id; + + /* @css is ready to be brought online now, make it visible */ + cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) @@ -4216,6 +4221,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) err_clear_dir: cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: + cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: @@ -4642,7 +4649,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .rename = cgroup_rename, }; -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; @@ -4651,6 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ @@ -4659,6 +4667,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + if (early) { + /* idr_alloc() can't be called safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4709,7 +4724,7 @@ int __init cgroup_init_early(void) ss->name = cgroup_subsys_name[i]; if (ss->early_init) - cgroup_init_subsys(ss); + cgroup_init_subsys(ss, true); } return 0; } @@ -4741,8 +4756,16 @@ int __init cgroup_init(void) mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { - if (!ss->early_init) - cgroup_init_subsys(ss); + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); @@ -5196,14 +5219,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { - struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); - - cgrp = idr_find(&ss->root->cgroup_idr, id); - if (cgrp) - return cgroup_css(cgrp, ss); - return NULL; + return idr_find(&ss->css_idr, id); } #ifdef CONFIG_CGROUP_DEBUG diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1d0b29715b73..c3f82f69ef58 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -527,7 +527,7 @@ static inline bool 
mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - return memcg->css.cgroup->id; + return memcg->css.id; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) @@ -6401,7 +6401,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); - if (css->cgroup->id > MEM_CGROUP_ID_MAX) + if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; if (!parent) -- cgit v1.2.3-71-gd317 From 5bcfedf06f7fdf9efcf65dc11198e9012f7530f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 May 2014 18:34:18 +0200 Subject: net: filter: simplify label names from jump-table This patch simplifies label naming for the BPF jump-table. When we define labels via DL(), we just concatenate/textify the combination of instruction opcode which consists of the class, subclass, word size, target register and so on. Each time we leave BPF_ prefix intact, so that e.g. the preprocessor generates a label BPF_ALU_BPF_ADD_BPF_X for DL(BPF_ALU, BPF_ADD, BPF_X) whereas a label name of ALU_ADD_X is much more easy to grasp. Pure cleanup only. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 3 + net/core/filter.c | 308 ++++++++++++++++++++++++------------------------- 2 files changed, 157 insertions(+), 154 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 759abf78dd61..b042d1db127f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -37,6 +37,9 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* Placeholder/dummy for 0 */ +#define BPF_0 0 + /* BPF has 10 general purpose 64-bit registers and stack frame. */ #define MAX_BPF_REG 11 diff --git a/net/core/filter.c b/net/core/filter.c index 7c4db3dd3d1e..a1784e95b049 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -156,94 +156,94 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... 
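* (the interpreter uses computed-goto dispatch: insn->code indexes
* straight into this table, avoiding a switch in the hot loop)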
*/ -#define DL(A, B, C) [A|B|C] = &&A##_##B##_##C - DL(BPF_ALU, BPF_ADD, BPF_X), - DL(BPF_ALU, BPF_ADD, BPF_K), - DL(BPF_ALU, BPF_SUB, BPF_X), - DL(BPF_ALU, BPF_SUB, BPF_K), - DL(BPF_ALU, BPF_AND, BPF_X), - DL(BPF_ALU, BPF_AND, BPF_K), - DL(BPF_ALU, BPF_OR, BPF_X), - DL(BPF_ALU, BPF_OR, BPF_K), - DL(BPF_ALU, BPF_LSH, BPF_X), - DL(BPF_ALU, BPF_LSH, BPF_K), - DL(BPF_ALU, BPF_RSH, BPF_X), - DL(BPF_ALU, BPF_RSH, BPF_K), - DL(BPF_ALU, BPF_XOR, BPF_X), - DL(BPF_ALU, BPF_XOR, BPF_K), - DL(BPF_ALU, BPF_MUL, BPF_X), - DL(BPF_ALU, BPF_MUL, BPF_K), - DL(BPF_ALU, BPF_MOV, BPF_X), - DL(BPF_ALU, BPF_MOV, BPF_K), - DL(BPF_ALU, BPF_DIV, BPF_X), - DL(BPF_ALU, BPF_DIV, BPF_K), - DL(BPF_ALU, BPF_MOD, BPF_X), - DL(BPF_ALU, BPF_MOD, BPF_K), - DL(BPF_ALU, BPF_NEG, 0), - DL(BPF_ALU, BPF_END, BPF_TO_BE), - DL(BPF_ALU, BPF_END, BPF_TO_LE), - DL(BPF_ALU64, BPF_ADD, BPF_X), - DL(BPF_ALU64, BPF_ADD, BPF_K), - DL(BPF_ALU64, BPF_SUB, BPF_X), - DL(BPF_ALU64, BPF_SUB, BPF_K), - DL(BPF_ALU64, BPF_AND, BPF_X), - DL(BPF_ALU64, BPF_AND, BPF_K), - DL(BPF_ALU64, BPF_OR, BPF_X), - DL(BPF_ALU64, BPF_OR, BPF_K), - DL(BPF_ALU64, BPF_LSH, BPF_X), - DL(BPF_ALU64, BPF_LSH, BPF_K), - DL(BPF_ALU64, BPF_RSH, BPF_X), - DL(BPF_ALU64, BPF_RSH, BPF_K), - DL(BPF_ALU64, BPF_XOR, BPF_X), - DL(BPF_ALU64, BPF_XOR, BPF_K), - DL(BPF_ALU64, BPF_MUL, BPF_X), - DL(BPF_ALU64, BPF_MUL, BPF_K), - DL(BPF_ALU64, BPF_MOV, BPF_X), - DL(BPF_ALU64, BPF_MOV, BPF_K), - DL(BPF_ALU64, BPF_ARSH, BPF_X), - DL(BPF_ALU64, BPF_ARSH, BPF_K), - DL(BPF_ALU64, BPF_DIV, BPF_X), - DL(BPF_ALU64, BPF_DIV, BPF_K), - DL(BPF_ALU64, BPF_MOD, BPF_X), - DL(BPF_ALU64, BPF_MOD, BPF_K), - DL(BPF_ALU64, BPF_NEG, 0), - DL(BPF_JMP, BPF_CALL, 0), - DL(BPF_JMP, BPF_JA, 0), - DL(BPF_JMP, BPF_JEQ, BPF_X), - DL(BPF_JMP, BPF_JEQ, BPF_K), - DL(BPF_JMP, BPF_JNE, BPF_X), - DL(BPF_JMP, BPF_JNE, BPF_K), - DL(BPF_JMP, BPF_JGT, BPF_X), - DL(BPF_JMP, BPF_JGT, BPF_K), - DL(BPF_JMP, BPF_JGE, BPF_X), - DL(BPF_JMP, BPF_JGE, BPF_K), - DL(BPF_JMP, BPF_JSGT, BPF_X), - DL(BPF_JMP, BPF_JSGT, BPF_K), - DL(BPF_JMP, BPF_JSGE, BPF_X), - DL(BPF_JMP, BPF_JSGE, BPF_K), - DL(BPF_JMP, BPF_JSET, BPF_X), - DL(BPF_JMP, BPF_JSET, BPF_K), - DL(BPF_JMP, BPF_EXIT, 0), - DL(BPF_STX, BPF_MEM, BPF_B), - DL(BPF_STX, BPF_MEM, BPF_H), - DL(BPF_STX, BPF_MEM, BPF_W), - DL(BPF_STX, BPF_MEM, BPF_DW), - DL(BPF_STX, BPF_XADD, BPF_W), - DL(BPF_STX, BPF_XADD, BPF_DW), - DL(BPF_ST, BPF_MEM, BPF_B), - DL(BPF_ST, BPF_MEM, BPF_H), - DL(BPF_ST, BPF_MEM, BPF_W), - DL(BPF_ST, BPF_MEM, BPF_DW), - DL(BPF_LDX, BPF_MEM, BPF_B), - DL(BPF_LDX, BPF_MEM, BPF_H), - DL(BPF_LDX, BPF_MEM, BPF_W), - DL(BPF_LDX, BPF_MEM, BPF_DW), - DL(BPF_LD, BPF_ABS, BPF_W), - DL(BPF_LD, BPF_ABS, BPF_H), - DL(BPF_LD, BPF_ABS, BPF_B), - DL(BPF_LD, BPF_IND, BPF_W), - DL(BPF_LD, BPF_IND, BPF_H), - DL(BPF_LD, BPF_IND, BPF_B), +#define DL(A, B, C) [BPF_##A|BPF_##B|BPF_##C] = &&A##_##B##_##C + DL(ALU, ADD, X), + DL(ALU, ADD, K), + DL(ALU, SUB, X), + DL(ALU, SUB, K), + DL(ALU, AND, X), + DL(ALU, AND, K), + DL(ALU, OR, X), + DL(ALU, OR, K), + DL(ALU, LSH, X), + DL(ALU, LSH, K), + DL(ALU, RSH, X), + DL(ALU, RSH, K), + DL(ALU, XOR, X), + DL(ALU, XOR, K), + DL(ALU, MUL, X), + DL(ALU, MUL, K), + DL(ALU, MOV, X), + DL(ALU, MOV, K), + DL(ALU, DIV, X), + DL(ALU, DIV, K), + DL(ALU, MOD, X), + DL(ALU, MOD, K), + DL(ALU, NEG, 0), + DL(ALU, END, TO_BE), + DL(ALU, END, TO_LE), + DL(ALU64, ADD, X), + DL(ALU64, ADD, K), + DL(ALU64, SUB, X), + DL(ALU64, SUB, K), + DL(ALU64, AND, X), + DL(ALU64, AND, K), + DL(ALU64, OR, X), + DL(ALU64, OR, K), + DL(ALU64, LSH, X), + DL(ALU64, LSH, K), + DL(ALU64, 
RSH, X), + DL(ALU64, RSH, K), + DL(ALU64, XOR, X), + DL(ALU64, XOR, K), + DL(ALU64, MUL, X), + DL(ALU64, MUL, K), + DL(ALU64, MOV, X), + DL(ALU64, MOV, K), + DL(ALU64, ARSH, X), + DL(ALU64, ARSH, K), + DL(ALU64, DIV, X), + DL(ALU64, DIV, K), + DL(ALU64, MOD, X), + DL(ALU64, MOD, K), + DL(ALU64, NEG, 0), + DL(JMP, CALL, 0), + DL(JMP, JA, 0), + DL(JMP, JEQ, X), + DL(JMP, JEQ, K), + DL(JMP, JNE, X), + DL(JMP, JNE, K), + DL(JMP, JGT, X), + DL(JMP, JGT, K), + DL(JMP, JGE, X), + DL(JMP, JGE, K), + DL(JMP, JSGT, X), + DL(JMP, JSGT, K), + DL(JMP, JSGE, X), + DL(JMP, JSGE, K), + DL(JMP, JSET, X), + DL(JMP, JSET, K), + DL(JMP, EXIT, 0), + DL(STX, MEM, B), + DL(STX, MEM, H), + DL(STX, MEM, W), + DL(STX, MEM, DW), + DL(STX, XADD, W), + DL(STX, XADD, DW), + DL(ST, MEM, B), + DL(ST, MEM, H), + DL(ST, MEM, W), + DL(ST, MEM, DW), + DL(LDX, MEM, B), + DL(LDX, MEM, H), + DL(LDX, MEM, W), + DL(LDX, MEM, DW), + DL(LD, ABS, W), + DL(LD, ABS, H), + DL(LD, ABS, B), + DL(LD, IND, W), + DL(LD, IND, H), + DL(LD, IND, B), #undef DL }; @@ -257,93 +257,93 @@ select_insn: /* ALU */ #define ALU(OPCODE, OP) \ - BPF_ALU64_##OPCODE##_BPF_X: \ + ALU64_##OPCODE##_X: \ A = A OP X; \ CONT; \ - BPF_ALU_##OPCODE##_BPF_X: \ + ALU_##OPCODE##_X: \ A = (u32) A OP (u32) X; \ CONT; \ - BPF_ALU64_##OPCODE##_BPF_K: \ + ALU64_##OPCODE##_K: \ A = A OP K; \ CONT; \ - BPF_ALU_##OPCODE##_BPF_K: \ + ALU_##OPCODE##_K: \ A = (u32) A OP (u32) K; \ CONT; - ALU(BPF_ADD, +) - ALU(BPF_SUB, -) - ALU(BPF_AND, &) - ALU(BPF_OR, |) - ALU(BPF_LSH, <<) - ALU(BPF_RSH, >>) - ALU(BPF_XOR, ^) - ALU(BPF_MUL, *) + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) #undef ALU - BPF_ALU_BPF_NEG_0: + ALU_NEG_0: A = (u32) -A; CONT; - BPF_ALU64_BPF_NEG_0: + ALU64_NEG_0: A = -A; CONT; - BPF_ALU_BPF_MOV_BPF_X: + ALU_MOV_X: A = (u32) X; CONT; - BPF_ALU_BPF_MOV_BPF_K: + ALU_MOV_K: A = (u32) K; CONT; - BPF_ALU64_BPF_MOV_BPF_X: + ALU64_MOV_X: A = X; CONT; - BPF_ALU64_BPF_MOV_BPF_K: + ALU64_MOV_K: A = K; CONT; - BPF_ALU64_BPF_ARSH_BPF_X: + ALU64_ARSH_X: (*(s64 *) &A) >>= X; CONT; - BPF_ALU64_BPF_ARSH_BPF_K: + ALU64_ARSH_K: (*(s64 *) &A) >>= K; CONT; - BPF_ALU64_BPF_MOD_BPF_X: + ALU64_MOD_X: if (unlikely(X == 0)) return 0; tmp = A; A = do_div(tmp, X); CONT; - BPF_ALU_BPF_MOD_BPF_X: + ALU_MOD_X: if (unlikely(X == 0)) return 0; tmp = (u32) A; A = do_div(tmp, (u32) X); CONT; - BPF_ALU64_BPF_MOD_BPF_K: + ALU64_MOD_K: tmp = A; A = do_div(tmp, K); CONT; - BPF_ALU_BPF_MOD_BPF_K: + ALU_MOD_K: tmp = (u32) A; A = do_div(tmp, (u32) K); CONT; - BPF_ALU64_BPF_DIV_BPF_X: + ALU64_DIV_X: if (unlikely(X == 0)) return 0; do_div(A, X); CONT; - BPF_ALU_BPF_DIV_BPF_X: + ALU_DIV_X: if (unlikely(X == 0)) return 0; tmp = (u32) A; do_div(tmp, (u32) X); A = (u32) tmp; CONT; - BPF_ALU64_BPF_DIV_BPF_K: + ALU64_DIV_K: do_div(A, K); CONT; - BPF_ALU_BPF_DIV_BPF_K: + ALU_DIV_K: tmp = (u32) A; do_div(tmp, (u32) K); A = (u32) tmp; CONT; - BPF_ALU_BPF_END_BPF_TO_BE: + ALU_END_TO_BE: switch (K) { case 16: A = (__force u16) cpu_to_be16(A); @@ -356,7 +356,7 @@ select_insn: break; } CONT; - BPF_ALU_BPF_END_BPF_TO_LE: + ALU_END_TO_LE: switch (K) { case 16: A = (__force u16) cpu_to_le16(A); @@ -371,7 +371,7 @@ select_insn: CONT; /* CALL */ - BPF_JMP_BPF_CALL_0: + JMP_CALL_0: /* Function call scratches R1-R5 registers, preserves R6-R9, * and stores return value into R0. 
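* This mirrors the native calling convention, which lets a JIT map
* BPF_CALL onto a single machine call instruction.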
*/ @@ -380,122 +380,122 @@ select_insn: CONT; /* JMP */ - BPF_JMP_BPF_JA_0: + JMP_JA_0: insn += insn->off; CONT; - BPF_JMP_BPF_JEQ_BPF_X: + JMP_JEQ_X: if (A == X) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JEQ_BPF_K: + JMP_JEQ_K: if (A == K) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JNE_BPF_X: + JMP_JNE_X: if (A != X) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JNE_BPF_K: + JMP_JNE_K: if (A != K) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JGT_BPF_X: + JMP_JGT_X: if (A > X) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JGT_BPF_K: + JMP_JGT_K: if (A > K) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JGE_BPF_X: + JMP_JGE_X: if (A >= X) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JGE_BPF_K: + JMP_JGE_K: if (A >= K) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JSGT_BPF_X: - if (((s64)A) > ((s64)X)) { + JMP_JSGT_X: + if (((s64) A) > ((s64) X)) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JSGT_BPF_K: - if (((s64)A) > ((s64)K)) { + JMP_JSGT_K: + if (((s64) A) > ((s64) K)) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JSGE_BPF_X: - if (((s64)A) >= ((s64)X)) { + JMP_JSGE_X: + if (((s64) A) >= ((s64) X)) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JSGE_BPF_K: - if (((s64)A) >= ((s64)K)) { + JMP_JSGE_K: + if (((s64) A) >= ((s64) K)) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JSET_BPF_X: + JMP_JSET_X: if (A & X) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_JSET_BPF_K: + JMP_JSET_K: if (A & K) { insn += insn->off; CONT_JMP; } CONT; - BPF_JMP_BPF_EXIT_0: + JMP_EXIT_0: return R0; /* STX and ST and LDX*/ #define LDST(SIZEOP, SIZE) \ - BPF_STX_BPF_MEM_##SIZEOP: \ + STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (A + insn->off) = X; \ CONT; \ - BPF_ST_BPF_MEM_##SIZEOP: \ + ST_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (A + insn->off) = K; \ CONT; \ - BPF_LDX_BPF_MEM_##SIZEOP: \ + LDX_MEM_##SIZEOP: \ A = *(SIZE *)(unsigned long) (X + insn->off); \ CONT; - LDST(BPF_B, u8) - LDST(BPF_H, u16) - LDST(BPF_W, u32) - LDST(BPF_DW, u64) + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) #undef LDST - BPF_STX_BPF_XADD_BPF_W: /* lock xadd *(u32 *)(A + insn->off) += X */ + STX_XADD_W: /* lock xadd *(u32 *)(A + insn->off) += X */ atomic_add((u32) X, (atomic_t *)(unsigned long) (A + insn->off)); CONT; - BPF_STX_BPF_XADD_BPF_DW: /* lock xadd *(u64 *)(A + insn->off) += X */ + STX_XADD_DW: /* lock xadd *(u64 *)(A + insn->off) += X */ atomic64_add((u64) X, (atomic64_t *)(unsigned long) (A + insn->off)); CONT; - BPF_LD_BPF_ABS_BPF_W: /* R0 = ntohl(*(u32 *) (skb->data + K)) */ + LD_ABS_W: /* R0 = ntohl(*(u32 *) (skb->data + K)) */ off = K; load_word: /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only @@ -524,7 +524,7 @@ load_word: CONT; } return 0; - BPF_LD_BPF_ABS_BPF_H: /* R0 = ntohs(*(u16 *) (skb->data + K)) */ + LD_ABS_H: /* R0 = ntohs(*(u16 *) (skb->data + K)) */ off = K; load_half: ptr = load_pointer((struct sk_buff *) ctx, off, 2, &tmp); @@ -533,7 +533,7 @@ load_half: CONT; } return 0; - BPF_LD_BPF_ABS_BPF_B: /* R0 = *(u8 *) (ctx + K) */ + LD_ABS_B: /* R0 = *(u8 *) (ctx + K) */ off = K; load_byte: ptr = load_pointer((struct sk_buff *) ctx, off, 1, &tmp); @@ -542,13 +542,13 @@ load_byte: CONT; } return 0; - BPF_LD_BPF_IND_BPF_W: /* R0 = ntohl(*(u32 *) (skb->data + X + K)) */ + LD_IND_W: /* R0 = ntohl(*(u32 *) (skb->data + X + K)) */ off = K + X; goto load_word; - BPF_LD_BPF_IND_BPF_H: /* R0 = ntohs(*(u16 *) (skb->data + X + K)) */ + LD_IND_H: /* R0 = ntohs(*(u16 *) (skb->data + X + K)) */ off = K + X; 
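/* reuse the shared load_half slow path with the X-biased offset */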
goto load_half; - BPF_LD_BPF_IND_BPF_B: /* R0 = *(u8 *) (skb->data + X + K) */ + LD_IND_B: off = K + X; goto load_byte; -- cgit v1.2.3-71-gd317 From 30743837dd204d2b04fd4e9d3db78cc7b118c81a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 May 2014 18:34:19 +0200 Subject: net: filter: make register naming more comprehensible The current code is a bit hard to parse as to which registers can be used, how they are mapped, and how it all plays together. It makes much more sense to define this a bit more clearly so that the code is a bit more intuitive. This patch cleans this up and makes the naming more consistent throughout the code. It also allows for moving some of the defines into the header file. The clearing of the A and X registers in __sk_run_filter() does not get particular register names assigned, as these registers have no 'official' function but merely result from the concrete initial mapping of old BPF programs. Since we already use small letters for the BPF helper functions invoked via BPF_CALL, be consistent here as well. No functional changes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 44 ++++++++-- net/core/filter.c | 215 +++++++++++++++++++++++++------------------ 2 files changed, 145 insertions(+), 114 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index b042d1db127f..ed1efab10b8f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -40,16 +40,47 @@ /* Placeholder/dummy for 0 */ #define BPF_0 0 +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + /* BPF has 10 general purpose 64-bit registers and stack frame. */ -#define MAX_BPF_REG 11 +#define MAX_BPF_REG __MAX_BPF_REG + +/* ArgX, context and stack frame pointer register positions. Note, + * Arg1, Arg2, Arg3, etc are used as argument mappings of function + * calls in BPF_CALL instruction. + */ +#define BPF_REG_ARG1 BPF_REG_1 +#define BPF_REG_ARG2 BPF_REG_2 +#define BPF_REG_ARG3 BPF_REG_3 +#define BPF_REG_ARG4 BPF_REG_4 +#define BPF_REG_ARG5 BPF_REG_5 +#define BPF_REG_CTX BPF_REG_6 +#define BPF_REG_FP BPF_REG_10 + +/* Additional register mappings for converted user programs. */ +#define BPF_REG_A BPF_REG_0 +#define BPF_REG_X BPF_REG_7 +#define BPF_REG_TMP BPF_REG_8 /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -/* Arg1, context and stack frame pointer register positions. */ -#define ARG1_REG 1 -#define CTX_REG 6 -#define FP_REG 10 +/* Macro to invoke filter function.
*/ +#define SK_RUN_FILTER(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) struct sock_filter_int { __u8 code; /* opcode */ @@ -100,9 +131,6 @@ static inline unsigned int sk_filter_size(unsigned int proglen) #define sk_filter_proglen(fprog) \ (fprog->len * sizeof(fprog->filter[0])) -#define SK_RUN_FILTER(filter, ctx) \ - (*filter->bpf_func)(ctx, filter->insnsi) - int sk_filter(struct sock *sk, struct sk_buff *skb); u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, diff --git a/net/core/filter.c b/net/core/filter.c index a1784e95b049..c93ca8d66f37 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -45,6 +45,27 @@ #include #include +/* Registers */ +#define R0 regs[BPF_REG_0] +#define R1 regs[BPF_REG_1] +#define R2 regs[BPF_REG_2] +#define R3 regs[BPF_REG_3] +#define R4 regs[BPF_REG_4] +#define R5 regs[BPF_REG_5] +#define R6 regs[BPF_REG_6] +#define R7 regs[BPF_REG_7] +#define R8 regs[BPF_REG_8] +#define R9 regs[BPF_REG_9] +#define R10 regs[BPF_REG_10] + +/* Named registers */ +#define A regs[insn->a_reg] +#define X regs[insn->x_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define K insn->imm + /* No hurry in this branch * * Exported for the bpf jit load helper. @@ -122,13 +143,6 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } -/* Register mappings for user programs. */ -#define A_REG 0 -#define X_REG 7 -#define TMP_REG 8 -#define ARG2_REG 2 -#define ARG3_REG 3 - /** * __sk_run_filter - run a filter on a given context * @ctx: buffer to run the filter on @@ -142,17 +156,6 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; - void *ptr; - int off; - -#define K insn->imm -#define A regs[insn->a_reg] -#define X regs[insn->x_reg] -#define R0 regs[0] - -#define CONT ({insn++; goto select_insn; }) -#define CONT_JMP ({insn++; goto select_insn; }) - static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ @@ -246,11 +249,18 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) DL(LD, IND, B), #undef DL }; + void *ptr; + int off; - regs[FP_REG] = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - regs[ARG1_REG] = (u64) (unsigned long) ctx; - regs[A_REG] = 0; - regs[X_REG] = 0; +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Register for user BPF programs need to be reset first. */ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; select_insn: goto *jumptable[insn->code]; @@ -375,8 +385,7 @@ select_insn: /* Function call scratches R1-R5 registers, preserves R6-R9, * and stores return value into R0. */ - R0 = (__bpf_call_base + insn->imm)(regs[1], regs[2], regs[3], - regs[4], regs[5]); + R0 = (__bpf_call_base + insn->imm)(R1, R2, R3, R4, R5); CONT; /* JMP */ @@ -500,7 +509,7 @@ select_insn: load_word: /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only * appearing in the programs where ctx == skb. All programs - * keep 'ctx' in regs[CTX_REG] == R6, sk_convert_filter() + * keep 'ctx' in regs[BPF_REG_CTX] == R6, sk_convert_filter() * saves it in R6, internal BPF verifier will check that * R6 == ctx. * @@ -556,13 +565,6 @@ load_byte: /* If we ever reach this, we have a bug somewhere. 
*/ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; -#undef CONT_JMP -#undef CONT - -#undef R0 -#undef X -#undef A -#undef K } u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, @@ -594,14 +596,14 @@ static unsigned int pkt_type_offset(void) return -1; } -static u64 __skb_get_pay_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *)(long) ctx; return __skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *)(long) ctx; struct nlattr *nla; @@ -612,17 +614,17 @@ static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) if (skb->len < sizeof(struct nlattr)) return 0; - if (A > skb->len - sizeof(struct nlattr)) + if (a > skb->len - sizeof(struct nlattr)) return 0; - nla = nla_find((struct nlattr *) &skb->data[A], skb->len - A, X); + nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *)(long) ctx; struct nlattr *nla; @@ -633,27 +635,27 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) if (skb->len < sizeof(struct nlattr)) return 0; - if (A > skb->len - sizeof(struct nlattr)) + if (a > skb->len - sizeof(struct nlattr)) return 0; - nla = (struct nlattr *) &skb->data[A]; - if (nla->nla_len > skb->len - A) + nla = (struct nlattr *) &skb->data[a]; + if (nla->nla_len > skb->len - a) return 0; - nla = nla_find_nested(nla, X); + nla = nla_find_nested(nla, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { return raw_smp_processor_id(); } /* note that this only generates 32-bit random numbers */ -static u64 __get_random_u32(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { return (u64)prandom_u32(); } @@ -668,28 +670,28 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); insn->code = BPF_LDX | BPF_MEM | BPF_H; - insn->a_reg = A_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_CTX; insn->off = offsetof(struct sk_buff, protocol); insn++; /* A = ntohs(A) [emitting a nop or swap16] */ insn->code = BPF_ALU | BPF_END | BPF_FROM_BE; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = 16; break; case SKF_AD_OFF + SKF_AD_PKTTYPE: insn->code = BPF_LDX | BPF_MEM | BPF_B; - insn->a_reg = A_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_CTX; insn->off = pkt_type_offset(); if (insn->off < 0) return false; insn++; insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = PKT_TYPE_MAX; break; @@ -699,13 +701,13 @@ static bool convert_bpf_extensions(struct sock_filter *fp, insn->code = BPF_LDX | BPF_MEM | BPF_DW; else insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = TMP_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_TMP; + insn->x_reg = BPF_REG_CTX; insn->off = offsetof(struct sk_buff, dev); insn++; insn->code = BPF_JMP | BPF_JNE | BPF_K; - 
insn->a_reg = TMP_REG; + insn->a_reg = BPF_REG_TMP; insn->imm = 0; insn->off = 1; insn++; @@ -716,8 +718,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - insn->a_reg = A_REG; - insn->x_reg = TMP_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_TMP; if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) { insn->code = BPF_LDX | BPF_MEM | BPF_W; @@ -732,8 +734,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = A_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_CTX; insn->off = offsetof(struct sk_buff, mark); break; @@ -741,8 +743,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = A_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_CTX; insn->off = offsetof(struct sk_buff, hash); break; @@ -750,8 +752,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); insn->code = BPF_LDX | BPF_MEM | BPF_H; - insn->a_reg = A_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_CTX; insn->off = offsetof(struct sk_buff, queue_mapping); break; @@ -760,8 +762,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); insn->code = BPF_LDX | BPF_MEM | BPF_H; - insn->a_reg = A_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_CTX; insn->off = offsetof(struct sk_buff, vlan_tci); insn++; @@ -769,16 +771,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = ~VLAN_TAG_PRESENT; } else { insn->code = BPF_ALU | BPF_RSH | BPF_K; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = 12; insn++; insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = 1; } break; @@ -790,20 +792,20 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = ctx */ insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = ARG1_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_ARG1; + insn->x_reg = BPF_REG_CTX; insn++; /* arg2 = A */ insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = ARG2_REG; - insn->x_reg = A_REG; + insn->a_reg = BPF_REG_ARG2; + insn->x_reg = BPF_REG_A; insn++; /* arg3 = X */ insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = ARG3_REG; - insn->x_reg = X_REG; + insn->a_reg = BPF_REG_ARG3; + insn->x_reg = BPF_REG_X; insn++; /* Emit call(ctx, arg2=A, arg3=X) */ @@ -829,8 +831,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_ALU_XOR_X: insn->code = BPF_ALU | BPF_XOR | BPF_X; - insn->a_reg = A_REG; - insn->x_reg = X_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_X; break; default: @@ -880,7 +882,7 @@ int sk_convert_filter(struct sock_filter *prog, int len, u8 bpf_src; BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); - BUILD_BUG_ON(FP_REG + 1 != MAX_BPF_REG); + BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); if (len <= 0 || len >= BPF_MAXINSNS) return -EINVAL; @@ -897,8 +899,8 @@ 
do_pass: if (new_insn) { new_insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - new_insn->a_reg = CTX_REG; - new_insn->x_reg = ARG1_REG; + new_insn->a_reg = BPF_REG_CTX; + new_insn->x_reg = BPF_REG_ARG1; } new_insn++; @@ -948,8 +950,8 @@ do_pass: break; insn->code = fp->code; - insn->a_reg = A_REG; - insn->x_reg = X_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_X; insn->imm = fp->k; break; @@ -983,16 +985,16 @@ do_pass: * in compare insn. */ insn->code = BPF_ALU | BPF_MOV | BPF_K; - insn->a_reg = TMP_REG; + insn->a_reg = BPF_REG_TMP; insn->imm = fp->k; insn++; - insn->a_reg = A_REG; - insn->x_reg = TMP_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { - insn->a_reg = A_REG; - insn->x_reg = X_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); } @@ -1027,33 +1029,33 @@ do_pass: /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = TMP_REG; - insn->x_reg = A_REG; + insn->a_reg = BPF_REG_TMP; + insn->x_reg = BPF_REG_A; insn++; insn->code = BPF_LD | BPF_ABS | BPF_B; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = fp->k; insn++; insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = 0xf; insn++; insn->code = BPF_ALU | BPF_LSH | BPF_K; - insn->a_reg = A_REG; + insn->a_reg = BPF_REG_A; insn->imm = 2; insn++; insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = X_REG; - insn->x_reg = A_REG; + insn->a_reg = BPF_REG_X; + insn->x_reg = BPF_REG_A; insn++; insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = A_REG; - insn->x_reg = TMP_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_TMP; break; /* RET_K, RET_A are remaped into 2 insns. */ @@ -1063,7 +1065,7 @@ do_pass: (BPF_RVAL(fp->code) == BPF_K ? BPF_K : BPF_X); insn->a_reg = 0; - insn->x_reg = A_REG; + insn->x_reg = BPF_REG_A; insn->imm = fp->k; insn++; @@ -1074,8 +1076,9 @@ do_pass: case BPF_ST: case BPF_STX: insn->code = BPF_STX | BPF_MEM | BPF_W; - insn->a_reg = FP_REG; - insn->x_reg = fp->code == BPF_ST ? A_REG : X_REG; + insn->a_reg = BPF_REG_FP; + insn->x_reg = fp->code == BPF_ST ? + BPF_REG_A : BPF_REG_X; insn->off = -(BPF_MEMWORDS - fp->k) * 4; break; @@ -1084,8 +1087,8 @@ do_pass: case BPF_LDX | BPF_MEM: insn->code = BPF_LDX | BPF_MEM | BPF_W; insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? - A_REG : X_REG; - insn->x_reg = FP_REG; + BPF_REG_A : BPF_REG_X; + insn->x_reg = BPF_REG_FP; insn->off = -(BPF_MEMWORDS - fp->k) * 4; break; @@ -1094,22 +1097,22 @@ do_pass: case BPF_LDX | BPF_IMM: insn->code = BPF_ALU | BPF_MOV | BPF_K; insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? - A_REG : X_REG; + BPF_REG_A : BPF_REG_X; insn->imm = fp->k; break; /* X = A */ case BPF_MISC | BPF_TAX: insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = X_REG; - insn->x_reg = A_REG; + insn->a_reg = BPF_REG_X; + insn->x_reg = BPF_REG_A; break; /* A = X */ case BPF_MISC | BPF_TXA: insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = A_REG; - insn->x_reg = X_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_X; break; /* A = skb->len or X = skb->len */ @@ -1117,16 +1120,16 @@ do_pass: case BPF_LDX | BPF_W | BPF_LEN: insn->code = BPF_LDX | BPF_MEM | BPF_W; insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? 
- A_REG : X_REG; - insn->x_reg = CTX_REG; + BPF_REG_A : BPF_REG_X; + insn->x_reg = BPF_REG_CTX; insn->off = offsetof(struct sk_buff, len); break; /* access seccomp_data fields */ case BPF_LDX | BPF_ABS | BPF_W: insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = A_REG; - insn->x_reg = CTX_REG; + insn->a_reg = BPF_REG_A; + insn->x_reg = BPF_REG_CTX; insn->off = fp->k; break; -- cgit v1.2.3-71-gd317 From 9da93f9b7cdf8ab28da6b364cdc1fafc8670b4dc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 May 2014 17:25:50 +1000 Subject: xfs: fix Q_XQUOTARM ioctl The Q_XQUOTARM quotactl was not working properly because we weren't passing around the proper flags. The xfs_fs_set_xstate() ioctl handler used the same flags for Q_XQUOTAON/OFF as well as for Q_XQUOTARM, but Q_XQUOTAON/OFF look for XFS_UQUOTA_ACCT, XFS_UQUOTA_ENFD, XFS_GQUOTA_ACCT etc, i.e. quota type + state, while Q_XQUOTARM looks only for the type of quota, i.e. XFS_DQ_USER, XFS_DQ_GROUP etc. Unfortunately these flag spaces overlap a bit, so we got semi-random results for Q_XQUOTARM; i.e. the value for XFS_DQ_USER == XFS_UQUOTA_ACCT, etc. yeargh. Add a new quotactl op vector specifically for the QUOTARM operation, since it operates with a different flag space. This has been broken more or less forever, AFAICT. Signed-off-by: Eric Sandeen Acked-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/quota/quota.c | 14 +++++++++++++- fs/xfs/xfs_quotaops.c | 29 +++++++++++++++++++++++++---- include/linux/quota.h | 1 + 3 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2b363e23f36e..ff3f0b3cfdb3 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -278,6 +278,17 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, return ret; } +static int quota_rmxquota(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->rm_xquota) + return -ENOSYS; + return sb->s_qcop->rm_xquota(sb, flags); +} + /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr, struct path *path) @@ -316,8 +327,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: case Q_XQUOTAOFF: - case Q_XQUOTARM: return quota_setxstate(sb, cmd, addr); + case Q_XQUOTARM: + return quota_rmxquota(sb, addr); case Q_XGETQSTAT: return quota_getxstate(sb, addr); case Q_XGETQSTATV: diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index af33cafe69b6..2ad1b9822e92 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -100,15 +100,35 @@ xfs_fs_set_xstate( if (!XFS_IS_QUOTA_ON(mp)) return -EINVAL; return -xfs_qm_scall_quotaoff(mp, flags); - case Q_XQUOTARM: - if (XFS_IS_QUOTA_ON(mp)) - return -EINVAL; - return -xfs_qm_scall_trunc_qfiles(mp, flags); } return -EINVAL; } +STATIC int +xfs_fs_rm_xquota( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_USER; + if (uflags & FS_GROUP_QUOTA) + flags |= XFS_DQ_GROUP; + if (uflags & FS_PROJ_QUOTA) + flags |= XFS_DQ_PROJ; + + return -xfs_qm_scall_trunc_qfiles(mp, flags); +} + STATIC int xfs_fs_get_dqblk( struct super_block *sb, @@ -149,6
+169,7 @@ const struct quotactl_ops xfs_quotactl_operations = { .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, + .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/include/linux/quota.h b/include/linux/quota.h index cc7494a35429..0f3c5d38da1f 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -329,6 +329,7 @@ struct quotactl_ops { int (*get_xstate)(struct super_block *, struct fs_quota_stat *); int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xstatev)(struct super_block *, struct fs_quota_statv *); + int (*rm_xquota)(struct super_block *, unsigned int); }; struct quota_format_type { -- cgit v1.2.3-71-gd317 From 8757ad65d30f009fe0beeb2d70d3cd834cb998f2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 11 Apr 2014 10:37:39 -0400 Subject: NVMe: Update copyright headers Make the copyright dates accurate and remove the final paragraph that includes the address of the FSF. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 4 ---- drivers/block/nvme-scsi.c | 6 +----- include/linux/nvme.h | 6 +----- include/uapi/linux/nvme.h | 6 +----- 4 files changed, 3 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 7c64fa756cce..eacd64e36be3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #include diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 2c3f5be06da1..da3b252dea6f 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1,6 +1,6 @@ /* * NVM Express device driver - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2014, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ /* diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a50173ca1d72..cfd084cab22b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1,6 +1,6 @@ /* * Definitions for the NVM Express interface - * Copyright (c) 2011-2013, Intel Corporation. + * Copyright (c) 2011-2014, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
- * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #ifndef _LINUX_NVME_H diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 096fe1c6f83d..ad9014e49693 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -1,6 +1,6 @@ /* * Definitions for the NVM Express interface - * Copyright (c) 2011-2013, Intel Corporation. + * Copyright (c) 2011-2014, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #ifndef _UAPI_LINUX_NVME_H -- cgit v1.2.3-71-gd317 From a7d2ce2832d84e0182585f63bf96ca7323b3aee7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Apr 2014 11:41:28 -0600 Subject: NVMe: Configure support for block flush This configures an nvme request_queue as flush capable if the device has a volatile write cache present. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 3 +++ include/linux/nvme.h | 1 + include/uapi/linux/nvme.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 025dd4cad4a6..e7c4fdb6a651 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1897,6 +1897,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); disk->major = nvme_major; disk->first_minor = 0; @@ -2201,6 +2203,7 @@ static int nvme_dev_add(struct nvme_dev *dev) nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; + dev->vwc = ctrl->vwc; memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index cfd084cab22b..6266373d3147 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -99,6 +99,7 @@ struct nvme_dev { u32 stripe_size; u16 oncs; u16 abort_limit; + u8 vwc; u8 initialized; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index ad9014e49693..f090336d5bad 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -73,6 +73,7 @@ enum { NVME_CTRL_ONCS_COMPARE = 1 << 0, NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_VWC_PRESENT = 1 << 0, }; struct nvme_lbaf { -- cgit v1.2.3-71-gd317 From 53562be74bd06bbe74d2acf3caca5398f8eeb160 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Apr 2014 11:41:29 -0600 Subject: NVMe: Flush with data support It is possible a filesystem may send a flush flagged bio with write data. There is no such composite NVMe command, so the driver sends flush and write separately. 
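In essence, the fix described below chains a cloned, zero-length flush bio in front of the data bio so that the parent bio cannot complete before the flush does. A condensed sketch of that core step, taken from the hunk further down (error handling and queue submission omitted; 3.15-era block API):

	struct bio *split = bio_clone(bio, GFP_ATOMIC);

	if (!split)
		return -ENOMEM;

	split->bi_iter.bi_size = 0;	/* the clone carries REQ_FLUSH but no data */
	split->bi_phys_segments = 0;
	bio->bi_rw &= ~REQ_FLUSH;	/* the original bio keeps only the write */
	bio_chain(split, bio);		/* parent completes only after the child */

Because of the chain, the parent write bio's completion is deferred until the flush child completes, which restores the ordering guarantee the filesystem expects.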
The device is allowed to execute these commands in any order, so it was possible the driver ends the bio after the write completes, but while the flush is still active. We don't want to let a filesystem believe flush succeeded before it really has; this could cause data corruption on a power loss between these events. To fix, this patch splits the flush and write into chained bios. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 44 ++++++++++++++++++++++++-------------------- include/linux/nvme.h | 1 - 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e7c4fdb6a651..cd8a8bc711cc 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -197,16 +197,13 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE) +#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_FLUSH) - return; if (ctx == CMD_CTX_ABORT) { ++nvmeq->dev->abort_limit; return; @@ -629,16 +626,6 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; } -int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) -{ - int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - special_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - return nvme_submit_flush(nvmeq, ns, cmdid); -} - static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) { struct bio *bio = iod->private; @@ -654,7 +641,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) if (bio->bi_rw & REQ_DISCARD) return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if ((bio->bi_rw & REQ_FLUSH) && !iod->nents) + if (bio->bi_rw & REQ_FLUSH) return nvme_submit_flush(nvmeq, ns, cmdid); control = 0; @@ -688,6 +675,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } +static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) +{ + struct bio *split = bio_clone(bio, GFP_ATOMIC); + if (!split) + return -ENOMEM; + + split->bi_iter.bi_size = 0; + split->bi_phys_segments = 0; + bio->bi_rw &= ~REQ_FLUSH; + bio_chain(split, bio); + + if (!waitqueue_active(&nvmeq->sq_full)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, split); + bio_list_add(&nvmeq->sq_cong, bio); + wake_up_process(nvme_thread); + + return 0; +} + /* * Called with local interrupts disabled and the q_lock held. May not sleep. 
*/ @@ -698,11 +705,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, int psegs = bio_phys_segments(ns->queue, bio); int result; - if ((bio->bi_rw & REQ_FLUSH) && psegs) { - result = nvme_submit_flush_data(nvmeq, ns); - if (result) - return result; - } + if ((bio->bi_rw & REQ_FLUSH) && psegs) + return nvme_split_flush_data(nvmeq, bio); iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); if (!iod) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6266373d3147..1813cfdb7e80 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -156,7 +156,6 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod); int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *); -int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, u32 *result); int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, -- cgit v1.2.3-71-gd317 From 76ba0aae673075c77a8b775e9133c8e8b1a44563 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 2 May 2014 16:29:18 -0700 Subject: net: Generalize checksum_init functions Create a general __skb_checksum_validate function (actually a macro) to subsume the various checksum_init functions. This function can either init the checksum, or do the full validation (logically checksum_init + skb_checksum_complete) -- a flag specifies whether full validation is performed. Also, there is a flag to the function to indicate that zero checksums are allowed (to support optional UDP checksums). Added several stub functions for calling __skb_checksum_validate. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 08074a810164..3ca0dda5a42e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2741,6 +2741,99 @@ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) 0 : __skb_checksum_complete(skb); } +/* Check if we need to perform checksum complete validation. + * + * Returns true if checksum complete is needed, false otherwise + * (either checksum is unnecessary or zero checksum is allowed). + */ +static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, + bool zero_okay, + __sum16 check) +{ + if (skb_csum_unnecessary(skb)) { + return false; + } else if (zero_okay && !check) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + return false; + } + + return true; +} + +/* For small packets <= CHECKSUM_BREAK perform checksum complete directly + * in checksum_init. + */ +#define CHECKSUM_BREAK 76 + +/* Validate (init) checksum based on checksum complete. + * + * Return values: + * 0: checksum is validated, or validation is deferred to skb_checksum_complete.
In the latter + case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo + checksum is stored in skb->csum for use in __skb_checksum_complete + non-zero: value of invalid checksum + * + */ +static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, + bool complete, + __wsum psum) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + if (!csum_fold(csum_add(psum, skb->csum))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + return 0; + } + } + + skb->csum = psum; + + if (complete || skb->len <= CHECKSUM_BREAK) + return __skb_checksum_complete(skb); + + return 0; +} + +static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) +{ + return 0; +} + +/* Perform checksum validate (init). Note that this is a macro since we only + * want to call the function that computes the pseudo header when it is + * actually needed. First we try to validate without any computation + * (checksum unnecessary), and then we calculate based on checksum complete, + * calling the supplied function to compute the pseudo header. + * + * Return values: + * 0: checksum is validated, or validation is deferred to skb_checksum_complete + * non-zero: value of invalid checksum + */ +#define __skb_checksum_validate(skb, proto, complete, \ + zero_okay, check, compute_pseudo) \ +({ \ + __sum16 __ret = 0; \ + if (__skb_checksum_validate_needed(skb, zero_okay, check)) \ + __ret = __skb_checksum_validate_complete(skb, \ + complete, compute_pseudo(skb, proto)); \ + __ret; \ +}) + +#define skb_checksum_init(skb, proto, compute_pseudo) \ + __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo) + +#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \ + __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo) + +#define skb_checksum_validate(skb, proto, compute_pseudo) \ + __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo) + +#define skb_checksum_validate_zero_check(skb, proto, check, \ + compute_pseudo) \ + __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) + +#define skb_checksum_simple_validate(skb) \ + __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) -- cgit v1.2.3-71-gd317 From bdffd893a0e9c431304142d12d9a0a21d365c502 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 29 Apr 2014 14:17:40 -0500 Subject: tracing: Replace __get_cpu_var uses with this_cpu_ptr Replace uses of &__get_cpu_var for address calculation with this_cpu_ptr.
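For illustration, a minimal before/after sketch of the idiom being converted (the per-CPU variable and the _old/_new helper names here are hypothetical, not from the patch; both forms must be called with preemption disabled):

	#include <linux/percpu.h>

	struct my_ctlblk { int state; };

	static DEFINE_PER_CPU(struct my_ctlblk, my_ctlblk);

	static struct my_ctlblk *get_ctlblk_old(void)
	{
		/* Old style: materialize the per-CPU lvalue, then take its address. */
		return &__get_cpu_var(my_ctlblk);
	}

	static struct my_ctlblk *get_ctlblk_new(void)
	{
		/* New style: compute the pointer for this CPU directly. */
		return this_cpu_ptr(&my_ctlblk);
	}

Both helpers return the same pointer; the new form simply avoids taking the address of a dereferenced per-CPU expression.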
Link: http://lkml.kernel.org/p/alpine.DEB.2.10.1404291415560.18364@gentwo.org Acked-by: Masami Hiramatsu Signed-off-by: Christoph Lameter Signed-off-by: Steven Rostedt --- include/linux/kprobes.h | 2 +- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 925eaf28fca9..7bd2ad01e39c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -355,7 +355,7 @@ static inline void reset_current_kprobe(void) static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void) { - return (&__get_cpu_var(kprobe_ctlblk)); + return this_cpu_ptr(&kprobe_ctlblk); } int register_kprobe(struct kprobe *p); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9eb1aa03a18d..38e5cf73b9ae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -822,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -853,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c392c8238bf..05431696b10c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1726,7 +1726,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ barrier(); if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.entries = this_cpu_ptr(ftrace_stack.calls); trace.max_entries = FTRACE_STACK_MAX_ENTRIES; if (regs) -- cgit v1.2.3-71-gd317 From e7c24607b5d68a4cdc56e09d70a3c8bae5f0519f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 10 Apr 2014 20:54:51 -0400 Subject: kill iov_iter_copy_from_user() all callers can use copy_page_from_iter() and it actually simplifies them. 
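For illustration, the shape of the conversion in each caller, condensed from the ceph and cifs hunks below (copy_page_from_iter() advances the iterator itself, so the manual advance goes away):

	/* Before: copy into the page, then advance the iov_iter by hand. */
	ret = iov_iter_copy_from_user(pages[n], &i, 0, plen);
	iov_iter_advance(&i, ret);

	/* After: a single call; the iterator is advanced internally. */
	ret = copy_page_from_iter(pages[n], 0, plen, &i);

Note that the argument order also changes: the page/offset/bytes triple comes first and the iterator last.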
Signed-off-by: Al Viro --- fs/ceph/file.c | 3 +-- fs/cifs/file.c | 7 +++---- include/linux/uio.h | 2 -- mm/iov_iter.c | 27 --------------------------- mm/process_vm_access.c | 6 +----- 5 files changed, 5 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 88a6df4cbe6d..ef9115e4a6fa 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -737,13 +737,12 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, left = len; for (n = 0; n < num_pages; n++) { size_t plen = min_t(size_t, left, PAGE_SIZE); - ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); + ret = copy_page_from_iter(pages[n], 0, plen, &i); if (ret != plen) { ret = -EFAULT; break; } left -= ret; - iov_iter_advance(&i, ret); } if (ret < 0) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5ed03e0b8b40..2900d150654e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2444,11 +2444,10 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, save_len = cur_len; for (i = 0; i < nr_pages; i++) { - bytes = min_t(const size_t, cur_len, PAGE_SIZE); - copied = iov_iter_copy_from_user(wdata->pages[i], &it, - 0, bytes); + bytes = min_t(size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, + &it); cur_len -= copied; - iov_iter_advance(&it, copied); /* * If we didn't copy as much as we expected, then that * may mean we trod into an unmapped area. Stop copying diff --git a/include/linux/uio.h b/include/linux/uio.h index 199bcc34241b..abbe83ded630 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -62,8 +62,6 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to); size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 10e46cd721de..22ec1ef068a8 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -129,33 +129,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. 
- */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - void iov_iter_advance(struct iov_iter *i, size_t bytes) { BUG_ON(i->count < bytes); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 8505c9262b35..f32b1fbbfe69 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -46,11 +46,7 @@ static int process_vm_rw_pages(struct page **pages, copy = len; if (vm_write) { - if (copy > iov_iter_count(iter)) - copy = iov_iter_count(iter); - copied = iov_iter_copy_from_user(page, iter, - offset, copy); - iov_iter_advance(iter, copied); + copied = copy_page_from_iter(page, offset, copy, iter); set_page_dirty_lock(page); } else { copied = copy_page_to_iter(page, offset, copy, iter); -- cgit v1.2.3-71-gd317 From f8579f8673b7ecdb7a81d5d5bb1d981093d9aa94 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Mar 2014 22:03:20 -0500 Subject: generic_file_direct_write(): switch to iov_iter Signed-off-by: Al Viro --- fs/btrfs/file.c | 6 ++---- fs/fuse/file.c | 6 ++---- fs/ocfs2/file.c | 6 +++--- fs/xfs/xfs_file.c | 5 +++-- include/linux/fs.h | 4 ++-- mm/filemap.c | 15 +++++++-------- 6 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ae6af072b635..9fe20c2052af 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1669,15 +1669,13 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - count, ocount); + iov_iter_init(&i, iov, nr_segs, count, 0); + written = generic_file_direct_write(iocb, &i, pos, count, ocount); if (written < 0 || written == count) return written; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); written_buffered = __btrfs_buffered_write(file, &i, pos); if (written_buffered < 0) { err = written_buffered; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 96d513e01a5d..126deb5d0a9c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1235,15 +1235,13 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - count, ocount); + iov_iter_init(&i, iov, nr_segs, count, 0); + written = generic_file_direct_write(iocb, &i, pos, count, ocount); if (written < 0 || written == count) goto out; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); written_buffered = fuse_perform_write(file, mapping, &i, pos); if (written_buffered < 0) { err = written_buffered; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8970dcf74de5..d6d78c2aa96e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2251,6 +2251,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); int unaligned_dio = 0; + struct iov_iter from; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2365,16 +2366,15 @@ relock: if (ret) 
goto out_dio; + iov_iter_init(&from, iov, nr_segs, count, 0); if (direct_io) { - written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, + written = generic_file_direct_write(iocb, &from, *ppos, count, ocount); if (written < 0) { ret = written; goto out_dio; } } else { - struct iov_iter from; - iov_iter_init(&from, iov, nr_segs, count, 0); current->backing_dev_info = file->f_mapping->backing_dev_info; written = generic_perform_write(file, &from, *ppos); if (likely(written >= 0)) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 951a2321ee01..8617497867c7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -641,6 +641,7 @@ xfs_file_dio_aio_write( int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; + struct iov_iter from; /* DIO must be aligned to device logical sector size */ if ((pos | count) & target->bt_logical_sectormask) @@ -698,8 +699,8 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, count, ocount); + iov_iter_init(&from, iovp, nr_segs, count, 0); + ret = generic_file_direct_write(iocb, &from, pos, count, ocount); out: xfs_rw_iunlock(ip, iolock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 878031227c57..262f96e579b8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2407,8 +2407,8 @@ int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isbl extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); -extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, - unsigned long *, loff_t, size_t, size_t); +extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, + loff_t, size_t, size_t); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); diff --git a/mm/filemap.c b/mm/filemap.c index 000a220e2a41..a840890ed39f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2385,9 +2385,8 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, - size_t count, size_t ocount) +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, + loff_t pos, size_t count, size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -2397,9 +2396,9 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, pgoff_t end; if (count != ocount) - *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + from->nr_segs = iov_shorten((struct iovec *)from->iov, from->nr_segs, count); - write_len = iov_length(iov, *nr_segs); + write_len = iov_length(from->iov, from->nr_segs); end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); @@ -2426,7 +2425,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } } - written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + written = 
mapping->a_ops->direct_IO(WRITE, iocb, from->iov, pos, from->nr_segs); /* * Finally, try again to invalidate clean pages which might have been @@ -2443,6 +2442,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (written > 0) { pos += written; + iov_iter_advance(from, written); if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); @@ -2645,11 +2645,10 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos, + written = generic_file_direct_write(iocb, &from, pos, count, ocount); if (written < 0 || written == count) goto out; - iov_iter_advance(&from, written); /* * direct-io write to a hole: fall through to buffered I/O -- cgit v1.2.3-71-gd317 From cb66a7a1f149ff705fa37cad6d1252b046e0ad4f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Mar 2014 15:24:06 -0500 Subject: kill generic_segment_checks() all callers of ->aio_read() and ->aio_write() have iov/nr_segs already checked - generic_segment_checks() done after that is just an odd way to spell iov_length(). Signed-off-by: Al Viro --- drivers/staging/lustre/lustre/llite/file.c | 10 ++---- fs/btrfs/file.c | 7 +--- fs/ceph/file.c | 13 ++------ fs/fuse/file.c | 7 +--- fs/ntfs/file.c | 5 +-- fs/ocfs2/file.c | 7 +--- fs/xfs/xfs_file.c | 9 ++--- include/linux/fs.h | 2 -- mm/filemap.c | 53 ++---------------------------- mm/shmem.c | 7 ++-- 10 files changed, 16 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index 8e844a6371e0..220bd8390a84 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -1180,9 +1180,7 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result; int refcheck; - result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (result) - return result; + count = iov_length(iov, nr_segs); env = cl_env_get(&refcheck); if (IS_ERR(env)) @@ -1235,14 +1233,10 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct lu_env *env; struct vvp_io_args *args; - size_t count = 0; + size_t count = iov_length(iov, nr_segs); ssize_t result; int refcheck; - result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (result) - return result; - env = cl_env_get(&refcheck); if (IS_ERR(env)) return PTR_ERR(env); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1dafe0701daf..a0a94a30d85a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1726,12 +1726,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - count = ocount; + count = ocount = iov_length(iov, nr_segs); current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ef9115e4a6fa..21a56c27b74c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -828,12 +828,8 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - if (!read) { - ret = generic_segment_checks(iov, &nr_segs, - &len, VERIFY_WRITE); - if (ret) - goto out; - } + if (!read) + len = iov_length(iov, nr_segs); 
iov_iter_init(&i, iov, nr_segs, len, read); @@ -855,7 +851,6 @@ again: ret = generic_file_aio_read(iocb, iov, nr_segs, pos); } -out: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); @@ -911,9 +906,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (err) - goto out; + count = iov_length(iov, nr_segs); /* We can write back this queue in page reclaim */ current->backing_dev_info = file->f_mapping->backing_dev_info; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 126deb5d0a9c..9c7f346879e7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1208,12 +1208,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, WARN_ON(iocb->ki_pos != pos); - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; - - count = ocount; + count = ocount = iov_length(iov, nr_segs); mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index db9bd8a31725..b6fa457d8d01 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2091,10 +2091,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, size_t count; /* after file limit checks */ ssize_t written, err; - count = 0; - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (err) - return err; + count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ current->backing_dev_info = mapping->backing_dev_info; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d6d78c2aa96e..d33c4ced0baf 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2355,12 +2355,7 @@ relock: /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - ret = generic_segment_checks(iov, &nr_segs, &ocount, - VERIFY_READ); - if (ret) - goto out_dio; - - count = ocount; + count = ocount = iov_length(iov, nr_segs); ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8617497867c7..f0f8084a67be 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -253,9 +253,7 @@ xfs_file_aio_read( if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); - if (ret < 0) - return ret; + size = iov_length(iovp, nr_segs); if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = @@ -777,10 +775,7 @@ xfs_file_aio_write( BUG_ON(iocb->ki_pos != pos); - ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); - if (ret) - return ret; - + ocount = iov_length(iovp, nr_segs); if (ocount == 0) return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 262f96e579b8..796de742fe4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2412,8 +2412,6 @@ extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); -extern int generic_segment_checks(const struct iovec *iov, - unsigned long *nr_segs, size_t *count, int access_flags); /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, 
const struct iovec *iov, diff --git a/mm/filemap.c b/mm/filemap.c index a840890ed39f..7c1417b0bd7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1663,45 +1663,6 @@ out: return written ? written : error; } -/* - * Performs necessary checks before doing a write - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @count: number of bytes to write - * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE - * - * Adjust number of segments and amount of bytes to write (nr_segs should be - * properly initialized first). Returns appropriate error code that caller - * should return or zero in case that write should be allowed. - */ -int generic_segment_checks(const struct iovec *iov, - unsigned long *nr_segs, size_t *count, int access_flags) -{ - unsigned long seg; - size_t cnt = 0; - for (seg = 0; seg < *nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - cnt += iv->iov_len; - if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(access_flags, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - *nr_segs = seg; - cnt -= iv->iov_len; /* This segment is no good */ - break; - } - *count = cnt; - return 0; -} -EXPORT_SYMBOL(generic_segment_checks); - /** * generic_file_aio_read - generic filesystem read routine * @iocb: kernel I/O control block @@ -1717,15 +1678,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; - ssize_t retval; + ssize_t retval = 0; size_t count; loff_t *ppos = &iocb->ki_pos; struct iov_iter i; - count = 0; - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; + count = iov_length(iov, nr_segs); iov_iter_init(&i, iov, nr_segs, count, 0); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ @@ -2615,12 +2573,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ssize_t status; struct iov_iter from; - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; - - count = ocount; + count = ocount = iov_length(iov, nr_segs); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; diff --git a/mm/shmem.c b/mm/shmem.c index 9f70e02111c6..2a93e625adaf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1412,14 +1412,11 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, unsigned long offset; enum sgp_type sgp = SGP_READ; int error = 0; - ssize_t retval; - size_t count; + ssize_t retval = 0; + size_t count = iov_length(iov, nr_segs); loff_t *ppos = &iocb->ki_pos; struct iov_iter iter; - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; iov_iter_init(&iter, iov, nr_segs, count, 0); /* -- cgit v1.2.3-71-gd317 From d8d3d94b80aa1a1c0ca75c58b8abdc7356f38418 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Mar 2014 21:27:34 -0500 Subject: pass iov_iter to ->direct_IO() unmodified, for now Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 3 +-- Documentation/filesystems/vfs.txt | 3 +-- drivers/staging/lustre/lustre/llite/rw26.c | 17 ++++++++--------- fs/9p/vfs_addr.c | 5 ++--- fs/block_dev.c | 9 +++++---- fs/btrfs/inode.c | 12 ++++++------ fs/ceph/addr.c | 4 ++-- fs/cifs/file.c | 4 ++-- fs/exofs/inode.c | 2 +- fs/ext2/inode.c | 11 ++++++----- 
fs/ext3/inode.c | 16 +++++++--------- fs/ext4/inode.c | 11 +++++------ fs/f2fs/data.c | 8 ++++---- fs/fat/inode.c | 13 +++++++------ fs/fuse/file.c | 10 +++++----- fs/gfs2/aops.c | 11 +++++------ fs/hfs/inode.c | 8 ++++---- fs/hfsplus/inode.c | 6 +++--- fs/jfs/inode.c | 8 ++++---- fs/nfs/direct.c | 8 ++++---- fs/nilfs2/inode.c | 10 +++++----- fs/ocfs2/aops.c | 7 +++---- fs/reiserfs/inode.c | 9 ++++----- fs/udf/file.c | 4 ++-- fs/udf/inode.c | 8 ++++---- fs/xfs/xfs_aops.c | 15 +++++++-------- include/linux/fs.h | 3 +-- include/linux/nfs_fs.h | 3 +-- mm/filemap.c | 9 ++++----- mm/page_io.c | 6 ++++-- 30 files changed, 117 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index eba790134253..9b0d5a33c8bf 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -196,8 +196,7 @@ prototypes: void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); - int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); + int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); int (*migratepage)(struct address_space *, struct page *, struct page *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 617f6d70c077..1846374a5add 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -589,8 +589,7 @@ struct address_space_operations { void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); - ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); + ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); struct page* (*get_xip_page)(struct address_space *, sector_t, int); /* migrate the contents of a page to the specified target */ diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index 7e3e0967993b..66e05c6f4d27 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -363,15 +363,14 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, #define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \ ~(DT_MAX_BRW_SIZE - 1)) static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t file_offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t file_offset) { struct lu_env *env; struct cl_io *io; struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct ccc_object *obj = cl_inode2ccc(inode); - long count = iov_length(iov, nr_segs); + long count = iov_length(iter->iov, iter->nr_segs); long tot_bytes = 0, result = 0; struct ll_inode_info *lli = ll_i2info(inode); unsigned long seg = 0; @@ -392,9 +391,9 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, MAX_DIO_SIZE >> PAGE_CACHE_SHIFT); /* Check that all user buffers are aligned as well */ - for (seg = 0; seg < nr_segs; seg++) { - if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) || - (iov[seg].iov_len & ~CFS_PAGE_MASK)) + for (seg = 0; seg < iter->nr_segs; seg++) { + if (((unsigned long)iter->iov[seg].iov_base & ~CFS_PAGE_MASK) || + 
(iter->iov[seg].iov_len & ~CFS_PAGE_MASK)) return -EINVAL; } @@ -411,9 +410,9 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, mutex_lock(&inode->i_mutex); LASSERT(obj->cob_transient_pages == 0); - for (seg = 0; seg < nr_segs; seg++) { - long iov_left = iov[seg].iov_len; - unsigned long user_addr = (unsigned long)iov[seg].iov_base; + for (seg = 0; seg < iter->nr_segs; seg++) { + long iov_left = iter->iov[seg].iov_len; + unsigned long user_addr = (unsigned long)iter->iov[seg].iov_base; if (rw == READ) { if (file_offset >= i_size_read(inode)) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index c71e88602ff4..cc1cfae726b3 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -259,8 +259,7 @@ static int v9fs_launder_page(struct page *page) * */ static ssize_t -v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) +v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { /* * FIXME @@ -269,7 +268,7 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", iocb->ki_filp->f_path.dentry->d_name.name, - (long long)pos, nr_segs); + (long long)pos, iter->nr_segs); return -EINVAL; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 552a8d13bc32..938fc707d769 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -165,14 +165,15 @@ blkdev_get_block(struct inode *inode, sector_t iblock, } static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_block, NULL, NULL, 0); + return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter->iov, + offset, iter->nr_segs, blkdev_get_block, + NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f805bc944fa..30a6cc51f32c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7433,8 +7433,7 @@ out: } static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -7444,8 +7443,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, bool relock = false; ssize_t ret; - if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, - offset, nr_segs)) + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter->iov, + offset, iter->nr_segs)) return 0; atomic_inc(&inode->i_dio_count); @@ -7457,7 +7456,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, * we need to flush the dirty pages again to make absolutely sure * that any outstanding dirty pages are on disk. 
*/ - count = iov_length(iov, nr_segs); + count = iov_length(iter->iov, iter->nr_segs); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) filemap_fdatawrite_range(inode->i_mapping, offset, count); @@ -7484,7 +7483,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + iter->iov, offset, iter->nr_segs, + btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { if (ret < 0 && ret != -EIOCBQUEUED) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b53278c9fd97..342ca5e423f9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1187,8 +1187,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * never get called. */ static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t pos, unsigned long nr_segs) + struct iov_iter *iter, + loff_t pos) { WARN_ON(1); return -EINVAL; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2900d150654e..a4ccc39e6c11 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3702,8 +3702,8 @@ void cifs_oplock_break(struct work_struct *work) * Direct IO is not yet supported in the cached mode. */ static ssize_t -cifs_direct_io(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) +cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t pos) { /* * FIXME diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d1c244d67667..3f9cafd73931 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -964,7 +964,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, /* TODO: Should be easy enough to do proprly */ static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { return 0; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b1d2a4675d42..47fbe760a7f8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -850,18 +850,19 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) } static ssize_t -ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - ext2_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, ext2_get_block); if (ret < 0 && (rw & WRITE)) - ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); + ext2_write_failed(mapping, offset + + iov_length(iter->iov, iter->nr_segs)); return ret; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f5157d0d1b43..7a5c501dc31b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1820,8 +1820,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) * VFS code falls back into buffered path in that case so we are safe. 
*/ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1829,10 +1828,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_length(iter->iov, iter->nr_segs); int retries = 0; - trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext3_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE) { loff_t final_size = offset + count; @@ -1856,15 +1855,15 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - ext3_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, ext3_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) ext3_truncate_failed_direct_write(inode); @@ -1909,8 +1908,7 @@ retry: ret = err; } out: - trace_ext3_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + trace_ext3_direct_IO_exit(inode, offset, count, rw, ret); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7b7462a0e13..f51db730da39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3222,8 +3222,7 @@ retake_lock: } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -3239,13 +3238,13 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_has_inline_data(inode)) return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext4_direct_IO_enter(inode, offset, iov_length(iter->iov, iter->nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iter->iov, offset, iter->nr_segs); else - ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ind_direct_IO(rw, iocb, iter->iov, offset, iter->nr_segs); trace_ext4_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + iov_length(iter->iov, iter->nr_segs), rw, ret); return ret; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 45abd60e2bff..3a6ef121c095 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1010,7 +1010,7 @@ static int check_direct_IO(struct inode *inode, int rw, } static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1019,11 +1019,11 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, if (f2fs_has_inline_data(inode)) return 0; - if (check_direct_IO(inode, rw, iov, offset, nr_segs)) + if (check_direct_IO(inode, rw, iter->iov, offset, iter->nr_segs)) return 0; - return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block); + return blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, get_data_block); } 
static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index b3361fe2bcb5..d5237a199055 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -185,8 +185,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping, } static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -203,7 +203,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * * Return 0, and fallback to normal buffered write. */ - loff_t size = offset + iov_length(iov, nr_segs); + loff_t size = offset + iov_length(iter->iov, iter->nr_segs); if (MSDOS_I(inode)->mmu_private < size) return 0; } @@ -212,10 +212,11 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - fat_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, fat_get_block); if (ret < 0 && (rw & WRITE)) - fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); + fat_write_failed(mapping, offset + + iov_length(iter->iov, iter->nr_segs)); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9c7f346879e7..17d96f36df15 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2890,8 +2890,8 @@ static inline loff_t fuse_round_up(loff_t off) } static ssize_t -fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { ssize_t ret = 0; struct file *file = iocb->ki_filp; @@ -2900,7 +2900,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_length(iter->iov, iter->nr_segs); struct fuse_io_priv *io; pos = offset; @@ -2944,9 +2944,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, io->async = false; if (rw == WRITE) - ret = __fuse_direct_write(io, iov, nr_segs, &pos); + ret = __fuse_direct_write(io, iter->iov, iter->nr_segs, &pos); else - ret = __fuse_direct_read(io, iov, nr_segs, &pos, count); + ret = __fuse_direct_read(io, iter->iov, iter->nr_segs, &pos, count); if (io->async) { fuse_aio_complete(io, ret < 0 ? 
ret : 0, -1); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ce62dcac90b6..e84ddaa42104 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1041,8 +1041,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1082,7 +1081,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, */ if (mapping->nrpages) { loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); - loff_t len = iov_length(iov, nr_segs); + loff_t len = iov_length(iter->iov, iter->nr_segs); loff_t end = PAGE_ALIGN(offset + len) - 1; rv = 0; @@ -1097,9 +1096,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, truncate_inode_pages_range(mapping, lstart, end); } - rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, gfs2_get_block_direct, - NULL, NULL, 0); + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iter->iov, offset, iter->nr_segs, + gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9e2fecd62f62..09cff13528c5 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -125,15 +125,15 @@ static int hfs_releasepage(struct page *page, gfp_t mask) } static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file)->i_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - hfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, hfs_get_block); /* * In case of error extending write may have instantiated a few @@ -141,7 +141,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + iov_length(iter->iov, iter->nr_segs); if (end > isize) hfs_write_failed(mapping, end); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index a4f45bd88a63..7f894a5b5eaf 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -123,14 +123,14 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) } static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file)->i_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, iter->nr_segs, hfsplus_get_block); /* @@ -139,7 +139,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + iov_length(iter->iov, iter->nr_segs); if (end > isize) hfsplus_write_failed(mapping, end); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 6f8fe72c2a7a..7052744d5107 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -331,15 +331,15 @@ static sector_t jfs_bmap(struct 
address_space *mapping, sector_t block) } static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - jfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, jfs_get_block); /* * In case of error extending write may have instantiated a few @@ -347,7 +347,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + iov_length(iter->iov, iter->nr_segs); if (end > isize) jfs_write_failed(mapping, end); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b8797ae6831f..e9cde3935001 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -121,20 +121,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq) * shunt off direct read and write requests before the VFS gets them, * so this method is only ever called for swap. */ -ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { #ifndef CONFIG_NFS_SWAP dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, nr_segs); + iocb->ki_filp, (long long) pos, iter->nr_segs); return -EINVAL; #else VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ || rw == KERNEL_READ) - return nfs_file_direct_read(iocb, iov, nr_segs, pos, + return nfs_file_direct_read(iocb, iter->iov, iter->nr_segs, pos, rw == READ ? true : false); - return nfs_file_direct_write(iocb, iov, nr_segs, pos, + return nfs_file_direct_write(iocb, iter->iov, iter->nr_segs, pos, rw == WRITE ? 
true : false); #endif /* CONFIG_NFS_SWAP */ } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b9c5726120e3..1c0e8fedc095 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -298,8 +298,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, } static ssize_t -nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -310,8 +310,8 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - nilfs_get_block); + size = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, nilfs_get_block); /* * In case of error extending write may have instantiated a few @@ -319,7 +319,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ if (unlikely((rw & WRITE) && size < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + iov_length(iter->iov, iter->nr_segs); if (end > isize) nilfs_write_failed(mapping, end); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d310d12a9adc..799fd0afcb35 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -599,9 +599,8 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; @@ -618,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw, return 0; return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, + iter->iov, offset, iter->nr_segs, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index bc8b8009897d..17bf4c41a509 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3083,15 +3083,14 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) /* We thank Mingming Cao for helping us understand in great detail what to do in this section of the code. 
*/ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - reiserfs_get_blocks_direct_io); + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, + iter->nr_segs, reiserfs_get_blocks_direct_io); /* * In case of error extending write may have instantiated a few @@ -3099,7 +3098,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + iov_length(iter->iov, iter->nr_segs); if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { truncate_setsize(inode, isize); diff --git a/fs/udf/file.c b/fs/udf/file.c index d2c170f8b035..ade886401658 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -119,8 +119,8 @@ static int udf_adinicb_write_end(struct file *file, } static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { /* Fallback to buffered I/O. */ return 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 5d643706212f..5b184c7f7dcb 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -217,18 +217,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, } static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, iter->nr_segs, udf_get_block); if (unlikely(ret < 0 && (rw & WRITE))) - udf_write_failed(mapping, offset + iov_length(iov, nr_segs)); + udf_write_failed(mapping, offset + iov_length(iter->iov, iter->nr_segs)); return ret; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0479c32c5eb1..330d7b1c4be3 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1449,9 +1449,8 @@ STATIC ssize_t xfs_vm_direct_IO( int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct block_device *bdev = xfs_find_bdev_for_inode(inode); @@ -1459,7 +1458,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - size_t size = iov_length(iov, nr_segs); + size_t size = iov_length(iter->iov, iter->nr_segs); /* * We cannot preallocate a size update transaction here as we @@ -1471,16 +1470,16 @@ xfs_vm_direct_IO( if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter->iov, + offset, iter->nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, DIO_ASYNC_EXTEND); if (ret != -EIOCBQUEUED && iocb->private) goto out_destroy_ioend; } else { - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter->iov, + offset, iter->nr_segs, xfs_get_blocks_direct, NULL, NULL, 0); } diff --git a/include/linux/fs.h 
b/include/linux/fs.h index 796de742fe4a..399a338c92b5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -343,8 +343,7 @@ struct address_space_operations { void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); - ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); + ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); /* diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fa6918b0f829..5a0d78ec739d 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -459,8 +459,7 @@ extern int nfs3_removexattr (struct dentry *, const char *name); /* * linux/fs/nfs/direct.c */ -extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t, - unsigned long); +extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t); extern ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, bool uio); diff --git a/mm/filemap.c b/mm/filemap.c index 7c1417b0bd7b..139641274f1e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1699,10 +1699,9 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); - if (!retval) { - retval = mapping->a_ops->direct_IO(READ, iocb, - iov, pos, nr_segs); - } + if (!retval) + retval = mapping->a_ops->direct_IO(READ, iocb, &i, pos); + if (retval > 0) { *ppos = pos + retval; count -= retval; @@ -2383,7 +2382,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, } } - written = mapping->a_ops->direct_IO(WRITE, iocb, from->iov, pos, from->nr_segs); + written = mapping->a_ops->direct_IO(WRITE, iocb, from, pos); /* * Finally, try again to invalidate clean pages which might have been diff --git a/mm/page_io.c b/mm/page_io.c index 7c59ef681381..0ed0644c73db 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -263,16 +263,18 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, .iov_base = kmap(page), .iov_len = PAGE_SIZE, }; + struct iov_iter from; init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); kiocb.ki_nbytes = PAGE_SIZE; + iov_iter_init(&from, &iov, 1, PAGE_SIZE, 0); set_page_writeback(page); unlock_page(page); ret = mapping->a_ops->direct_IO(KERNEL_WRITE, - &kiocb, &iov, - kiocb.ki_pos, 1); + &kiocb, &from, + kiocb.ki_pos); kunmap(page); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); -- cgit v1.2.3-71-gd317 From 619d30b4b8c488042b4a720ca79dccc346d1a516 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Mar 2014 21:53:33 -0500 Subject: convert the guts of nfs_direct_IO() to iov_iter Signed-off-by: Al Viro --- fs/nfs/direct.c | 46 +++++++++++++++++++++------------------------- fs/nfs/file.c | 18 ++++++++++++------ include/linux/nfs_fs.h | 4 ++-- 3 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e9cde3935001..21723149668b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -132,9 +132,9 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ || rw == KERNEL_READ) - return nfs_file_direct_read(iocb, iter->iov, iter->nr_segs, pos, + return nfs_file_direct_read(iocb, 
iter, pos, rw == READ ? true : false); - return nfs_file_direct_write(iocb, iter->iov, iter->nr_segs, pos, + return nfs_file_direct_write(iocb, iter, pos, rw == WRITE ? true : false); #endif /* CONFIG_NFS_SWAP */ } @@ -414,8 +414,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de } static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, + struct iov_iter *iter, loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; @@ -430,8 +429,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, desc.pg_dreq = dreq; atomic_inc(&inode->i_dio_count); - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; + for (seg = 0; seg < iter->nr_segs; seg++) { + const struct iovec *vec = &iter->iov[seg]; result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); if (result < 0) break; @@ -461,8 +460,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, /** * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector + * @iter: vector of user buffers into which to read data * @pos: byte offset in file where reading starts * * We use this function for direct reads instead of calling @@ -479,8 +477,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -490,7 +488,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result = -EINVAL; size_t count; - count = iov_length(iov, nr_segs); + count = iov_length(iter->iov, iter->nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", @@ -513,7 +511,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, goto out_unlock; dreq->inode = inode; - dreq->bytes_left = iov_length(iov, nr_segs); + dreq->bytes_left = count; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -524,8 +522,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - NFS_I(inode)->read_io += iov_length(iov, nr_segs); - result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); + NFS_I(inode)->read_io += count; + result = nfs_direct_read_schedule_iovec(dreq, iter, pos, uio); mutex_unlock(&inode->i_mutex); @@ -864,8 +862,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { }; static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, + struct iov_iter *iter, loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; @@ -880,9 +877,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, get_dreq(dreq); atomic_inc(&inode->i_dio_count); - NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs); - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; + 
NFS_I(dreq->inode)->write_io += iov_length(iter->iov, iter->nr_segs); + for (seg = 0; seg < iter->nr_segs; seg++) { + const struct iovec *vec = &iter->iov[seg]; result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); if (result < 0) break; @@ -911,8 +908,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block - * @iov: vector of user buffers from which to write data - * @nr_segs: size of iov vector + * @iter: vector of user buffers from which to write data * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling @@ -930,8 +926,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; @@ -942,7 +938,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, loff_t end; size_t count; - count = iov_length(iov, nr_segs); + count = iov_length(iter->iov, iter->nr_segs); end = (pos + count - 1) >> PAGE_CACHE_SHIFT; nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); @@ -993,7 +989,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos, uio); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 284ca901fe16..3d01b152894e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -169,14 +169,18 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = file_inode(iocb->ki_filp); + size_t count = iov_length(iov, nr_segs); ssize_t result; + struct iov_iter to; + + iov_iter_init(&to, iov, nr_segs, count, 0); if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); + return nfs_file_direct_read(iocb, &to, pos, true); - dprintk("NFS: read(%pD2, %lu@%lu)\n", + dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, - (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); + count, (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { @@ -643,16 +647,18 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long written = 0; ssize_t result; size_t count = iov_length(iov, nr_segs); + struct iov_iter from; + iov_iter_init(&from, iov, nr_segs, count, 0); result = nfs_key_timeout_notify(file, inode); if (result) return result; if (file->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); + return nfs_file_direct_write(iocb, &from, pos, true); - dprintk("NFS: write(%pD2, %lu@%Ld)\n", - file, (unsigned long) count, (long long) pos); + dprintk("NFS: write(%pD2, %zu@%Ld)\n", + file, count, (long long) pos); result = -EBUSY; if (IS_SWAPFILE(inode)) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 5a0d78ec739d..0a82b6fbae8a 100644 --- a/include/linux/nfs_fs.h +++ 
b/include/linux/nfs_fs.h @@ -461,10 +461,10 @@ extern int nfs3_removexattr (struct dentry *, const char *name); */ extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t); extern ssize_t nfs_file_direct_read(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, + struct iov_iter *iter, loff_t pos, bool uio); extern ssize_t nfs_file_direct_write(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, + struct iov_iter *iter, loff_t pos, bool uio); /* -- cgit v1.2.3-71-gd317 From 31b140398ce56ab41646eda7f02bcb78d6a4c916 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 01:33:16 -0500 Subject: switch {__,}blockdev_direct_IO() to iov_iter Signed-off-by: Al Viro --- fs/block_dev.c | 4 ++-- fs/btrfs/inode.c | 3 +-- fs/direct-io.c | 33 ++++++++++++++++----------------- fs/ext2/inode.c | 3 +-- fs/ext3/inode.c | 3 +-- fs/ext4/indirect.c | 7 +++---- fs/ext4/inode.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/fat/inode.c | 3 +-- fs/gfs2/aops.c | 2 +- fs/hfs/inode.c | 3 +-- fs/hfsplus/inode.c | 2 +- fs/jfs/inode.c | 3 +-- fs/nilfs2/inode.c | 4 ++-- fs/ocfs2/aops.c | 2 +- fs/reiserfs/inode.c | 4 ++-- fs/udf/inode.c | 3 +-- fs/xfs/xfs_aops.c | 10 ++++------ include/linux/fs.h | 12 ++++++------ 19 files changed, 49 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 938fc707d769..937e3011ed58 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -171,8 +171,8 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter->iov, - offset, iter->nr_segs, blkdev_get_block, + return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter, + offset, blkdev_get_block, NULL, NULL, 0); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c46a025d0c4b..b0b8fa0efba3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7483,8 +7483,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iter->iov, offset, iter->nr_segs, - btrfs_get_blocks_direct, NULL, + iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { if (ret < 0 && ret != -EIOCBQUEUED) diff --git a/fs/direct-io.c b/fs/direct-io.c index 31ba0935e32e..1c677899b989 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1107,8 +1107,8 @@ static inline int drop_refcount(struct dio *dio) */ static inline ssize_t do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + struct block_device *bdev, struct iov_iter *iter, loff_t offset, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { int seg; @@ -1143,9 +1143,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* Check the memory alignment. 
Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; + for (seg = 0; seg < iter->nr_segs; seg++) { + addr = (unsigned long)iter->iov[seg].iov_base; + size = iter->iov[seg].iov_len; end += size; if (unlikely((addr & blocksize_mask) || (size & blocksize_mask))) { @@ -1256,18 +1256,18 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (unlikely(sdio.blkfactor)) sdio.pages_in_io = 2; - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; + for (seg = 0; seg < iter->nr_segs; seg++) { + user_addr = (unsigned long)iter->iov[seg].iov_base; sdio.pages_in_io += - ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / + ((user_addr + iter->iov[seg].iov_len + PAGE_SIZE-1) / PAGE_SIZE - user_addr / PAGE_SIZE); } blk_start_plug(&plug); - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio.size += bytes = iov[seg].iov_len; + for (seg = 0; seg < iter->nr_segs; seg++) { + user_addr = (unsigned long)iter->iov[seg].iov_base; + sdio.size += bytes = iter->iov[seg].iov_len; /* Index into the first page of the first block */ sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; @@ -1288,7 +1288,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, retval = do_direct_IO(dio, &sdio, &map_bh); - dio->result += iov[seg].iov_len - + dio->result += iter->iov[seg].iov_len - ((sdio.final_block_in_request - sdio.block_in_file) << blkbits); @@ -1365,8 +1365,8 @@ out: ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + struct block_device *bdev, struct iov_iter *iter, loff_t offset, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { /* @@ -1381,9 +1381,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, prefetch(bdev->bd_queue); prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); - return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, - submit_io, flags); + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset, + get_block, end_io, submit_io, flags); } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 116e809aa7cb..36d35c36311d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -859,8 +859,7 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, ext2_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + count); return ret; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 8582ae2c80b0..4d32133a76c4 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1855,8 +1855,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, ext3_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. 
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 123898a6af05..8a57e9fcd1b9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -686,14 +686,13 @@ retry: goto locked; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter->iov, - offset, iter->nr_segs, + inode->i_sb->s_bdev, iter, offset, ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, - offset, iter->nr_segs, ext4_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b993579a968..e5718385a037 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3166,8 +3166,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, dio_flags = DIO_LOCKING; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter->iov, - offset, iter->nr_segs, + inode->i_sb->s_bdev, iter, + offset, get_block_func, ext4_end_io_dio, NULL, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3a6ef121c095..151488f27755 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1022,8 +1022,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, if (check_direct_IO(inode, rw, iter->iov, offset, iter->nr_segs)) return 0; - return blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, get_data_block); + return blockdev_direct_IO(rw, iocb, inode, iter, offset, + get_data_block); } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 154a6f9d3189..385cce464e82 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -213,8 +213,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). 
*/ - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, fat_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block); if (ret < 0 && (rw & WRITE)) fat_write_failed(mapping, offset + count); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 228a12d2afa9..910838951d66 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1097,7 +1097,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, } rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iter->iov, offset, iter->nr_segs, + iter, offset, gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index dc69e8f31581..f5fb09ebc850 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -133,8 +133,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, hfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block); /* * In case of error extending write may have instantiated a few diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index e6b1251af47a..76b930ff58ae 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -131,7 +131,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, iter->nr_segs, + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfsplus_get_block); /* diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 6cde5928693b..bd3df1ca3c9b 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -339,8 +339,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, jfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block); /* * In case of error extending write may have instantiated a few diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7aaf913e8709..6252b173a465 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -311,8 +311,8 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, nilfs_get_block); + size = blockdev_direct_IO(rw, iocb, inode, iter, offset, + nilfs_get_block); /* * In case of error extending write may have instantiated a few diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 799fd0afcb35..4a231a166cf8 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -617,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw, return 0; return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iter->iov, offset, iter->nr_segs, + iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 723affe921f1..b8003e8dd1f4 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3090,8 +3090,8 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, - iter->nr_segs, reiserfs_get_blocks_direct_io); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + reiserfs_get_blocks_direct_io); /* * In case of error extending write may have instantiated a few diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 
28984baf6194..236cd48184c2 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -226,8 +226,7 @@ static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter->iov, offset, iter->nr_segs, - udf_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block); if (unlikely(ret < 0 && (rw & WRITE))) udf_write_failed(mapping, offset + count); return ret; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6462b3186784..08d13e395252 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1470,17 +1470,15 @@ xfs_vm_direct_IO( if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter->iov, - offset, iter->nr_segs, - xfs_get_blocks_direct, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, DIO_ASYNC_EXTEND); if (ret != -EIOCBQUEUED && iocb->private) goto out_destroy_ioend; } else { - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter->iov, - offset, iter->nr_segs, - xfs_get_blocks_direct, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, NULL, NULL, 0); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 399a338c92b5..946a9484844f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2474,16 +2474,16 @@ enum { void dio_end_io(struct bio *bio, int error); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + struct block_device *bdev, struct iov_iter *iter, loff_t offset, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags); static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, - struct inode *inode, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block) + struct inode *inode, struct iov_iter *iter, loff_t offset, + get_block_t get_block) { - return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, get_block, NULL, NULL, + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iter, + offset, get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif -- cgit v1.2.3-71-gd317 From 886a39115005ced8b15ab067c9c2a8d546b40a5e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 13:50:45 -0500 Subject: new primitive: iov_iter_alignment() returns a value that is aligned as badly as the worst remaining segment in the iov_iter is. Use it instead of open-coded equivalents.
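For illustration only (not part of the patch itself): with this primitive, a filesystem's ->direct_IO() can collapse a per-segment alignment loop into a single mask test. The helper below is a hedged sketch; the name dio_iter_aligned() and its blkbits parameter are invented for the example and are not kernel API:

    /* Hypothetical helper: true when the file offset and every remaining
     * segment (base address and length) in the iterator are aligned to
     * the block size given by blkbits.
     */
    static bool dio_iter_aligned(loff_t offset, struct iov_iter *iter,
                                 unsigned int blkbits)
    {
        unsigned long blocksize_mask = (1UL << blkbits) - 1;

        /* iov_iter_alignment() ORs together the base and length of each
         * remaining segment, so one test covers the worst-aligned one.
         */
        return ((offset | iov_iter_alignment(iter)) & blocksize_mask) == 0;
    }

This is exactly the shape the do_blockdev_direct_IO() hunk below takes with "align = offset | iov_iter_alignment(iter)".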
Signed-off-by: Al Viro --- drivers/staging/lustre/lustre/llite/rw26.c | 7 ++----- fs/direct-io.c | 27 +++++---------------------- include/linux/uio.h | 2 ++ mm/iov_iter.c | 25 +++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index 38a5b580e7f0..f718585c9e08 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -391,11 +391,8 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, MAX_DIO_SIZE >> PAGE_CACHE_SHIFT); /* Check that all user buffers are aligned as well */ - for (seg = 0; seg < iter->nr_segs; seg++) { - if (((unsigned long)iter->iov[seg].iov_base & ~CFS_PAGE_MASK) || - (iter->iov[seg].iov_len & ~CFS_PAGE_MASK)) - return -EINVAL; - } + if (iov_iter_alignment(iter) & ~CFS_PAGE_MASK) + return -EINVAL; env = cl_env_get(&refcheck); LASSERT(!IS_ERR(env)); diff --git a/fs/direct-io.c b/fs/direct-io.c index 1c677899b989..adfa1fb33456 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1112,19 +1112,18 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio_submit_t submit_io, int flags) { int seg; - size_t size; - unsigned long addr; unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - loff_t end = offset; + loff_t end = offset + iov_iter_count(iter); struct dio *dio; struct dio_submit sdio = { 0, }; unsigned long user_addr; size_t bytes; struct buffer_head map_bh = { 0, }; struct blk_plug plug; + unsigned long align = offset | iov_iter_alignment(iter); if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1134,32 +1133,16 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * the early prefetch in the caller enough time. */ - if (offset & blocksize_mask) { + if (align & blocksize_mask) { if (bdev) blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if (offset & blocksize_mask) + if (align & blocksize_mask) goto out; } - /* Check the memory alignment. 
Blocks cannot straddle pages */ - for (seg = 0; seg < iter->nr_segs; seg++) { - addr = (unsigned long)iter->iov[seg].iov_base; - size = iter->iov[seg].iov_len; - end += size; - if (unlikely((addr & blocksize_mask) || - (size & blocksize_mask))) { - if (bdev) - blkbits = blksize_bits( - bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - } - } - /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && end == offset) + if (rw == READ && !iov_iter_count(iter)) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); diff --git a/include/linux/uio.h b/include/linux/uio.h index abbe83ded630..4ee17413fe1b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -67,6 +67,7 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); +unsigned long iov_iter_alignment(const struct iov_iter *i); static inline void iov_iter_init(struct iov_iter *i, const struct iovec *iov, unsigned long nr_segs, @@ -88,4 +89,5 @@ static inline size_t iov_iter_count(struct iov_iter *i) int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len); + #endif diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 22ec1ef068a8..2f762cc21080 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -195,3 +195,28 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) return min(i->count, iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); + +unsigned long iov_iter_alignment(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = (unsigned long)iov->iov_base + i->iov_offset; + n = iov->iov_len - i->iov_offset; + if (n >= size) + return res | size; + size -= n; + res |= n; + while (size > (++iov)->iov_len) { + res |= (unsigned long)iov->iov_base | iov->iov_len; + size -= iov->iov_len; + } + res |= (unsigned long)iov->iov_base | size; + return res; +} +EXPORT_SYMBOL(iov_iter_alignment); -- cgit v1.2.3-71-gd317 From ed978a811ec528dbe40243605c3afab55892f722 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 22:53:04 -0500 Subject: new helper: generic_file_read_iter() iov_iter-using variant of generic_file_aio_read(). Some callers converted. Note that it's still not quite there for use as ->read_iter() - we depend on having zero iter->iov_offset in O_DIRECT case. Fortunately, that's true for all converted callers (and for generic_file_aio_read() itself). Signed-off-by: Al Viro --- fs/ceph/file.c | 15 +----------- fs/nfs/file.c | 2 +- include/linux/fs.h | 1 + mm/filemap.c | 67 +++++++++++++++++++++++++++--------------------------- 4 files changed, 37 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d8f383d59449..910a3022eb27 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -833,24 +833,11 @@ again: /* hmm, this isn't really async... */ ret = ceph_sync_read(iocb, &i, &checkeof); } else { - /* - * We can't modify the content of iov, - * so we only read from beginning. 
- * - * When we switch generic_file_aio_read() to iov_iter, the - * if () below will be removed -- AV - */ - if (read) { - iocb->ki_pos = pos; - len = iocb->ki_nbytes; - read = 0; - iov_iter_init(&i, iov, nr_segs, len, 0); - } dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)len, ceph_cap_string(got)); - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = generic_file_read_iter(iocb, &i); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3d01b152894e..a352bc6d613f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -184,7 +184,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { - result = generic_file_aio_read(iocb, iov, nr_segs, pos); + result = generic_file_read_iter(iocb, &to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 946a9484844f..d096ebc7f348 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2404,6 +2404,7 @@ extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, unsigned long size, pgoff_t pgoff); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, diff --git a/mm/filemap.c b/mm/filemap.c index 866f4ae8223b..a7f79e90209c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1663,55 +1663,34 @@ out: return written ? written : error; } -/** - * generic_file_aio_read - generic filesystem read routine - * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @pos: current file position - * - * This is the "read()" routine for all filesystems - * that can use the page cache directly. 
- */ ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *filp = iocb->ki_filp; + struct file *file = iocb->ki_filp; ssize_t retval = 0; - size_t count; loff_t *ppos = &iocb->ki_pos; - struct iov_iter i; - - count = iov_length(iov, nr_segs); - iov_iter_init(&i, iov, nr_segs, count, 0); + loff_t pos = *ppos; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (filp->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT) { + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); loff_t size; - struct address_space *mapping; - struct inode *inode; - mapping = filp->f_mapping; - inode = mapping->host; if (!count) goto out; /* skip atime */ size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (!retval) { - struct iov_iter data = i; + struct iov_iter data = *iter; retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); } if (retval > 0) { *ppos = pos + retval; - count -= retval; - /* - * If we did a short DIO read we need to skip the - * section of the iov that we've already read data into. - */ - iov_iter_advance(&i, retval); + iov_iter_advance(iter, retval); } /* @@ -1722,16 +1701,38 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, * and return. Otherwise fallthrough to buffered io for * the rest of the read. */ - if (retval < 0 || !count || *ppos >= size) { - file_accessed(filp); + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + file_accessed(file); goto out; } } - retval = do_generic_file_read(filp, ppos, &i, retval); + retval = do_generic_file_read(file, ppos, iter, retval); out: return retval; } +EXPORT_SYMBOL(generic_file_read_iter); + +/** + * generic_file_aio_read - generic filesystem read routine + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @pos: current file position + * + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + size_t count = iov_length(iov, nr_segs); + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, 0); + return generic_file_read_iter(iocb, &i); +} EXPORT_SYMBOL(generic_file_aio_read); #ifdef CONFIG_MMU -- cgit v1.2.3-71-gd317 From 71d8e532b1549a478e6a6a8a44f309d050294d00 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 19:28:09 -0500 Subject: start adding the tag to iov_iter For now, just use the same thing we pass to ->direct_IO() - it's all iovec-based at the moment. Pass it explicitly to iov_iter_init() and account for kvec vs. iovec in there, by the same kludge NFS ->direct_IO() uses. 
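
For illustration only, a sketch of the calling convention this converges on once the direction tag is in place - essentially the shape generic_file_aio_read() takes in the filemap.c hunk below. The wrapper name is hypothetical and error handling is elided:

static ssize_t example_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct iov_iter i;

	/* the direction (READ/WRITE) is now explicit; the old trailing
	 * 'written' argument to iov_iter_init() is gone */
	iov_iter_init(&i, READ, iov, nr_segs, iov_length(iov, nr_segs));
	return generic_file_read_iter(iocb, &i);
}
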
Signed-off-by: Al Viro --- fs/btrfs/file.c | 2 +- fs/ceph/file.c | 8 ++++---- fs/cifs/file.c | 4 ++-- fs/fuse/file.c | 6 +++--- fs/nfs/file.c | 4 ++-- fs/ocfs2/file.c | 2 +- fs/pipe.c | 2 +- fs/splice.c | 2 +- fs/xfs/xfs_file.c | 4 ++-- include/linux/uio.h | 15 +++------------ mm/filemap.c | 4 ++-- mm/iov_iter.c | 15 +++++++++++++++ mm/page_io.c | 2 +- mm/process_vm_access.c | 4 ++-- mm/shmem.c | 2 +- 15 files changed, 41 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a0a94a30d85a..f8cee205618a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1740,7 +1740,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - iov_iter_init(&i, iov, nr_segs, count, 0); + iov_iter_init(&i, WRITE, iov, nr_segs, count); err = file_remove_suid(file); if (err) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 910a3022eb27..5b93cadedfbe 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -582,7 +582,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - iov_iter_init(&i, iov, nr_segs, count, 0); + iov_iter_init(&i, WRITE, iov, nr_segs, count); while (iov_iter_count(&i) > 0) { void __user *data = i.iov->iov_base + i.iov_offset; @@ -703,7 +703,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; - iov_iter_init(&i, iov, nr_segs, count, 0); + iov_iter_init(&i, WRITE, iov, nr_segs, count); while ((len = iov_iter_count(&i)) > 0) { size_t left; @@ -808,7 +808,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, int checkeof = 0, read = 0; struct iov_iter i; - iov_iter_init(&i, iov, nr_segs, len, 0); + iov_iter_init(&i, READ, iov, nr_segs, len); again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", @@ -961,7 +961,7 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - iov_iter_init(&from, iov, nr_segs, count, 0); + iov_iter_init(&from, WRITE, iov, nr_segs, count); written = generic_perform_write(file, &from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a4ccc39e6c11..15201c21ac88 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2424,7 +2424,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, else pid = current->tgid; - iov_iter_init(&it, iov, nr_segs, len, 0); + iov_iter_init(&it, WRITE, iov, nr_segs, len); do { size_t save_len; @@ -2854,7 +2854,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, if (!len) return 0; - iov_iter_init(&to, iov, nr_segs, len, 0); + iov_iter_init(&to, READ, iov, nr_segs, len); INIT_LIST_HEAD(&rdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fc54d04a41e2..4a5519ca253f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1217,7 +1217,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; - iov_iter_init(&i, iov, nr_segs, count, 0); + iov_iter_init(&i, WRITE, iov, nr_segs, count); if (count == 0) goto out; @@ -1386,7 +1386,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, struct fuse_req *req; struct iov_iter ii; - iov_iter_init(&ii, iov, nr_segs, count, 0); + iov_iter_init(&ii, write ? 
WRITE : READ, iov, nr_segs, count); if (io->async) req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); @@ -2367,7 +2367,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, if (!bytes) return 0; - iov_iter_init(&ii, iov, nr_segs, bytes, 0); + iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); while (iov_iter_count(&ii)) { struct page *page = pages[page_idx++]; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a352bc6d613f..ead8f44f7973 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -173,7 +173,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result; struct iov_iter to; - iov_iter_init(&to, iov, nr_segs, count, 0); + iov_iter_init(&to, READ, iov, nr_segs, count); if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, &to, pos, true); @@ -648,7 +648,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, ssize_t result; size_t count = iov_length(iov, nr_segs); struct iov_iter from; - iov_iter_init(&from, iov, nr_segs, count, 0); + iov_iter_init(&from, WRITE, iov, nr_segs, count); result = nfs_key_timeout_notify(file, inode); if (result) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d33c4ced0baf..9ce9ed7615c1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2361,7 +2361,7 @@ relock: if (ret) goto out_dio; - iov_iter_init(&from, iov, nr_segs, count, 0); + iov_iter_init(&from, WRITE, iov, nr_segs, count); if (direct_io) { written = generic_file_direct_write(iocb, &from, *ppos, count, ocount); diff --git a/fs/pipe.c b/fs/pipe.c index 034bffac3f97..cd4ccf07e772 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -287,7 +287,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, if (unlikely(total_len == 0)) return 0; - iov_iter_init(&iter, iov, nr_segs, total_len, 0); + iov_iter_init(&iter, READ, iov, nr_segs, total_len); do_wakeup = 0; ret = 0; diff --git a/fs/splice.c b/fs/splice.c index 9bc07d2b53cf..f99e420744c7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1548,7 +1548,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, if (ret <= 0) return ret; - iov_iter_init(&iter, iov, nr_segs, count, 0); + iov_iter_init(&iter, READ, iov, nr_segs, count); sd.len = 0; sd.total_len = count; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f0f8084a67be..762bb3e148a6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -697,7 +697,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - iov_iter_init(&from, iovp, nr_segs, count, 0); + iov_iter_init(&from, WRITE, iovp, nr_segs, count); ret = generic_file_direct_write(iocb, &from, pos, count, ocount); out: @@ -731,7 +731,7 @@ xfs_file_buffered_aio_write( if (ret) goto out; - iov_iter_init(&from, iovp, nr_segs, count, 0); + iov_iter_init(&from, WRITE, iovp, nr_segs, count); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; diff --git a/include/linux/uio.h b/include/linux/uio.h index 4ee17413fe1b..b80bbe197d13 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -20,6 +20,7 @@ struct kvec { }; struct iov_iter { + int type; const struct iovec *iov; unsigned long nr_segs; size_t iov_offset; @@ -68,18 +69,8 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); unsigned long iov_iter_alignment(const struct iov_iter *i); - -static inline void iov_iter_init(struct iov_iter *i, - const struct iovec *iov, 
unsigned long nr_segs, - size_t count, size_t written) -{ - i->iov = iov; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count + written; - - iov_iter_advance(i, written); -} +void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, + unsigned long nr_segs, size_t count); static inline size_t iov_iter_count(struct iov_iter *i) { diff --git a/mm/filemap.c b/mm/filemap.c index a7f79e90209c..3aeaf2df4135 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1730,7 +1730,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); struct iov_iter i; - iov_iter_init(&i, iov, nr_segs, count, 0); + iov_iter_init(&i, READ, iov, nr_segs, count); return generic_file_read_iter(iocb, &i); } EXPORT_SYMBOL(generic_file_aio_read); @@ -2596,7 +2596,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - iov_iter_init(&from, iov, nr_segs, count, 0); + iov_iter_init(&from, WRITE, iov, nr_segs, count); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 2f762cc21080..e2c9a2db4350 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -220,3 +220,18 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) return res; } EXPORT_SYMBOL(iov_iter_alignment); + +void iov_iter_init(struct iov_iter *i, int direction, + const struct iovec *iov, unsigned long nr_segs, + size_t count) +{ + /* It will get better. Eventually... */ + if (segment_eq(get_fs(), KERNEL_DS)) + direction |= REQ_KERNEL; + i->type = direction; + i->iov = iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_init); diff --git a/mm/page_io.c b/mm/page_io.c index 0ed0644c73db..313bfedb75d1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -268,7 +268,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); kiocb.ki_nbytes = PAGE_SIZE; - iov_iter_init(&from, &iov, 1, PAGE_SIZE, 0); + iov_iter_init(&from, KERNEL_WRITE, &iov, 1, PAGE_SIZE); set_page_writeback(page); unlock_page(page); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index f32b1fbbfe69..5077afcd9e11 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -274,7 +274,7 @@ static ssize_t process_vm_rw(pid_t pid, if (rc <= 0) goto free_iovecs; - iov_iter_init(&iter, iov_l, liovcnt, rc, 0); + iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); @@ -337,7 +337,7 @@ compat_process_vm_rw(compat_pid_t pid, &iov_l); if (rc <= 0) goto free_iovecs; - iov_iter_init(&iter, iov_l, liovcnt, rc, 0); + iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); diff --git a/mm/shmem.c b/mm/shmem.c index 2a93e625adaf..e0b76696c3f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1417,7 +1417,7 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, loff_t *ppos = &iocb->ki_pos; struct iov_iter iter; - iov_iter_init(&iter, iov, nr_segs, count, 0); + iov_iter_init(&iter, READ, iov, nr_segs, count); /* * Might this read be for a stacking filesystem? 
Then when reading -- cgit v1.2.3-71-gd317 From 7b2c99d15559e285384c742db52316802e24b0bd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Mar 2014 04:05:57 -0400 Subject: new helper: iov_iter_get_pages() iov_iter_get_pages(iter, pages, maxsize, &start) grabs references pinning the pages of up to maxsize of (contiguous) data from iter. Returns the amount of memory grabbed or -error. In case of success, the requested area begins at offset start in pages[0] and runs through pages[1], etc. Less than requested amount might be returned - either because the contiguous area in the beginning of iterator is smaller than requested, or because the kernel failed to pin that many pages. direct-io.c switched to using iov_iter_get_pages() Signed-off-by: Al Viro --- fs/direct-io.c | 111 ++++++++++++++++++---------------------------------- include/linux/uio.h | 2 + mm/iov_iter.c | 27 +++++++++++++ 3 files changed, 67 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/fs/direct-io.c b/fs/direct-io.c index 387d91989c45..4b410d58faae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -77,7 +77,6 @@ struct dio_submit { unsigned blocks_available; /* At block_in_file. changes */ int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ - unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ get_block_t *get_block; /* block mapping function */ dio_submit_t *submit_io; /* IO submition function */ @@ -98,19 +97,14 @@ struct dio_submit { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* - * Page fetching state. These variables belong to dio_refill_pages(). - */ - int curr_page; /* changes */ - int total_pages; /* doesn't change */ - unsigned long curr_user_address;/* changes */ - + struct iov_iter *iter; /* * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ + size_t from, to; }; /* dio_state communicated between submission path and end_io */ @@ -163,15 +157,10 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { - int ret; - int nr_pages; + ssize_t ret; - nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); - ret = get_user_pages_fast( - sdio->curr_user_address, /* Where from? */ - nr_pages, /* How many pages? */ - dio->rw == READ, /* Write to memory? */ - &dio->pages[0]); /* Put results here */ + ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, + &sdio->from); if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); @@ -186,18 +175,19 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) dio->pages[0] = page; sdio->head = 0; sdio->tail = 1; - ret = 0; - goto out; + sdio->from = 0; + sdio->to = PAGE_SIZE; + return 0; } if (ret >= 0) { - sdio->curr_user_address += ret * PAGE_SIZE; - sdio->curr_page += ret; + iov_iter_advance(sdio->iter, ret); + ret += sdio->from; sdio->head = 0; - sdio->tail = ret; - ret = 0; + sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1; + return 0; } -out: return ret; } @@ -208,8 +198,9 @@ out: * L1 cache. 
*/ static inline struct page *dio_get_page(struct dio *dio, - struct dio_submit *sdio) + struct dio_submit *sdio, size_t *from, size_t *to) { + int n; if (dio_pages_present(sdio) == 0) { int ret; @@ -218,7 +209,10 @@ static inline struct page *dio_get_page(struct dio *dio, return ERR_PTR(ret); BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[sdio->head++]; + n = sdio->head++; + *from = n ? 0 : sdio->from; + *to = (n == sdio->tail - 1) ? sdio->to : PAGE_SIZE; + return dio->pages[n]; } /** @@ -422,8 +416,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) */ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(sdio)) - page_cache_release(dio_get_page(dio, sdio)); + while (sdio->head < sdio->tail) + page_cache_release(dio->pages[sdio->head++]); } /* @@ -912,23 +906,18 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; - struct page *page; - unsigned block_in_page; int ret = 0; - /* The I/O can start at any block offset within the first page */ - block_in_page = sdio->first_block_in_page; - while (sdio->block_in_file < sdio->final_block_in_request) { - page = dio_get_page(dio, sdio); + struct page *page; + size_t from, to; + page = dio_get_page(dio, sdio, &from, &to); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } - while (block_in_page < blocks_per_page) { - unsigned offset_in_page = block_in_page << blkbits; + while (from < to) { unsigned this_chunk_bytes; /* # of bytes mapped */ unsigned this_chunk_blocks; /* # of blocks */ unsigned u; @@ -999,10 +988,9 @@ do_holes: page_cache_release(page); goto out; } - zero_user(page, block_in_page << blkbits, - 1 << blkbits); + zero_user(page, from, 1 << blkbits); sdio->block_in_file++; - block_in_page++; + from += 1 << blkbits; dio->result += 1 << blkbits; goto next_block; } @@ -1020,7 +1008,7 @@ do_holes: * can add to this page */ this_chunk_blocks = sdio->blocks_available; - u = (PAGE_SIZE - offset_in_page) >> blkbits; + u = (to - from) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = sdio->final_block_in_request - sdio->block_in_file; @@ -1032,7 +1020,7 @@ do_holes: if (this_chunk_blocks == sdio->blocks_available) sdio->boundary = buffer_boundary(map_bh); ret = submit_page_section(dio, sdio, page, - offset_in_page, + from, this_chunk_bytes, sdio->next_block_for_io, map_bh); @@ -1043,9 +1031,9 @@ do_holes: sdio->next_block_for_io += this_chunk_blocks; sdio->block_in_file += this_chunk_blocks; - block_in_page += this_chunk_blocks; + from += this_chunk_bytes; + dio->result += this_chunk_bytes; sdio->blocks_available -= this_chunk_blocks; - dio->result += this_chunk_blocks << blkbits; next_block: BUG_ON(sdio->block_in_file > sdio->final_block_in_request); if (sdio->block_in_file == sdio->final_block_in_request) @@ -1054,7 +1042,6 @@ next_block: /* Drop the ref which was taken in get_user_pages() */ page_cache_release(page); - block_in_page = 0; } out: return ret; @@ -1122,7 +1109,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct dio *dio; struct dio_submit sdio = { 0, }; unsigned long user_addr; - size_t bytes; struct buffer_head map_bh = { 0, }; struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); @@ -1234,6 +1220,10 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; + 
sdio.iter = iter;
+	sdio.final_block_in_request =
+		(offset + iov_iter_count(iter)) >> blkbits;
+
	/*
	 * In case of non-aligned buffers, we may need 2 more
	 * pages since we need to zero out first and last block.
	 */
@@ -1250,34 +1240,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	blk_start_plug(&plug);
-	for (seg = 0; seg < iter->nr_segs; seg++) {
-		user_addr = (unsigned long)iter->iov[seg].iov_base;
-		sdio.size += bytes = iter->iov[seg].iov_len;
-
-		/* Index into the first page of the first block */
-		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		sdio.final_block_in_request = sdio.block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		sdio.head = 0;
-		sdio.tail = 0;
-		sdio.curr_page = 0;
-
-		sdio.total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			sdio.total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		sdio.curr_user_address = user_addr;
-
-		retval = do_direct_IO(dio, &sdio, &map_bh);
-
-		if (retval) {
-			dio_cleanup(dio, &sdio);
-			break;
-		}
-	} /* end iovec loop */
+	retval = do_direct_IO(dio, &sdio, &map_bh);
+	if (retval)
+		dio_cleanup(dio, &sdio);
	if (retval == -ENOTBLK) {
		/*
diff --git a/include/linux/uio.h b/include/linux/uio.h
index b80bbe197d13..341986116d83 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -71,6 +71,8 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
			unsigned long nr_segs, size_t count);
+ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
+			size_t maxsize, size_t *start);
 
 static inline size_t iov_iter_count(struct iov_iter *i)
 {
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index e2c9a2db4350..45204cd5ccd8 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -235,3 +235,30 @@ void iov_iter_init(struct iov_iter *i, int direction,
 	i->count = count;
 }
 EXPORT_SYMBOL(iov_iter_init);
+
+ssize_t iov_iter_get_pages(struct iov_iter *i,
+		   struct page **pages, size_t maxsize,
+		   size_t *start)
+{
+	size_t offset = i->iov_offset;
+	const struct iovec *iov = i->iov;
+	size_t len;
+	unsigned long addr;
+	int n;
+	int res;
+
+	len = iov->iov_len - offset;
+	if (len > i->count)
+		len = i->count;
+	if (len > maxsize)
+		len = maxsize;
+	addr = (unsigned long)iov->iov_base + offset;
+	len += *start = addr & (PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
+	res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
+	if (unlikely(res < 0))
+		return res;
+	return (res == n ? len : res * PAGE_SIZE) - *start;
+}
+EXPORT_SYMBOL(iov_iter_get_pages);
-- cgit v1.2.3-71-gd317 From f67da30c1d5fc9e341bc8121708874bfd7b31e45 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 19 Mar 2014 01:16:16 -0400 Subject: new helper: iov_iter_npages() counts the pages covered by iov_iter, up to the given limit. do_blockdev_direct_IO() and fuse_iter_npages() switched to it.
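
For illustration only, a sketch (not from the series) of how this pairs with iov_iter_get_pages() from the previous patch: iov_iter_npages() bounds the page array, iov_iter_get_pages() fills it. EXAMPLE_MAX_PAGES and the function name are made up; the caller is assumed to supply a pages[] array with room for EXAMPLE_MAX_PAGES entries:

#define EXAMPLE_MAX_PAGES 64	/* made-up bound for the sketch */

static ssize_t example_pin_pages(struct iov_iter *iter, struct page **pages)
{
	size_t start;
	int npages = iov_iter_npages(iter, EXAMPLE_MAX_PAGES);

	/* pins pages from the first contiguous run of the iterator;
	 * returns the number of bytes covered (possibly fewer than
	 * requested), starting at offset 'start' within pages[0] */
	return iov_iter_get_pages(iter, pages, npages * PAGE_SIZE, &start);
}
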
Signed-off-by: Al Viro --- fs/direct-io.c | 9 +-------- fs/fuse/file.c | 16 ++-------------- include/linux/uio.h | 1 + mm/iov_iter.c | 27 +++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/direct-io.c b/fs/direct-io.c index 4b410d58faae..98040ba388ac 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1100,7 +1100,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - int seg; unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; @@ -1108,7 +1107,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, loff_t end = offset + iov_iter_count(iter); struct dio *dio; struct dio_submit sdio = { 0, }; - unsigned long user_addr; struct buffer_head map_bh = { 0, }; struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); @@ -1231,12 +1229,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (unlikely(sdio.blkfactor)) sdio.pages_in_io = 2; - for (seg = 0; seg < iter->nr_segs; seg++) { - user_addr = (unsigned long)iter->iov[seg].iov_base; - sdio.pages_in_io += - ((user_addr + iter->iov[seg].iov_len + PAGE_SIZE-1) / - PAGE_SIZE - user_addr / PAGE_SIZE); - } + sdio.pages_in_io += iov_iter_npages(iter, INT_MAX); blk_start_plug(&plug); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7db564d18dc6..7026014717bc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1310,7 +1310,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, while (nbytes < *nbytesp && req->num_pages < req->max_pages) { unsigned npages; - size_t start, end, frag_size; + size_t start; unsigned n = req->max_pages - req->num_pages; ssize_t ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], @@ -1344,19 +1344,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, static inline int fuse_iter_npages(const struct iov_iter *ii_p) { - struct iov_iter ii = *ii_p; - int npages = 0; - - while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { - unsigned long user_addr = fuse_get_user_addr(&ii); - unsigned offset = user_addr & ~PAGE_MASK; - size_t frag_size = iov_iter_single_seg_count(&ii); - - npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - iov_iter_advance(&ii, frag_size); - } - - return min(npages, FUSE_MAX_PAGES_PER_REQ); + return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); } ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, diff --git a/include/linux/uio.h b/include/linux/uio.h index 341986116d83..2f8825b06680 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -73,6 +73,7 @@ void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, size_t *start); +int iov_iter_npages(const struct iov_iter *i, int maxpages); static inline size_t iov_iter_count(struct iov_iter *i) { diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 45204cd5ccd8..0b677f8f9bad 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -262,3 +262,30 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, return (res == n ? 
len : res * PAGE_SIZE) - *start; } EXPORT_SYMBOL(iov_iter_get_pages); + +int iov_iter_npages(const struct iov_iter *i, int maxpages) +{ + size_t offset = i->iov_offset; + size_t size = i->count; + const struct iovec *iov = i->iov; + int npages = 0; + int n; + + for (n = 0; size && n < i->nr_segs; n++, iov++) { + unsigned long addr = (unsigned long)iov->iov_base + offset; + size_t len = iov->iov_len - offset; + offset = 0; + if (unlikely(!len)) /* empty segment */ + continue; + if (len > size) + len = size; + npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE + - addr / PAGE_SIZE; + if (npages >= maxpages) /* don't bother going further */ + return maxpages; + size -= len; + offset = 0; + } + return min(npages, maxpages); +} +EXPORT_SYMBOL(iov_iter_npages); -- cgit v1.2.3-71-gd317 From 91f79c43d1b54d7154b118860d81b39bad07dfff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Mar 2014 04:58:33 -0400 Subject: new helper: iov_iter_get_pages_alloc() same as iov_iter_get_pages(), except that pages array is allocated (kmalloc if possible, vmalloc if that fails) and left for caller to free. Lustre and NFS ->direct_IO() switched to it. Signed-off-by: Al Viro --- drivers/staging/lustre/lustre/llite/rw26.c | 92 ++++----- fs/nfs/direct.c | 290 +++++++++-------------------- include/linux/uio.h | 2 + mm/iov_iter.c | 40 ++++ 4 files changed, 167 insertions(+), 257 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index f718585c9e08..6b5994577b6b 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -218,14 +218,11 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) int i; for (i = 0; i < npages; i++) { - if (pages[i] == NULL) - break; if (do_dirty) set_page_dirty_lock(pages[i]); page_cache_release(pages[i]); } - - OBD_FREE_LARGE(pages, npages * sizeof(*pages)); + kvfree(pages); } ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, @@ -370,10 +367,9 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct ccc_object *obj = cl_inode2ccc(inode); - long count = iov_iter_count(iter); - long tot_bytes = 0, result = 0; + ssize_t count = iov_iter_count(iter); + ssize_t tot_bytes = 0, result = 0; struct ll_inode_info *lli = ll_i2info(inode); - unsigned long seg = 0; long size = MAX_DIO_SIZE; int refcheck; @@ -407,63 +403,49 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, mutex_lock(&inode->i_mutex); LASSERT(obj->cob_transient_pages == 0); - for (seg = 0; seg < iter->nr_segs; seg++) { - long iov_left = iter->iov[seg].iov_len; - unsigned long user_addr = (unsigned long)iter->iov[seg].iov_base; + while (iov_iter_count(iter)) { + struct page **pages; + size_t offs; + count = min_t(size_t, iov_iter_count(iter), size); if (rw == READ) { if (file_offset >= i_size_read(inode)) break; - if (file_offset + iov_left > i_size_read(inode)) - iov_left = i_size_read(inode) - file_offset; + if (file_offset + count > i_size_read(inode)) + count = i_size_read(inode) - file_offset; } - while (iov_left > 0) { - struct page **pages; - int page_count, max_pages = 0; - long bytes; - - bytes = min(size, iov_left); - page_count = ll_get_user_pages(rw, user_addr, bytes, - &pages, &max_pages); - if (likely(page_count > 0)) { - if (unlikely(page_count < max_pages)) - bytes = page_count << PAGE_CACHE_SHIFT; - result = ll_direct_IO_26_seg(env, 
io, rw, inode, - file->f_mapping, - bytes, file_offset, - pages, page_count); - ll_free_user_pages(pages, max_pages, rw==READ); - } else if (page_count == 0) { - GOTO(out, result = -EFAULT); - } else { - result = page_count; - } - if (unlikely(result <= 0)) { - /* If we can't allocate a large enough buffer - * for the request, shrink it to a smaller - * PAGE_SIZE multiple and try again. - * We should always be able to kmalloc for a - * page worth of page pointers = 4MB on i386. */ - if (result == -ENOMEM && - size > (PAGE_CACHE_SIZE / sizeof(*pages)) * - PAGE_CACHE_SIZE) { - size = ((((size / 2) - 1) | - ~CFS_PAGE_MASK) + 1) & - CFS_PAGE_MASK; - CDEBUG(D_VFSTRACE,"DIO size now %lu\n", - size); - continue; - } - - GOTO(out, result); + result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); + if (likely(result > 0)) { + int n = (result + offs + PAGE_SIZE - 1) / PAGE_SIZE; + result = ll_direct_IO_26_seg(env, io, rw, inode, + file->f_mapping, + result, file_offset, + pages, n); + ll_free_user_pages(pages, n, rw==READ); + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. */ + if (result == -ENOMEM && + size > (PAGE_CACHE_SIZE / sizeof(*pages)) * + PAGE_CACHE_SIZE) { + size = ((((size / 2) - 1) | + ~CFS_PAGE_MASK) + 1) & + CFS_PAGE_MASK; + CDEBUG(D_VFSTRACE,"DIO size now %lu\n", + size); + continue; } - tot_bytes += result; - file_offset += result; - iov_left -= result; - user_addr += result; + GOTO(out, result); } + iov_iter_advance(iter, result); + tot_bytes += result; + file_offset += result; } out: LASSERT(obj->cob_transient_pages == 0); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1d34f454989e..b122fe21fea0 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -322,60 +322,37 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { * handled automatically by nfs_direct_read_result(). Otherwise, if * no requests have been sent, just return an error. 
*/ -static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, - const struct iovec *iov, - loff_t pos, bool uio) -{ - struct nfs_direct_req *dreq = desc->pg_dreq; - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t rsize = NFS_SERVER(inode)->rsize; - unsigned int pgbase; - int result; - ssize_t started = 0; - struct page **pagevec = NULL; - unsigned int npages; - - do { - size_t bytes; - int i; - pgbase = user_addr & ~PAGE_MASK; - bytes = min(max_t(size_t, rsize, PAGE_SIZE), count); +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + struct iov_iter *iter, + loff_t pos) +{ + struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); - result = -ENOMEM; - npages = nfs_page_array_len(pgbase, bytes); - if (!pagevec) - pagevec = kmalloc(npages * sizeof(struct page *), - GFP_KERNEL); - if (!pagevec) - break; - if (uio) { - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 1, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; - } else { - WARN_ON(npages != 1); - result = get_kernel_page(user_addr, 1, pagevec); - if (WARN_ON(result != 1)) - break; - } + NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, + &nfs_direct_read_completion_ops); + get_dreq(dreq); + desc.pg_dreq = dreq; + atomic_inc(&inode->i_dio_count); - if ((unsigned)result < npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(pagevec, result); - break; - } - bytes -= pgbase; - npages = result; - } + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + result = iov_iter_get_pages_alloc(iter, &pagevec, + rsize, &pgbase); + if (result < 0) + break; + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); @@ -389,55 +366,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de } req->wb_index = pos >> PAGE_SHIFT; req->wb_offset = pos & ~PAGE_MASK; - if (!nfs_pageio_add_request(desc, req)) { - result = desc->pg_error; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; nfs_release_request(req); break; } pgbase = 0; bytes -= req_len; - started += req_len; - user_addr += req_len; + requested_bytes += req_len; pos += req_len; - count -= req_len; dreq->bytes_left -= req_len; } - /* The nfs_page now hold references to these pages */ nfs_direct_release_pages(pagevec, npages); - } while (count != 0 && result >= 0); - - kfree(pagevec); - - if (started) - return started; - return result < 0 ? 
(ssize_t) result : -EFAULT; -} - -static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, - struct iov_iter *iter, - loff_t pos, bool uio) -{ - struct nfs_pageio_descriptor desc; - struct inode *inode = dreq->inode; - ssize_t result = -EINVAL; - size_t requested_bytes = 0; - unsigned long seg; - - NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, - &nfs_direct_read_completion_ops); - get_dreq(dreq); - desc.pg_dreq = dreq; - atomic_inc(&inode->i_dio_count); - - for (seg = 0; seg < iter->nr_segs; seg++) { - const struct iovec *vec = &iter->iov[seg]; - result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); + kvfree(pagevec); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) - break; - pos += vec->iov_len; } nfs_pageio_complete(&desc); @@ -521,7 +464,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, dreq->iocb = iocb; NFS_I(inode)->read_io += count; - result = nfs_direct_read_schedule_iovec(dreq, iter, pos, uio); + result = nfs_direct_read_schedule_iovec(dreq, iter, pos); mutex_unlock(&inode->i_mutex); @@ -677,109 +620,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode } #endif -/* - * NB: Return the value of the first error return code. Subsequent - * errors after the first one are ignored. - */ -/* - * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE - * operation. If nfs_writedata_alloc() or get_user_pages() fails, - * bail and stop sending more writes. Write length accounting is - * handled automatically by nfs_direct_write_result(). Otherwise, if - * no requests have been sent, just return an error. - */ -static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, - const struct iovec *iov, - loff_t pos, bool uio) -{ - struct nfs_direct_req *dreq = desc->pg_dreq; - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t wsize = NFS_SERVER(inode)->wsize; - unsigned int pgbase; - int result; - ssize_t started = 0; - struct page **pagevec = NULL; - unsigned int npages; - - do { - size_t bytes; - int i; - - pgbase = user_addr & ~PAGE_MASK; - bytes = min(max_t(size_t, wsize, PAGE_SIZE), count); - - result = -ENOMEM; - npages = nfs_page_array_len(pgbase, bytes); - if (!pagevec) - pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL); - if (!pagevec) - break; - - if (uio) { - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 0, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; - } else { - WARN_ON(npages != 1); - result = get_kernel_page(user_addr, 0, pagevec); - if (WARN_ON(result != 1)) - break; - } - - if ((unsigned)result < npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(pagevec, result); - break; - } - bytes -= pgbase; - npages = result; - } - - for (i = 0; i < npages; i++) { - struct nfs_page *req; - unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], - pgbase, req_len); - if (IS_ERR(req)) { - result = PTR_ERR(req); - break; - } - nfs_lock_request(req); - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; - if (!nfs_pageio_add_request(desc, req)) { - result = desc->pg_error; - nfs_unlock_and_release_request(req); - break; - } - pgbase = 0; - bytes -= req_len; - 
started += req_len; - user_addr += req_len; - pos += req_len; - count -= req_len; - dreq->bytes_left -= req_len; - } - /* The nfs_page now hold references to these pages */ - nfs_direct_release_pages(pagevec, npages); - } while (count != 0 && result >= 0); - - kfree(pagevec); - - if (started) - return started; - return result < 0 ? (ssize_t) result : -EFAULT; -} - static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; @@ -859,15 +699,27 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .completion = nfs_direct_write_completion, }; + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. + */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter, - loff_t pos, bool uio) + loff_t pos) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; ssize_t result = 0; size_t requested_bytes = 0; - unsigned long seg; + size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE, &nfs_direct_write_completion_ops); @@ -875,16 +727,50 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, get_dreq(dreq); atomic_inc(&inode->i_dio_count); - NFS_I(dreq->inode)->write_io += iov_iter_count(iter); - for (seg = 0; seg < iter->nr_segs; seg++) { - const struct iovec *vec = &iter->iov[seg]; - result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); + NFS_I(inode)->write_io += iov_iter_count(iter); + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + + result = iov_iter_get_pages_alloc(iter, &pagevec, + wsize, &pgbase); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + + req = nfs_create_request(dreq->ctx, inode, + pagevec[i], + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + nfs_lock_request(req); + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + dreq->bytes_left -= req_len; + } + nfs_direct_release_pages(pagevec, npages); + kvfree(pagevec); + if (result < 0) break; - pos += vec->iov_len; } nfs_pageio_complete(&desc); @@ -985,7 +871,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_write_schedule_iovec(dreq, iter, pos, uio); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, diff --git a/include/linux/uio.h b/include/linux/uio.h index 2f8825b06680..4876e9f2a58f 100644 --- a/include/linux/uio.h +++ 
b/include/linux/uio.h @@ -73,6 +73,8 @@ void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, size_t *start); +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, + size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); static inline size_t iov_iter_count(struct iov_iter *i) diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 0b677f8f9bad..a5c691c1a283 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -1,6 +1,8 @@ #include #include #include +#include +#include size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) @@ -263,6 +265,44 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, } EXPORT_SYMBOL(iov_iter_get_pages); +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + size_t offset = i->iov_offset; + const struct iovec *iov = i->iov; + size_t len; + unsigned long addr; + void *p; + int n; + int res; + + len = iov->iov_len - offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + addr = (unsigned long)iov->iov_base + offset; + len += *start = addr & (PAGE_SIZE - 1); + addr &= ~(PAGE_SIZE - 1); + n = (len + PAGE_SIZE - 1) / PAGE_SIZE; + + p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); + if (!p) + p = vmalloc(n * sizeof(struct page *)); + if (!p) + return -ENOMEM; + + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + if (unlikely(res < 0)) { + kvfree(p); + return res; + } + *pages = p; + return (res == n ? len : res * PAGE_SIZE) - *start; +} +EXPORT_SYMBOL(iov_iter_get_pages_alloc); + int iov_iter_npages(const struct iov_iter *i, int maxpages) { size_t offset = i->iov_offset; -- cgit v1.2.3-71-gd317 From 0c949334a9e2581646c6ff0d1470a805b1e5be99 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Mar 2014 06:51:37 -0400 Subject: iov_iter_truncate() Now It Can Be Done(tm) - we don't need to do iov_shorten() in generic_file_direct_write() anymore, now that all ->direct_IO() instances are converted to proper iov_iter methods and honour iter->count and iter->iov_offset properly. Get rid of count/ocount arguments of generic_file_direct_write(), while we are at it. 
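
For illustration, a sketch of the pattern the conversions below converge on: run generic_write_checks(), clamp the iterator, and let ->direct_IO() honour the clamped count. The function name is made up; locking, suid handling and the buffered fallback are elided:

static ssize_t example_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	loff_t pos = iocb->ki_pos;
	size_t count = iov_iter_count(from);
	int err;

	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		return err;
	/* clamp in place - no more iov_shorten() or ocount bookkeeping */
	iov_iter_truncate(from, count);
	return generic_file_direct_write(iocb, from, pos);
}
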
Signed-off-by: Al Viro --- fs/btrfs/file.c | 17 ++++++++--------- fs/fuse/file.c | 18 ++++++++---------- fs/ocfs2/file.c | 10 +++++----- fs/xfs/xfs_file.c | 9 +++++---- include/linux/fs.h | 3 +-- include/linux/uio.h | 6 ++++++ mm/filemap.c | 19 +++++++------------ 7 files changed, 40 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f8cee205618a..ea63a51c148c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1659,8 +1659,7 @@ again: static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from, - loff_t pos, - size_t count, size_t ocount) + loff_t pos) { struct file *file = iocb->ki_filp; ssize_t written; @@ -1668,9 +1667,9 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from, pos, count, ocount); + written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) + if (written < 0 || !iov_iter_count(from)) return written; pos += written; @@ -1720,13 +1719,14 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; - size_t count, ocount; + size_t count; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); struct iov_iter i; mutex_lock(&inode->i_mutex); - count = ocount = iov_length(iov, nr_segs); + count = iov_length(iov, nr_segs); + iov_iter_init(&i, WRITE, iov, nr_segs, count); current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); @@ -1740,7 +1740,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - iov_iter_init(&i, WRITE, iov, nr_segs, count); + iov_iter_truncate(&i, count); err = file_remove_suid(file); if (err) { @@ -1783,8 +1783,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(iocb, &i, - pos, count, ocount); + num_written = __btrfs_direct_write(iocb, &i, pos); } else { num_written = __btrfs_buffered_write(file, &i, pos); if (num_written > 0) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7026014717bc..66d2d5de19d2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1188,8 +1188,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - size_t count = 0; - size_t ocount = 0; + size_t count; ssize_t written = 0; ssize_t written_buffered = 0; struct inode *inode = mapping->host; @@ -1208,7 +1207,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, WARN_ON(iocb->ki_pos != pos); - count = ocount = iov_length(iov, nr_segs); + count = iov_length(iov, nr_segs); + iov_iter_init(&i, WRITE, iov, nr_segs, count); mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -1217,11 +1217,11 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; - iov_iter_init(&i, WRITE, iov, nr_segs, count); if (count == 0) goto out; + iov_iter_truncate(&i, count); err = file_remove_suid(file); if (err) goto out; @@ -1231,8 +1231,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, &i, pos, count, ocount); - 
if (written < 0 || written == count) + written = generic_file_direct_write(iocb, &i, pos); + if (written < 0 || !iov_iter_count(&i)) goto out; pos += written; @@ -1469,8 +1469,7 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io, res = generic_write_checks(file, ppos, &count, 0); if (!res) { - if (iter->count > count) - iter->count = count; + iov_iter_truncate(iter, count); res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE); } @@ -2896,8 +2895,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, if (offset >= i_size) return 0; count = min_t(loff_t, count, fuse_round_up(i_size - offset)); - if (iter->count > count) - iter->count = count; + iov_iter_truncate(iter, count); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9ce9ed7615c1..06b6a16d9776 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2241,7 +2241,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, int ret, direct_io, appending, rw_level, have_alloc_sem = 0; int can_do_direct, has_refcount = 0; ssize_t written = 0; - size_t ocount; /* original count */ size_t count; /* after file limit checks */ loff_t old_size, *ppos = &iocb->ki_pos; u32 old_clusters; @@ -2253,6 +2252,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, int unaligned_dio = 0; struct iov_iter from; + count = iov_length(iov, nr_segs); + iov_iter_init(&from, WRITE, iov, nr_segs, count); + trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, file->f_path.dentry->d_name.len, @@ -2355,16 +2357,14 @@ relock: /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - count = ocount = iov_length(iov, nr_segs); ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) goto out_dio; - iov_iter_init(&from, WRITE, iov, nr_segs, count); + iov_iter_truncate(&from, count); if (direct_io) { - written = generic_file_direct_write(iocb, &from, *ppos, - count, ocount); + written = generic_file_direct_write(iocb, &from, *ppos); if (written < 0) { ret = written; goto out_dio; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 762bb3e148a6..c997aa2751b2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -626,7 +626,7 @@ xfs_file_dio_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t count) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -634,7 +634,6 @@ xfs_file_dio_aio_write( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; - size_t count = ocount; int unaligned_io = 0; int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 
@@ -645,6 +644,8 @@ xfs_file_dio_aio_write( if ((pos | count) & target->bt_logical_sectormask) return -XFS_ERROR(EINVAL); + iov_iter_init(&from, WRITE, iovp, nr_segs, count); + /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; @@ -676,6 +677,7 @@ xfs_file_dio_aio_write( ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) goto out; + iov_iter_truncate(&from, count); if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, @@ -697,8 +699,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - iov_iter_init(&from, WRITE, iovp, nr_segs, count); - ret = generic_file_direct_write(iocb, &from, pos, count, ocount); + ret = generic_file_direct_write(iocb, &from, pos); out: xfs_rw_iunlock(ip, iolock); diff --git a/include/linux/fs.h b/include/linux/fs.h index d096ebc7f348..8153396d19b4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2407,8 +2407,7 @@ extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsig extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); -extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, - loff_t, size_t, size_t); +extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); diff --git a/include/linux/uio.h b/include/linux/uio.h index 4876e9f2a58f..532f59d0adbb 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -82,6 +82,12 @@ static inline size_t iov_iter_count(struct iov_iter *i) return i->count; } +static inline void iov_iter_truncate(struct iov_iter *i, size_t count) +{ + if (i->count > count) + i->count = count; +} + int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len); diff --git a/mm/filemap.c b/mm/filemap.c index 3aeaf2df4135..c0404b763a17 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2345,8 +2345,7 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, - loff_t pos, size_t count, size_t ocount) +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -2356,10 +2355,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, pgoff_t end; struct iov_iter data; - if (count != ocount) - from->nr_segs = iov_shorten((struct iovec *)from->iov, from->nr_segs, count); - - write_len = iov_length(from->iov, from->nr_segs); + write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); @@ -2568,7 +2564,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - size_t ocount; /* 
original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; @@ -2577,7 +2572,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ssize_t status; struct iov_iter from; - count = ocount = iov_length(iov, nr_segs); + count = iov_length(iov, nr_segs); + iov_iter_init(&from, WRITE, iov, nr_segs, count); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -2588,6 +2584,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; + iov_iter_truncate(&from, count); + err = file_remove_suid(file); if (err) goto out; @@ -2596,14 +2594,11 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - iov_iter_init(&from, WRITE, iov, nr_segs, count); - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - written = generic_file_direct_write(iocb, &from, pos, - count, ocount); + written = generic_file_direct_write(iocb, &from, pos); if (written < 0 || written == count) goto out; -- cgit v1.2.3-71-gd317 From 7f7f25e82d54870df24d415a7007fbd327da027b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 17:49:24 -0500 Subject: replace checking for ->read/->aio_read presence with check in ->f_mode Since we are about to introduce new methods (read_iter/write_iter), the tests in a bunch of places would have to grow inconveniently. Check once (at open() time) and store results in ->f_mode as FMODE_CAN_READ and FMODE_CAN_WRITE resp. It might end up being a temporary measure - once everything switches from ->aio_{read,write} to ->{read,write}_iter it might make sense to return to open-coded checks. We'll see... Signed-off-by: Al Viro --- drivers/mtd/nand/nandsim.c | 4 ++-- drivers/usb/gadget/storage_common.c | 4 ++-- fs/file_table.c | 4 ++++ fs/open.c | 4 ++++ fs/read_write.c | 14 +++++++------- include/linux/fs.h | 4 ++++ 6 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index 42e8a770e631..4f0d83648e5a 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -575,12 +575,12 @@ static int alloc_device(struct nandsim *ns) cfile = filp_open(cache_file, O_CREAT | O_RDWR | O_LARGEFILE, 0600); if (IS_ERR(cfile)) return PTR_ERR(cfile); - if (!cfile->f_op->read && !cfile->f_op->aio_read) { + if (!(cfile->f_mode & FMODE_CAN_READ)) { NS_ERR("alloc_device: cache file not readable\n"); err = -EINVAL; goto err_close; } - if (!cfile->f_op->write && !cfile->f_op->aio_write) { + if (!(cfile->f_mode & FMODE_CAN_WRITE)) { NS_ERR("alloc_device: cache file not writeable\n"); err = -EINVAL; goto err_close; diff --git a/drivers/usb/gadget/storage_common.c b/drivers/usb/gadget/storage_common.c index ec20a1f50c2d..a8898df131ed 100644 --- a/drivers/usb/gadget/storage_common.c +++ b/drivers/usb/gadget/storage_common.c @@ -220,11 +220,11 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) * If we can't read the file, it's no good. * If we can't write the file, use it read-only. 
*/ - if (!(filp->f_op->read || filp->f_op->aio_read)) { + if (!(filp->f_mode & FMODE_CAN_READ)) { LINFO(curlun, "file not readable: %s\n", filename); goto out; } - if (!(filp->f_op->write || filp->f_op->aio_write)) + if (!(filp->f_mode & FMODE_CAN_WRITE)) ro = 1; size = i_size_read(inode->i_mapping->host); diff --git a/fs/file_table.c b/fs/file_table.c index a374f5033e97..be73cbc48c12 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -175,6 +175,10 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; + if ((mode & FMODE_READ) && likely(fop->read || fop->aio_read)) + mode |= FMODE_CAN_READ; + if ((mode & FMODE_WRITE) && likely(fop->write || fop->aio_write)) + mode |= FMODE_CAN_WRITE; file->f_mode = mode; file->f_op = fop; if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) diff --git a/fs/open.c b/fs/open.c index 9d64679cec73..39d3d0468ee6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -725,6 +725,10 @@ static int do_dentry_open(struct file *f, } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); + if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->aio_read)) + f->f_mode |= FMODE_CAN_READ; + if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->aio_write)) + f->f_mode |= FMODE_CAN_WRITE; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); diff --git a/fs/read_write.c b/fs/read_write.c index 31c6efa43183..d29d2a361d2c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -396,7 +396,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) if (!(file->f_mode & FMODE_READ)) return -EBADF; - if (!file->f_op->read && !file->f_op->aio_read) + if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) return -EFAULT; @@ -445,7 +445,7 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t const char __user *p; ssize_t ret; - if (!file->f_op->write && !file->f_op->aio_write) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; old_fs = get_fs(); @@ -472,7 +472,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - if (!file->f_op->write && !file->f_op->aio_write) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; @@ -785,7 +785,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, { if (!(file->f_mode & FMODE_READ)) return -EBADF; - if (!file->f_op->aio_read && !file->f_op->read) + if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; return do_readv_writev(READ, file, vec, vlen, pos); @@ -798,7 +798,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, { if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - if (!file->f_op->aio_write && !file->f_op->write) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; return do_readv_writev(WRITE, file, vec, vlen, pos); @@ -964,7 +964,7 @@ static size_t compat_readv(struct file *file, goto out; ret = -EINVAL; - if (!file->f_op->aio_read && !file->f_op->read) + if (!(file->f_mode & FMODE_CAN_READ)) goto out; ret = compat_do_readv_writev(READ, file, vec, vlen, pos); @@ -1041,7 +1041,7 @@ static size_t compat_writev(struct file *file, goto out; ret = -EINVAL; - if (!file->f_op->aio_write && !file->f_op->write) + if (!(file->f_mode & FMODE_CAN_WRITE)) goto out; ret = 
compat_do_readv_writev(WRITE, file, vec, vlen, pos); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8153396d19b4..75eb71357adf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -128,6 +128,10 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)0x10000) +/* Has read method(s) */ +#define FMODE_CAN_READ ((__force fmode_t)0x20000) +/* Has write method(s) */ +#define FMODE_CAN_WRITE ((__force fmode_t)0x40000) /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) -- cgit v1.2.3-71-gd317 From 293bc9822fa9b3c9d4b7893bcb241e085580771a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 18:37:41 -0500 Subject: new methods: ->read_iter() and ->write_iter() Beginning to introduce those. Just the callers for now, and it's clumsier than it'll eventually become; once we finish converting aio_read and aio_write instances, the things will get nicer. For now, these guys are in parallel to ->aio_read() and ->aio_write(); they take iocb and iov_iter, with everything in iov_iter already validated. File offset is passed in iocb->ki_pos, iov/nr_segs - in iov_iter. Main concerns in that series are stack footprint and ability to split the damn thing cleanly. [fix from Peter Ujfalusi folded] Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 + Documentation/filesystems/vfs.txt | 10 ++++- fs/aio.c | 14 +++++- fs/file_table.c | 6 ++- fs/open.c | 6 ++- fs/read_write.c | 90 ++++++++++++++++++++++++++++++++++++--- include/linux/fs.h | 6 +++ 7 files changed, 121 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 9b0d5a33c8bf..b18dd1779029 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -430,6 +430,8 @@ prototypes: ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); + ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 1846374a5add..a1d0d7a30165 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -806,6 +806,8 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); + ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); @@ -836,11 +838,15 @@ otherwise noted. 
read: called by read(2) and related system calls - aio_read: called by io_submit(2) and other asynchronous I/O operations + aio_read: vectored, possibly asynchronous read + + read_iter: possibly asynchronous read with iov_iter as destination write: called by write(2) and related system calls - aio_write: called by io_submit(2) and other asynchronous I/O operations + aio_write: vectored, possibly asynchronous write + + write_iter: possibly asynchronous write with iov_iter as source iterate: called when the VFS needs to read the directory contents diff --git a/fs/aio.c b/fs/aio.c index a0ed6c7d2cd2..56b28607c32d 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1241,6 +1241,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); +typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int rw, char __user *buf, @@ -1298,7 +1299,9 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, int rw; fmode_t mode; aio_rw_op *rw_op; + rw_iter_op *iter_op; struct iovec inline_vec, *iovec = &inline_vec; + struct iov_iter iter; switch (opcode) { case IOCB_CMD_PREAD: @@ -1306,6 +1309,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, mode = FMODE_READ; rw = READ; rw_op = file->f_op->aio_read; + iter_op = file->f_op->read_iter; goto rw_common; case IOCB_CMD_PWRITE: @@ -1313,12 +1317,13 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, mode = FMODE_WRITE; rw = WRITE; rw_op = file->f_op->aio_write; + iter_op = file->f_op->write_iter; goto rw_common; rw_common: if (unlikely(!(file->f_mode & mode))) return -EBADF; - if (!rw_op) + if (!rw_op && !iter_op) return -EINVAL; ret = (opcode == IOCB_CMD_PREADV || @@ -1347,7 +1352,12 @@ rw_common: if (rw == WRITE) file_start_write(file); - ret = rw_op(req, iovec, nr_segs, req->ki_pos); + if (iter_op) { + iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); + ret = iter_op(req, &iter); + } else { + ret = rw_op(req, iovec, nr_segs, req->ki_pos); + } if (rw == WRITE) file_end_write(file); diff --git a/fs/file_table.c b/fs/file_table.c index be73cbc48c12..f8cc881fbbfb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -175,9 +175,11 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; - if ((mode & FMODE_READ) && likely(fop->read || fop->aio_read)) + if ((mode & FMODE_READ) && + likely(fop->read || fop->aio_read || fop->read_iter)) mode |= FMODE_CAN_READ; - if ((mode & FMODE_WRITE) && likely(fop->write || fop->aio_write)) + if ((mode & FMODE_WRITE) && + likely(fop->write || fop->aio_write || fop->write_iter)) mode |= FMODE_CAN_WRITE; file->f_mode = mode; file->f_op = fop; diff --git a/fs/open.c b/fs/open.c index 39d3d0468ee6..36662d036237 100644 --- a/fs/open.c +++ b/fs/open.c @@ -725,9 +725,11 @@ static int do_dentry_open(struct file *f, } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); - if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->aio_read)) + if ((f->f_mode & FMODE_READ) && + likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; - if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->aio_write)) + if ((f->f_mode & FMODE_WRITE) && + likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; f->f_flags &= 
~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); diff --git a/fs/read_write.c b/fs/read_write.c index d29d2a361d2c..fe2f9d5e3536 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -25,6 +25,7 @@ typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, unsigned long, loff_t); +typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, @@ -390,6 +391,27 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp EXPORT_SYMBOL(do_sync_read); +ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + iov_iter_init(&iter, READ, &iov, 1, len); + + ret = filp->f_op->read_iter(&kiocb, &iter); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(new_sync_read); + ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; @@ -406,8 +428,10 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) count = ret; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); - else + else if (file->f_op->aio_read) ret = do_sync_read(file, buf, count, pos); + else + ret = new_sync_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); @@ -439,6 +463,27 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + iov_iter_init(&iter, WRITE, &iov, 1, len); + + ret = filp->f_op->write_iter(&kiocb, &iter); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(new_sync_write); + ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; @@ -455,8 +500,10 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t count = MAX_RW_COUNT; if (file->f_op->write) ret = file->f_op->write(file, p, count, pos); - else + else if (file->f_op->aio_write) ret = do_sync_write(file, p, count, pos); + else + ret = new_sync_write(file, p, count, pos); set_fs(old_fs); if (ret > 0) { fsnotify_modify(file); @@ -483,8 +530,10 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); - else + else if (file->f_op->aio_write) ret = do_sync_write(file, buf, count, pos); + else + ret = new_sync_write(file, buf, count, pos); if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); @@ -601,6 +650,25 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) } EXPORT_SYMBOL(iov_shorten); +static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov, + unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn) +{ + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + 
+ init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + + iov_iter_init(&iter, rw, iov, nr_segs, len); + ret = fn(&kiocb, &iter); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) { @@ -738,6 +806,7 @@ static ssize_t do_readv_writev(int type, struct file *file, ssize_t ret; io_fn_t fn; iov_fn_t fnv; + iter_fn_t iter_fn; ret = rw_copy_check_uvector(type, uvector, nr_segs, ARRAY_SIZE(iovstack), iovstack, &iov); @@ -753,13 +822,18 @@ static ssize_t do_readv_writev(int type, struct file *file, if (type == READ) { fn = file->f_op->read; fnv = file->f_op->aio_read; + iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + iter_fn = file->f_op->write_iter; file_start_write(file); } - if (fnv) + if (iter_fn) + ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, + pos, iter_fn); + else if (fnv) ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, pos, fnv); else @@ -912,6 +986,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, ssize_t ret; io_fn_t fn; iov_fn_t fnv; + iter_fn_t iter_fn; ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); @@ -927,13 +1002,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (type == READ) { fn = file->f_op->read; fnv = file->f_op->aio_read; + iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + iter_fn = file->f_op->write_iter; file_start_write(file); } - if (fnv) + if (iter_fn) + ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, + pos, iter_fn); + else if (fnv) ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, pos, fnv); else diff --git a/include/linux/fs.h b/include/linux/fs.h index 75eb71357adf..17535e0a4547 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1451,6 +1451,8 @@ struct block_device_operations; #define HAVE_COMPAT_IOCTL 1 #define HAVE_UNLOCKED_IOCTL 1 +struct iov_iter; + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -1458,6 +1460,8 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); + ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); @@ -2415,6 +2419,8 @@ extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); +extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); +extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, -- 
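For a sense of what this transitional state looks like from a filesystem's side, here is a minimal sketch (hypothetical; the examplefs names are invented and not part of this series) of a page-cache-backed filesystem exposing the new ->read_iter alongside a still-unconverted write path:

#include <linux/fs.h>

/*
 * Hypothetical transitional file_operations: read(2) reaches
 * ->read_iter through the new new_sync_read() wrapper, while
 * writes still go through do_sync_write()/->aio_write.
 */
static const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= new_sync_read,
	.read_iter	= generic_file_read_iter,
	.write		= do_sync_write,
	.aio_write	= generic_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,
	.splice_read	= generic_file_splice_read,
};

The next patch in the series performs exactly the remaining half of this conversion, swapping the do_sync_write()/->aio_write pair for new_sync_write()/->write_iter across the simple filesystems.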
cgit v1.2.3-71-gd317 From 8174202b34c30e0c07231bf63f18ab29af634f0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Apr 2014 03:17:43 -0400 Subject: write_iter variants of {__,}generic_file_aio_write() Signed-off-by: Al Viro --- fs/9p/vfs_file.c | 8 +++---- fs/adfs/file.c | 4 ++-- fs/affs/file.c | 4 ++-- fs/bfs/file.c | 4 ++-- fs/ecryptfs/file.c | 4 ++-- fs/exofs/file.c | 4 ++-- fs/ext2/file.c | 4 ++-- fs/ext3/file.c | 4 ++-- fs/f2fs/file.c | 4 ++-- fs/fat/file.c | 4 ++-- fs/hfs/inode.c | 4 ++-- fs/hfsplus/inode.c | 4 ++-- fs/hostfs/hostfs_kern.c | 4 ++-- fs/hpfs/file.c | 4 ++-- fs/jffs2/file.c | 4 ++-- fs/jfs/file.c | 4 ++-- fs/logfs/file.c | 4 ++-- fs/minix/file.c | 4 ++-- fs/nilfs2/file.c | 4 ++-- fs/omfs/file.c | 4 ++-- fs/ramfs/file-mmu.c | 4 ++-- fs/ramfs/file-nommu.c | 4 ++-- fs/reiserfs/file.c | 4 ++-- fs/sysv/file.c | 4 ++-- fs/ufs/file.c | 4 ++-- include/linux/fs.h | 2 ++ mm/filemap.c | 61 ++++++++++++++++++++++++++++++------------------- mm/shmem.c | 4 ++-- mm/vmscan.c | 2 +- 29 files changed, 94 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 47e0597d1e9b..b9b5f979a2ca 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -763,7 +763,7 @@ err_out: buff_write: mutex_unlock(&inode->i_mutex); - return do_sync_write(filp, data, count, offsetp); + return new_sync_write(filp, data, count, offsetp); } /** @@ -781,7 +781,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, if (filp->f_flags & O_DIRECT) return v9fs_direct_write(filp, data, count, offset); - return do_sync_write(filp, data, count, offset); + return new_sync_write(filp, data, count, offset); } @@ -851,7 +851,7 @@ const struct file_operations v9fs_cached_file_operations = { .read = v9fs_cached_file_read, .write = v9fs_cached_file_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, @@ -864,7 +864,7 @@ const struct file_operations v9fs_cached_file_operations_dotl = { .read = v9fs_cached_file_read, .write = v9fs_cached_file_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 3bfc9efa29b4..07c9edce5aa7 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -27,8 +27,8 @@ const struct file_operations adfs_file_operations = { .read_iter = generic_file_read_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, }; diff --git a/fs/affs/file.c b/fs/affs/file.c index 982853f17afc..9df23175e28b 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -29,8 +29,8 @@ const struct file_operations affs_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = affs_file_open, .release = affs_file_release, diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 0aa788892f93..e7f88ace1a25 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -25,8 +25,8 @@ const struct file_operations bfs_file_operations = { .llseek = 
generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, }; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index b32827c917e1..db0fad3269c0 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -353,8 +353,8 @@ const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = ecryptfs_read_update_atime, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 90d394da7471..5b7f6be5a2d5 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -68,9 +68,9 @@ static int exofs_flush(struct file *file, fl_owner_t id) const struct file_operations exofs_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .release = exofs_release_file, diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 407305072597..970c6aca15cc 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -63,9 +63,9 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5439d2f0141b..c833b1226d4d 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -51,9 +51,9 @@ static int ext3_release_file (struct inode * inode, struct file * filp) const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0e01fb0bc97c..22f4900dd8eb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -680,9 +680,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) const struct file_operations f2fs_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .open = generic_file_open, .mmap = f2fs_file_mmap, .fsync = f2fs_sync_file, diff --git a/fs/fat/file.c b/fs/fat/file.c index 29285e990c90..85f79a89e747 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -171,9 +171,9 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = generic_file_read_iter, - 
.aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 6d4055aff109..d0929bc81782 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -676,8 +676,8 @@ static const struct file_operations hfs_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .fsync = hfs_file_fsync, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index cccc89e47cb6..0cf786f2d046 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -343,8 +343,8 @@ static const struct file_operations hfsplus_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .fsync = hfsplus_file_fsync, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ce0005d8ffeb..bb529f3b7f2b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -381,8 +381,8 @@ static const struct file_operations hostfs_file_fops = { .read = new_sync_read, .splice_read = generic_file_splice_read, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, - .write = do_sync_write, + .write_iter = generic_file_write_iter, + .write = new_sync_write, .mmap = generic_file_mmap, .open = hostfs_file_open, .release = hostfs_file_release, diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index bacb478a4990..7f54e5f76cec 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -199,8 +199,8 @@ const struct file_operations hpfs_file_ops = .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .release = hpfs_file_release, .fsync = hpfs_file_fsync, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 9192127d591c..64989ca9ba90 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -53,8 +53,8 @@ const struct file_operations jffs2_file_operations = .open = generic_file_open, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .unlocked_ioctl=jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index a5d8299b2208..cc744ecaf51f 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -151,10 +151,10 @@ const struct inode_operations jfs_file_inode_operations = { const struct file_operations jfs_file_operations = { .open = jfs_open, .llseek = generic_file_llseek, - .write = do_sync_write, + .write = new_sync_write, .read = new_sync_read, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 1ca8026dc664..8538752df2f6 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ 
-265,14 +265,14 @@ const struct inode_operations logfs_reg_iops = { const struct file_operations logfs_reg_fops = { .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .fsync = logfs_fsync, .unlocked_ioctl = logfs_ioctl, .llseek = generic_file_llseek, .mmap = generic_file_readonly_mmap, .open = generic_file_open, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, }; const struct address_space_operations logfs_reg_aops = { diff --git a/fs/minix/file.c b/fs/minix/file.c index 607b47145325..a967de085ac0 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -16,8 +16,8 @@ const struct file_operations minix_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index dcb1b0e8b435..24978153c0c4 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -153,9 +153,9 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 3bf28da9f0df..902e88527fce 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -338,9 +338,9 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block) const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 30ffb367bc0b..6ea0b9718a9d 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -33,8 +33,8 @@ const struct file_operations ramfs_file_operations = { .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = noop_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 416db04f8464..9ed420f8f3ca 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -39,8 +39,8 @@ const struct file_operations ramfs_file_operations = { .get_unmapped_area = ramfs_nommu_get_unmapped_area, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 7592d681fd8c..7c8ecd6468db 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -236,7 +236,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, const struct 
file_operations reiserfs_file_operations = { .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, @@ -246,7 +246,7 @@ const struct file_operations reiserfs_file_operations = { .release = reiserfs_file_release, .fsync = reiserfs_sync_file, .read_iter = generic_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, diff --git a/fs/sysv/file.c b/fs/sysv/file.c index d99be8877388..b00811c75b24 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -23,8 +23,8 @@ const struct file_operations sysv_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/ufs/file.c b/fs/ufs/file.c index b6b402989e6b..c84ec010a676 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -37,8 +37,8 @@ const struct file_operations ufs_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .fsync = generic_file_fsync, diff --git a/include/linux/fs.h b/include/linux/fs.h index 17535e0a4547..4b221637f09e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2414,7 +2414,9 @@ int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isbl extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long); +extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); +extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); diff --git a/mm/filemap.c b/mm/filemap.c index c0404b763a17..d2d9eeec8bf0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2542,10 +2542,9 @@ again: EXPORT_SYMBOL(generic_perform_write); /** - * __generic_file_aio_write - write data to a file + * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) - * @iov: vector with data to write - * @nr_segs: number of segments in the vector + * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates @@ -2559,21 +2558,16 @@ EXPORT_SYMBOL(generic_perform_write); * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_mutex. 
*/ -ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs) +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - size_t count; /* after file limit checks */ struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; ssize_t written = 0; ssize_t err; ssize_t status; - struct iov_iter from; - - count = iov_length(iov, nr_segs); - iov_iter_init(&from, WRITE, iov, nr_segs, count); + size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -2584,7 +2578,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - iov_iter_truncate(&from, count); + iov_iter_truncate(from, count); err = file_remove_suid(file); if (err) @@ -2598,7 +2592,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - written = generic_file_direct_write(iocb, &from, pos); + written = generic_file_direct_write(iocb, from, pos); if (written < 0 || written == count) goto out; @@ -2609,7 +2603,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, pos += written; count -= written; - status = generic_perform_write(file, &from, pos); + status = generic_perform_write(file, from, pos); /* * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were @@ -2641,7 +2635,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_perform_write(file, &from, pos); + written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; } @@ -2649,30 +2643,36 @@ out: current->backing_dev_info = NULL; return written ? written : err; } +EXPORT_SYMBOL(__generic_file_write_iter); + +ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs) +{ + size_t count = iov_length(iov, nr_segs); + struct iov_iter from; + + iov_iter_init(&from, WRITE, iov, nr_segs, count); + return __generic_file_write_iter(iocb, &from); +} EXPORT_SYMBOL(__generic_file_aio_write); /** - * generic_file_aio_write - write data to a file + * generic_file_write_iter - write data to a file * @iocb: IO state structure - * @iov: vector with data to write - * @nr_segs: number of segments in the vector - * @pos: position in file where to write + * @from: iov_iter with data to write * - * This is a wrapper around __generic_file_aio_write() to be used by most + * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_mutex as needed. 
*/ -ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write(iocb, iov, nr_segs); + ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (ret > 0) { @@ -2684,6 +2684,19 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } return ret; } +EXPORT_SYMBOL(generic_file_write_iter); + +ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + size_t count = iov_length(iov, nr_segs); + struct iov_iter from; + + BUG_ON(iocb->ki_pos != pos); + + iov_iter_init(&from, WRITE, iov, nr_segs, count); + return generic_file_write_iter(iocb, &from); +} EXPORT_SYMBOL(generic_file_aio_write); /** diff --git a/mm/shmem.c b/mm/shmem.c index edc6c7e817e9..d3e5c6fc313c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2618,9 +2618,9 @@ static const struct file_operations shmem_file_operations = { #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = shmem_file_read_iter, - .aio_write = generic_file_aio_write, + .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, diff --git a/mm/vmscan.c b/mm/vmscan.c index 32c661d66a45..9c2dba6ac685 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -458,7 +458,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in __generic_file_aio_write() against + * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * -- cgit v1.2.3-71-gd317 From 1456c0a87c4241d3a801651019e66983c69ad17d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Apr 2014 03:21:50 -0400 Subject: blkdev_aio_write() - turn into blkdev_write_iter() Signed-off-by: Al Viro --- drivers/char/raw.c | 4 ++-- fs/block_dev.c | 16 ++++++---------- include/linux/fs.h | 3 +-- 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index cfb607a64b85..0102dc788608 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -286,8 +286,8 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations raw_fops = { .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = do_sync_write, - .aio_write = blkdev_aio_write, + .write = new_sync_write, + .write_iter = blkdev_write_iter, .fsync = blkdev_fsync, .open = raw_open, .release = raw_release, diff --git a/fs/block_dev.c b/fs/block_dev.c index 3d97f4a257ff..4e36b8ea8aa4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1509,28 +1509,24 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) * Does not take i_mutex for the write and thus is not for general purpose * use. 
*/ -ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct blk_plug plug; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs); + ret = __generic_file_write_iter(iocb, from); if (ret > 0) { ssize_t err; - - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL_GPL(blkdev_aio_write); +EXPORT_SYMBOL_GPL(blkdev_write_iter); static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -1577,9 +1573,9 @@ const struct file_operations def_blk_fops = { .release = blkdev_close, .llseek = block_llseek, .read = new_sync_read, - .write = do_sync_write, + .write = new_sync_write, .read_iter = blkdev_read_iter, - .aio_write = blkdev_aio_write, + .write_iter = blkdev_write_iter, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b221637f09e..1b9b6c59abdd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2425,8 +2425,7 @@ extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); /* fs/block_dev.c */ -extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); +extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync); extern void block_sync_page(struct page *page); -- cgit v1.2.3-71-gd317 From a8f3550cd228b6edc5d17fce1a9af8cc7004f185 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Apr 2014 03:32:25 -0400 Subject: bury __generic_file_aio_write() all users converted to __generic_file_write_iter() now Signed-off-by: Al Viro --- include/linux/fs.h | 1 - mm/filemap.c | 11 ----------- 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1b9b6c59abdd..99817c9e665e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2413,7 +2413,6 @@ extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); -extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index d2d9eeec8bf0..7dcdb9db710d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2645,17 +2645,6 @@ out: } EXPORT_SYMBOL(__generic_file_write_iter); -ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs) -{ - size_t count = iov_length(iov, nr_segs); - struct iov_iter from; - - iov_iter_init(&from, WRITE, iov, nr_segs, count); - return __generic_file_write_iter(iocb, &from); -} 
-EXPORT_SYMBOL(__generic_file_aio_write); - /** * generic_file_write_iter - write data to a file * @iocb: IO state structure -- cgit v1.2.3-71-gd317 From f0d1bec9d58d4c038d0ac958c9af82be6eb18045 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Apr 2014 15:05:18 -0400 Subject: new helper: copy_page_from_iter() parallel to copy_page_to_iter(). pipe_write() switched to it (and became ->write_iter()). Signed-off-by: Al Viro --- fs/pipe.c | 129 ++++++++-------------------------------------------- include/linux/uio.h | 2 + mm/iov_iter.c | 78 +++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 110 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 05ccb00cb407..21981e58e2a6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -116,50 +116,6 @@ void pipe_wait(struct pipe_inode_info *pipe) pipe_lock(pipe); } -static int -pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - if (__copy_from_user_inatomic(to, iov->iov_base, copy)) - return -EFAULT; - } else { - if (copy_from_user(to, iov->iov_base, copy)) - return -EFAULT; - } - to += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -/* - * Pre-fault in the user memory, so we can use atomic copies. - */ -static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - fault_in_pages_readable(iov->iov_base, this_len); - len -= this_len; - iov++; - } -} - static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -380,24 +336,19 @@ static inline int is_packetized(struct file *file) } static ssize_t -pipe_write(struct kiocb *iocb, const struct iovec *_iov, - unsigned long nr_segs, loff_t ppos) +pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - ssize_t ret; - int do_wakeup; - struct iovec *iov = (struct iovec *)_iov; - size_t total_len; + ssize_t ret = 0; + int do_wakeup = 0; + size_t total_len = iov_iter_count(from); ssize_t chars; - total_len = iov_length(iov, nr_segs); /* Null write succeeds. 
*/ if (unlikely(total_len == 0)) return 0; - do_wakeup = 0; - ret = 0; __pipe_lock(pipe); if (!pipe->readers) { @@ -416,38 +367,19 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { - int error, atomic = 1; - void *addr; - - error = ops->confirm(pipe, buf); + int error = ops->confirm(pipe, buf); if (error) goto out; - iov_fault_in_pages_read(iov, chars); -redo1: - if (atomic) - addr = kmap_atomic(buf->page); - else - addr = kmap(buf->page); - error = pipe_iov_copy_from_user(offset + addr, iov, - chars, atomic); - if (atomic) - kunmap_atomic(addr); - else - kunmap(buf->page); - ret = error; - do_wakeup = 1; - if (error) { - if (atomic) { - atomic = 0; - goto redo1; - } + ret = copy_page_from_iter(buf->page, offset, chars, from); + if (unlikely(ret < chars)) { + error = -EFAULT; goto out; } + do_wakeup = 1; buf->len += chars; - total_len -= chars; ret = chars; - if (!total_len) + if (!iov_iter_count(from)) goto out; } } @@ -466,8 +398,7 @@ redo1: int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; - char *src; - int error, atomic = 1; + int copied; if (!page) { page = alloc_page(GFP_HIGHUSER); @@ -483,40 +414,19 @@ redo1: * FIXME! Is this really true? */ do_wakeup = 1; - chars = PAGE_SIZE; - if (chars > total_len) - chars = total_len; - - iov_fault_in_pages_read(iov, chars); -redo2: - if (atomic) - src = kmap_atomic(page); - else - src = kmap(page); - - error = pipe_iov_copy_from_user(src, iov, chars, - atomic); - if (atomic) - kunmap_atomic(src); - else - kunmap(page); - - if (unlikely(error)) { - if (atomic) { - atomic = 0; - goto redo2; - } + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) - ret = error; + ret = -EFAULT; break; } - ret += chars; + ret += copied; /* Insert it into the buffer array */ buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = chars; + buf->len = copied; buf->flags = 0; if (is_packetized(filp)) { buf->ops = &packet_pipe_buf_ops; @@ -525,8 +435,7 @@ redo2: pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; - total_len -= chars; - if (!total_len) + if (!iov_iter_count(from)) break; } if (bufs < pipe->buffers) @@ -1040,8 +949,8 @@ const struct file_operations pipefifo_fops = { .llseek = no_llseek, .read = new_sync_read, .read_iter = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, + .write = new_sync_write, + .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, diff --git a/include/linux/uio.h b/include/linux/uio.h index 532f59d0adbb..66012352d333 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -68,6 +68,8 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); +size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i); unsigned long iov_iter_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); diff --git a/mm/iov_iter.c b/mm/iov_iter.c index a5c691c1a283..081e3273085b 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -82,6 +82,84 @@ done: } EXPORT_SYMBOL(copy_page_to_iter); +size_t copy_page_from_iter(struct 
page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_readable(buf, copy)) { + kaddr = kmap_atomic(page); + to = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = to - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + to = kaddr + offset; + left = __copy_from_user(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + kunmap(page); +done: + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} +EXPORT_SYMBOL(copy_page_from_iter); + static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { -- cgit v1.2.3-71-gd317 From 2b777c9dd9ebbb2f8b6818d454cc5e6d7c1e3c8b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Apr 2014 22:31:22 -0400 Subject: ceph_sync_read: stop poking into iov_iter guts Signed-off-by: Al Viro --- fs/ceph/file.c | 46 +++++++++++++++++--------------------------- include/linux/ceph/libceph.h | 2 -- net/ceph/pagevec.c | 35 ++++----------------------------- 3 files changed, 22 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c9a24ba98c9a..672b0fedb17b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -418,7 +418,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, struct page **pages; u64 off = iocb->ki_pos; int num_pages, ret; - size_t len = i->count; + size_t len = iov_iter_count(i); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, @@ -436,25 +436,26 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (file->f_flags & O_DIRECT) { while (iov_iter_count(i)) { - void __user *data = i->iov[0].iov_base + i->iov_offset; - size_t len = i->iov[0].iov_len - i->iov_offset; + size_t start; + ssize_t n; - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, - num_pages, true); - if (IS_ERR(pages)) - return PTR_ERR(pages); + n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); + if (n < 0) + return n; - ret = striped_read(inode, off, len, + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; + + ret = striped_read(inode, off, n, pages, num_pages, checkeof, - 1, (unsigned long)data & ~PAGE_MASK); + 1, start); + ceph_put_page_vector(pages, num_pages, true); if (ret <= 0) break; off += ret; iov_iter_advance(i, ret); - if (ret < len) + if (ret < n) break; } } else { @@ -466,25 
+467,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, num_pages, checkeof, 0, 0); if (ret > 0) { int l, k = 0; - size_t left = len = ret; + size_t left = ret; while (left) { - void __user *data = i->iov[0].iov_base - + i->iov_offset; - l = min(i->iov[0].iov_len - i->iov_offset, - left); - - ret = ceph_copy_page_vector_to_user(&pages[k], - data, off, - l); - if (ret > 0) { - iov_iter_advance(i, ret); - left -= ret; - off += ret; - k = calc_pages_for(iocb->ki_pos, - len - left + 1) - 1; - BUG_ON(k >= num_pages && left); - } else + int copy = min_t(size_t, PAGE_SIZE, left); + l = copy_page_to_iter(pages[k++], 0, copy, i); + off += l; + left -= l; + if (l < copy) break; } } diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2f49aa4c4f7f..279b0afac1c1 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -222,8 +222,6 @@ extern void ceph_copy_to_page_vector(struct page **pages, extern void ceph_copy_from_page_vector(struct page **pages, void *data, loff_t off, size_t len); -extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data, - loff_t off, size_t len); extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 815a2249cfa9..555013034f7a 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -53,7 +53,10 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) set_page_dirty_lock(pages[i]); put_page(pages[i]); } - kfree(pages); + if (is_vmalloc_addr(pages)) + vfree(pages); + else + kfree(pages); } EXPORT_SYMBOL(ceph_put_page_vector); @@ -164,36 +167,6 @@ void ceph_copy_from_page_vector(struct page **pages, } EXPORT_SYMBOL(ceph_copy_from_page_vector); -/* - * copy user data from a page vector into a user pointer - */ -int ceph_copy_page_vector_to_user(struct page **pages, - void __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, left, PAGE_CACHE_SIZE-po); - bad = copy_to_user(data, page_address(pages[i]) + po, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } - i++; - } - return len; -} -EXPORT_SYMBOL(ceph_copy_page_vector_to_user); - /* * Zero an extent within a page vector. Offset is relative to the * start of the first page. -- cgit v1.2.3-71-gd317 From b42b15fdad3ebb790250041d1517acebb9bd56d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Apr 2014 12:15:19 -0400 Subject: lustre: get rid of messing with iovecs * switch to ->read_iter/->write_iter * keep a pointer to iov_iter instead of iov/nr_segs * do not modify iovecs; use iov_iter_truncate()/iov_iter_advance() and a new primitive - iov_iter_reexpand() (expand previously truncated iterator) instead; a sketch of this pattern follows below. * (racy) check for lustre VMAs intersecting with iovecs kept for now as for_each_iov() loop.
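A minimal sketch of the truncate/advance/reexpand pattern, assuming a hypothetical do_one_pass() helper that copies data and advances the iterator as it goes; only the iov_iter_*() calls are real:

	size_t total = iov_iter_count(iter);	/* bytes left in the whole request */
	ssize_t done;

	iov_iter_truncate(iter, chunk);		/* cap this pass at 'chunk' bytes */
	done = do_one_pass(iter);		/* hypothetical; advances 'iter' as it copies */
	if (done > 0) {
		total -= done;
		iov_iter_reexpand(iter, total);	/* expose what is still outstanding */
	}

Since both primitives only adjust iter->count, the underlying iovec array is never written to, which is what lets the lustre code drop cui_iov_olen and the rest of its per-segment bookkeeping.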
Signed-off-by: Al Viro --- drivers/staging/lustre/lustre/include/lclient.h | 11 +-- drivers/staging/lustre/lustre/lclient/lcommon_cl.c | 48 +--------- drivers/staging/lustre/lustre/llite/file.c | 106 ++++----------------- .../staging/lustre/lustre/llite/llite_internal.h | 3 +- drivers/staging/lustre/lustre/llite/rw.c | 3 +- drivers/staging/lustre/lustre/llite/vvp_io.c | 29 +++--- include/linux/uio.h | 9 ++ 7 files changed, 46 insertions(+), 163 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h index 827209ea6bd0..386a36c00f57 100644 --- a/drivers/staging/lustre/lustre/include/lclient.h +++ b/drivers/staging/lustre/lustre/include/lclient.h @@ -82,16 +82,7 @@ struct ccc_io { /** * I/O vector information to or from which read/write is going. */ - struct iovec *cui_iov; - unsigned long cui_nrsegs; - /** - * Total iov count for left IO. - */ - unsigned long cui_tot_nrsegs; - /** - * Old length for iov that was truncated partially. - */ - size_t cui_iov_olen; + struct iov_iter *cui_iter; /** * Total size for the left IO. */ diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c index 6907a16dbbd1..a07d5156bc50 100644 --- a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c +++ b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c @@ -721,31 +721,12 @@ int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio, struct cl_io *io) { - int i; size_t size = io->u.ci_rw.crw_count; - cio->cui_iov_olen = 0; - if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0) + if (!cl_is_normalio(env, io) || cio->cui_iter == NULL) return; - for (i = 0; i < cio->cui_tot_nrsegs; i++) { - struct iovec *iv = &cio->cui_iov[i]; - - if (iv->iov_len < size) - size -= iv->iov_len; - else { - if (iv->iov_len > size) { - cio->cui_iov_olen = iv->iov_len; - iv->iov_len = size; - } - break; - } - } - - cio->cui_nrsegs = i + 1; - LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs, - "tot_nrsegs: %lu, nrsegs: %lu\n", - cio->cui_tot_nrsegs, cio->cui_nrsegs); + iov_iter_truncate(cio->cui_iter, size); } int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, @@ -776,30 +757,7 @@ void ccc_io_advance(const struct lu_env *env, if (!cl_is_normalio(env, io)) return; - LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs); - LASSERT(cio->cui_tot_count >= nob); - - cio->cui_iov += cio->cui_nrsegs; - cio->cui_tot_nrsegs -= cio->cui_nrsegs; - cio->cui_tot_count -= nob; - - /* update the iov */ - if (cio->cui_iov_olen > 0) { - struct iovec *iv; - - cio->cui_iov--; - cio->cui_tot_nrsegs++; - iv = &cio->cui_iov[0]; - if (io->ci_continue) { - iv->iov_base += iv->iov_len; - LASSERT(cio->cui_iov_olen > iv->iov_len); - iv->iov_len = cio->cui_iov_olen - iv->iov_len; - } else { - /* restore the iov_len, in case of restart io. 
*/ - iv->iov_len = cio->cui_iov_olen; - } - cio->cui_iov_olen = 0; - } + iov_iter_reexpand(cio->cui_iter, cio->cui_tot_count -= nob); } /** diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index 220bd8390a84..3efda2540d29 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -1105,9 +1105,7 @@ restart: switch (vio->cui_io_subtype) { case IO_NORMAL: - cio->cui_iov = args->u.normal.via_iov; - cio->cui_nrsegs = args->u.normal.via_nrsegs; - cio->cui_tot_nrsegs = cio->cui_nrsegs; + cio->cui_iter = args->u.normal.via_iter; cio->cui_iocb = args->u.normal.via_iocb; if ((iot == CIT_WRITE) && !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { @@ -1171,56 +1169,23 @@ out: return result; } -static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct lu_env *env; struct vvp_io_args *args; - size_t count = 0; ssize_t result; int refcheck; - count = iov_length(iov, nr_segs); - env = cl_env_get(&refcheck); if (IS_ERR(env)) return PTR_ERR(env); args = vvp_env_args(env, IO_NORMAL); - args->u.normal.via_iov = (struct iovec *)iov; - args->u.normal.via_nrsegs = nr_segs; + args->u.normal.via_iter = to; args->u.normal.via_iocb = iocb; result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, - &iocb->ki_pos, count); - cl_env_put(env, &refcheck); - return result; -} - -static ssize_t ll_file_read(struct file *file, char *buf, size_t count, - loff_t *ppos) -{ - struct lu_env *env; - struct iovec *local_iov; - struct kiocb *kiocb; - ssize_t result; - int refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - local_iov = &vvp_env_info(env)->vti_local_iov; - kiocb = &vvp_env_info(env)->vti_kiocb; - local_iov->iov_base = (void __user *)buf; - local_iov->iov_len = count; - init_sync_kiocb(kiocb, file); - kiocb->ki_pos = *ppos; - kiocb->ki_nbytes = count; - - result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos); - *ppos = kiocb->ki_pos; - + &iocb->ki_pos, iov_iter_count(to)); cl_env_put(env, &refcheck); return result; } @@ -1228,12 +1193,10 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count, /* * Write to a file (through the page cache). 
*/ -static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct lu_env *env; struct vvp_io_args *args; - size_t count = iov_length(iov, nr_segs); ssize_t result; int refcheck; @@ -1242,46 +1205,15 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return PTR_ERR(env); args = vvp_env_args(env, IO_NORMAL); - args->u.normal.via_iov = (struct iovec *)iov; - args->u.normal.via_nrsegs = nr_segs; + args->u.normal.via_iter = from; args->u.normal.via_iocb = iocb; result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, - &iocb->ki_pos, count); + &iocb->ki_pos, iov_iter_count(from)); cl_env_put(env, &refcheck); return result; } -static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, - loff_t *ppos) -{ - struct lu_env *env; - struct iovec *local_iov; - struct kiocb *kiocb; - ssize_t result; - int refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - local_iov = &vvp_env_info(env)->vti_local_iov; - kiocb = &vvp_env_info(env)->vti_kiocb; - local_iov->iov_base = (void __user *)buf; - local_iov->iov_len = count; - init_sync_kiocb(kiocb, file); - kiocb->ki_pos = *ppos; - kiocb->ki_nbytes = count; - - result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos); - *ppos = kiocb->ki_pos; - - cl_env_put(env, &refcheck); - return result; -} - - - /* * Send file content (through pagecache) somewhere with helper */ @@ -3133,10 +3065,10 @@ int ll_inode_permission(struct inode *inode, int mask) /* -o localflock - only provides locally consistent flock locks */ struct file_operations ll_file_operations = { - .read = ll_file_read, - .aio_read = ll_file_aio_read, - .write = ll_file_write, - .aio_write = ll_file_aio_write, + .read = new_sync_read, + .read_iter = ll_file_read_iter, + .write = new_sync_write, + .write_iter = ll_file_write_iter, .unlocked_ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, @@ -3148,10 +3080,10 @@ struct file_operations ll_file_operations = { }; struct file_operations ll_file_operations_flock = { - .read = ll_file_read, - .aio_read = ll_file_aio_read, - .write = ll_file_write, - .aio_write = ll_file_aio_write, + .read = new_sync_read, + .read_iter = ll_file_read_iter, + .write = new_sync_write, + .write_iter = ll_file_write_iter, .unlocked_ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, @@ -3166,10 +3098,10 @@ struct file_operations ll_file_operations_flock = { /* These are for -o noflock - to return ENOSYS on flock calls */ struct file_operations ll_file_operations_noflock = { - .read = ll_file_read, - .aio_read = ll_file_aio_read, - .write = ll_file_write, - .aio_write = ll_file_aio_write, + .read = new_sync_read, + .read_iter = ll_file_read_iter, + .write = new_sync_write, + .write_iter = ll_file_write_iter, .unlocked_ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index 69aba0afca41..fbb8650ead34 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -974,8 +974,7 @@ struct vvp_io_args { union { struct { struct kiocb *via_iocb; - struct iovec *via_iov; - unsigned long via_nrsegs; + struct iov_iter *via_iter; } normal; struct { struct pipe_inode_info *via_pipe; diff --git 
a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c index 416f7a094a6d..b345dfa599f3 100644 --- a/drivers/staging/lustre/lustre/llite/rw.c +++ b/drivers/staging/lustre/lustre/llite/rw.c @@ -157,8 +157,7 @@ static struct ll_cl_context *ll_cl_init(struct file *file, result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE); if (result == 0) { cio->cui_fd = LUSTRE_FPRIVATE(file); - cio->cui_iov = NULL; - cio->cui_nrsegs = 0; + cio->cui_iter = NULL; result = cl_io_iter_init(env, io); if (result == 0) { result = cl_io_lock(env, io); diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index c7d70091246e..cfe8c625ae64 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -211,27 +211,26 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_lock_descr *descr = &cti->cti_descr; ldlm_policy_data_t policy; unsigned long addr; - unsigned long seg; ssize_t count; int result; + struct iov_iter i; + struct iovec iov; LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); if (!cl_is_normalio(env, io)) return 0; - if (vio->cui_iov == NULL) /* nfs or loop back device write */ + if (vio->cui_iter == NULL) /* nfs or loop back device write */ return 0; /* No MM (e.g. NFS)? No vmas too. */ if (mm == NULL) return 0; - for (seg = 0; seg < vio->cui_nrsegs; seg++) { - const struct iovec *iv = &vio->cui_iov[seg]; - - addr = (unsigned long)iv->iov_base; - count = iv->iov_len; + iov_for_each(iov, i, *(vio->cui_iter)) { + addr = (unsigned long)iov.iov_base; + count = iov.iov_len; if (count == 0) continue; @@ -527,9 +526,7 @@ static int vvp_io_read_start(const struct lu_env *env, switch (vio->cui_io_subtype) { case IO_NORMAL: LASSERT(cio->cui_iocb->ki_pos == pos); - result = generic_file_aio_read(cio->cui_iocb, - cio->cui_iov, cio->cui_nrsegs, - cio->cui_iocb->ki_pos); + result = generic_file_read_iter(cio->cui_iocb, cio->cui_iter); break; case IO_SPLICE: result = generic_file_splice_read(file, &pos, @@ -595,12 +592,11 @@ static int vvp_io_write_start(const struct lu_env *env, CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); - if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */ + if (cio->cui_iter == NULL) /* from a temp io in ll_cl_init(). */ result = 0; else - result = generic_file_aio_write(cio->cui_iocb, - cio->cui_iov, cio->cui_nrsegs, - cio->cui_iocb->ki_pos); + result = generic_file_write_iter(cio->cui_iocb, cio->cui_iter); + if (result > 0) { if (result < cnt) io->ci_continue = 0; @@ -1162,10 +1158,9 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, * results." -- Single Unix Spec */ if (count == 0) result = 1; - else { + else cio->cui_tot_count = count; - cio->cui_tot_nrsegs = 0; - } + /* for read/write, we store the jobid in the inode, and * it'll be fetched by osc when building RPC. * diff --git a/include/linux/uio.h b/include/linux/uio.h index 66012352d333..e8a109a75de1 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -90,6 +90,15 @@ static inline void iov_iter_truncate(struct iov_iter *i, size_t count) i->count = count; } +/* + * reexpand a previously truncated iterator; count must be no more than how much + * we had shrunk it. 
+ */ +static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) +{ + i->count = count; +} + int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len); -- cgit v1.2.3-71-gd317 From 6abd232274fd652e4a57f486d14e52ffee6f72e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Apr 2014 14:20:57 -0400 Subject: bury generic_file_aio_{read,write} no callers left Signed-off-by: Al Viro --- include/linux/fs.h | 2 -- mm/filemap.c | 43 ++++++++----------------------------------- 2 files changed, 8 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 99817c9e665e..a6448849dbce 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2411,10 +2411,8 @@ extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, unsigned long size, pgoff_t pgoff); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); -extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); -extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); diff --git a/mm/filemap.c b/mm/filemap.c index 7dcdb9db710d..2f724e3cdf24 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1663,6 +1663,14 @@ out: return written ? written : error; } +/** + * generic_file_read_iter - generic filesystem read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the "read_iter()" routine for all filesystems + * that can use the page cache directly. + */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { @@ -1713,28 +1721,6 @@ out: } EXPORT_SYMBOL(generic_file_read_iter); -/** - * generic_file_aio_read - generic filesystem read routine - * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @pos: current file position - * - * This is the "read()" routine for all filesystems - * that can use the page cache directly. 
- */ -ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - size_t count = iov_length(iov, nr_segs); - struct iov_iter i; - - iov_iter_init(&i, READ, iov, nr_segs, count); - return generic_file_read_iter(iocb, &i); -} -EXPORT_SYMBOL(generic_file_aio_read); - #ifdef CONFIG_MMU /** * page_cache_read - adds requested page to the page cache if not already there @@ -2675,19 +2661,6 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } EXPORT_SYMBOL(generic_file_write_iter); -ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - size_t count = iov_length(iov, nr_segs); - struct iov_iter from; - - BUG_ON(iocb->ki_pos != pos); - - iov_iter_init(&from, WRITE, iov, nr_segs, count); - return generic_file_write_iter(iocb, &from); -} -EXPORT_SYMBOL(generic_file_aio_write); - /** * try_to_release_page() - release old fs-specific metadata on a page * -- cgit v1.2.3-71-gd317 From 62a8067a7f35dba2de501c9cb00e4cf36da90bc0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Apr 2014 23:12:29 -0400 Subject: bio_vec-backed iov_iter New variant of iov_iter - ITER_BVEC in iter->type, backed with bio_vec array instead of iovec one. Primitives taught to deal with such beasts, __swap_writepage() switched to using that kind of iov_iter. Note that bio_vec is just a triple - there's nothing block-specific about it. I've left the definition where it was, but took it from under ifdef CONFIG_BLOCK. Next target: ->splice_write()... Signed-off-by: Al Viro --- fs/fuse/file.c | 2 +- include/linux/blk_types.h | 4 +- include/linux/uio.h | 14 +- mm/iov_iter.c | 390 ++++++++++++++++++++++++++++++++++++++++++---- mm/page_io.c | 19 ++- 5 files changed, 385 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7fbc803cf51d..b2dae9d1437c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1288,7 +1288,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t nbytes = 0; /* # bytes already packed in req */ /* Special case for kernel I/O: can copy directly into the buffer */ - if (ii->type & REQ_KERNEL) { + if (ii->type & ITER_KVEC) { unsigned long user_addr = fuse_get_user_addr(ii); size_t frag_size = fuse_get_frag_size(ii, *nbytesp); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index aa0eaa2d0bd8..86df13b97160 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -5,8 +5,6 @@ #ifndef __LINUX_BLK_TYPES_H #define __LINUX_BLK_TYPES_H -#ifdef CONFIG_BLOCK - #include struct bio_set; @@ -28,6 +26,8 @@ struct bio_vec { unsigned int bv_offset; }; +#ifdef CONFIG_BLOCK + struct bvec_iter { sector_t bi_sector; /* device address in 512 byte sectors */ diff --git a/include/linux/uio.h b/include/linux/uio.h index e8a109a75de1..e2231e47cec1 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -19,12 +19,21 @@ struct kvec { size_t iov_len; }; +enum { + ITER_IOVEC = 0, + ITER_KVEC = 2, + ITER_BVEC = 4, +}; + struct iov_iter { int type; - const struct iovec *iov; - unsigned long nr_segs; size_t iov_offset; size_t count; + union { + const struct iovec *iov; + const struct bio_vec *bvec; + }; + unsigned long nr_segs; }; /* @@ -54,6 +63,7 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) } #define iov_for_each(iov, iter, start) \ + if (!((start).type & ITER_BVEC)) \ for (iter = (start); \ (iter).count && \ ((iov = iov_iter_iovec(&(iter))), 1); \
diff --git a/mm/iov_iter.c b/mm/iov_iter.c index fcdaaab438b6..7b5dbd1517b5 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -4,7 +4,7 @@ #include #include -size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, +static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t skip, copy, left, wanted; @@ -84,9 +84,8 @@ done: i->iov_offset = skip; return wanted - bytes; } -EXPORT_SYMBOL(copy_page_to_iter); -size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, +static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t skip, copy, left, wanted; @@ -166,7 +165,6 @@ done: i->iov_offset = skip; return wanted - bytes; } -EXPORT_SYMBOL(copy_page_from_iter); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -195,7 +193,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr, * were successfully copied. If a fault is encountered then return the number of * bytes which were copied. */ -size_t iov_iter_copy_from_user_atomic(struct page *page, +static size_t copy_from_user_atomic_iovec(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { char *kaddr; @@ -215,9 +213,8 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, return copied; } -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); -void iov_iter_advance(struct iov_iter *i, size_t bytes) +static void advance_iovec(struct iov_iter *i, size_t bytes) { BUG_ON(i->count < bytes); @@ -252,7 +249,6 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) i->nr_segs = nr_segs; } } -EXPORT_SYMBOL(iov_iter_advance); /* * Fault in the first iovec of the given iov_iter, to a maximum length @@ -265,26 +261,16 @@ EXPORT_SYMBOL(iov_iter_advance); */ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); + if (!(i->type & ITER_BVEC)) { + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); + } + return 0; } EXPORT_SYMBOL(iov_iter_fault_in_readable); -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - -unsigned long iov_iter_alignment(const struct iov_iter *i) +static unsigned long alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = i->iov; unsigned long res; @@ -307,7 +293,6 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) res |= (unsigned long)iov->iov_base | size; return res; } -EXPORT_SYMBOL(iov_iter_alignment); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, @@ -315,7 +300,7 @@ void iov_iter_init(struct iov_iter *i, int direction, { /* It will get better. Eventually... 
*/ if (segment_eq(get_fs(), KERNEL_DS)) - direction |= REQ_KERNEL; + direction |= ITER_KVEC; i->type = direction; i->iov = iov; i->nr_segs = nr_segs; @@ -324,7 +309,7 @@ void iov_iter_init(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_init); -ssize_t iov_iter_get_pages(struct iov_iter *i, +static ssize_t get_pages_iovec(struct iov_iter *i, struct page **pages, size_t maxsize, size_t *start) { @@ -349,9 +334,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, return res; return (res == n ? len : res * PAGE_SIZE) - *start; } -EXPORT_SYMBOL(iov_iter_get_pages); -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, +static ssize_t get_pages_alloc_iovec(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { @@ -387,9 +371,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, *pages = p; return (res == n ? len : res * PAGE_SIZE) - *start; } -EXPORT_SYMBOL(iov_iter_get_pages_alloc); -int iov_iter_npages(const struct iov_iter *i, int maxpages) +static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages) { size_t offset = i->iov_offset; size_t size = i->count; @@ -414,4 +397,347 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) } return min(npages, maxpages); } + +static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) +{ + char *from = kmap_atomic(page); + memcpy(to, from + offset, len); + kunmap_atomic(from); +} + +static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) +{ + char *to = kmap_atomic(page); + memcpy(to + offset, from, len); + kunmap_atomic(to); +} + +static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + copy = min_t(size_t, bytes, bvec->bv_len - skip); + + kaddr = kmap_atomic(page); + from = kaddr + offset; + memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); + skip += copy; + from += copy; + bytes -= copy; + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy); + skip = copy; + from += copy; + bytes -= copy; + } + kunmap_atomic(kaddr); + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + + kaddr = kmap_atomic(page); + + to = kaddr + offset; + + copy = min(bytes, bvec->bv_len - skip); + + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); + + to += copy; + skip += copy; + bytes -= copy; + + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy); + skip = copy; + to += copy; + bytes -= copy; + } + kunmap_atomic(kaddr); + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted; +} + +static size_t 
copy_from_user_bvec(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t left; + const struct bio_vec *bvec; + size_t base = i->iov_offset; + + kaddr = kmap_atomic(page); + for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) { + size_t copy = min(left, bvec->bv_len - base); + if (!bvec->bv_len) + continue; + memcpy_from_page(kaddr + offset, bvec->bv_page, + bvec->bv_offset + base, copy); + offset += copy; + left -= copy; + } + kunmap_atomic(kaddr); + return bytes; +} + +static void advance_bvec(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct bio_vec *bvec = i->bvec; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !bvec->bv_len)) { + int copy; + + copy = min(bytes, bvec->bv_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (bvec->bv_len == base) { + bvec++; + nr_segs--; + base = 0; + } + } + i->bvec = bvec; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} + +static unsigned long alignment_bvec(const struct iov_iter *i) +{ + const struct bio_vec *bvec = i->bvec; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = bvec->bv_offset + i->iov_offset; + n = bvec->bv_len - i->iov_offset; + if (n >= size) + return res | size; + size -= n; + res |= n; + while (size > (++bvec)->bv_len) { + res |= bvec->bv_offset | bvec->bv_len; + size -= bvec->bv_len; + } + res |= bvec->bv_offset | size; + return res; +} + +static ssize_t get_pages_bvec(struct iov_iter *i, + struct page **pages, size_t maxsize, + size_t *start) +{ + const struct bio_vec *bvec = i->bvec; + size_t len = bvec->bv_len - i->iov_offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + *start = bvec->bv_offset + i->iov_offset; + + get_page(*pages = bvec->bv_page); + + return len; +} + +static ssize_t get_pages_alloc_bvec(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + const struct bio_vec *bvec = i->bvec; + size_t len = bvec->bv_len - i->iov_offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + *start = bvec->bv_offset + i->iov_offset; + + *pages = kmalloc(sizeof(struct page *), GFP_KERNEL); + if (!*pages) + return -ENOMEM; + + get_page(**pages = bvec->bv_page); + + return len; +} + +static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages) +{ + size_t offset = i->iov_offset; + size_t size = i->count; + const struct bio_vec *bvec = i->bvec; + int npages = 0; + int n; + + for (n = 0; size && n < i->nr_segs; n++, bvec++) { + size_t len = bvec->bv_len - offset; + offset = 0; + if (unlikely(!len)) /* empty segment */ + continue; + if (len > size) + len = size; + npages++; + if (npages >= maxpages) /* don't bother going further */ + return maxpages; + size -= len; + offset = 0; + } + return min(npages, maxpages); +} + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_page_to_iter_bvec(page, offset, bytes, i); + else + return copy_page_to_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_to_iter); + +size_t copy_page_from_iter(struct page *page, size_t offset, 
size_t bytes, + struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_page_from_iter_bvec(page, offset, bytes, i); + else + return copy_page_from_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_from_iter); + +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + if (i->type & ITER_BVEC) + return copy_from_user_bvec(page, i, offset, bytes); + else + return copy_from_user_atomic_iovec(page, i, offset, bytes); +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +void iov_iter_advance(struct iov_iter *i, size_t size) +{ + if (i->type & ITER_BVEC) + advance_bvec(i, size); + else + advance_iovec(i, size); +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + if (i->nr_segs == 1) + return i->count; + else if (i->type & ITER_BVEC) + return min(i->count, i->iov->iov_len - i->iov_offset); + else + return min(i->count, i->bvec->bv_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +unsigned long iov_iter_alignment(const struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return alignment_bvec(i); + else + return alignment_iovec(i); +} +EXPORT_SYMBOL(iov_iter_alignment); + +ssize_t iov_iter_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, + size_t *start) +{ + if (i->type & ITER_BVEC) + return get_pages_bvec(i, pages, maxsize, start); + else + return get_pages_iovec(i, pages, maxsize, start); +} +EXPORT_SYMBOL(iov_iter_get_pages); + +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + if (i->type & ITER_BVEC) + return get_pages_alloc_bvec(i, pages, maxsize, start); + else + return get_pages_alloc_iovec(i, pages, maxsize, start); +} +EXPORT_SYMBOL(iov_iter_get_pages_alloc); + +int iov_iter_npages(const struct iov_iter *i, int maxpages) +{ + if (i->type & ITER_BVEC) + return iov_iter_npages_bvec(i, maxpages); + else + return iov_iter_npages_iovec(i, maxpages); +} EXPORT_SYMBOL(iov_iter_npages); diff --git a/mm/page_io.c b/mm/page_io.c index 313bfedb75d1..33bb38c4aad7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -259,23 +259,28 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - struct iovec iov = { - .iov_base = kmap(page), - .iov_len = PAGE_SIZE, + struct bio_vec bv = { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0 + }; + struct iov_iter from = { + .type = ITER_BVEC | WRITE, + .count = PAGE_SIZE, + .iov_offset = 0, + .nr_segs = 1, + .bvec = &bv }; - struct iov_iter from; init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); kiocb.ki_nbytes = PAGE_SIZE; - iov_iter_init(&from, KERNEL_WRITE, &iov, 1, PAGE_SIZE); set_page_writeback(page); unlock_page(page); - ret = mapping->a_ops->direct_IO(KERNEL_WRITE, + ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE, &kiocb, &from, kiocb.ki_pos); - kunmap(page); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; -- cgit v1.2.3-71-gd317 From 9c69de4c94fcb11db919160d5fa0b48f13d1757a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2014 19:37:13 +0200 Subject: nfsd: remove The only real user of this header is fs/nfsd/nfsfh.h, so merge the two.
Various lockd source files used it to indirectly get other sunrpc or nfs headers, so fix those up. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/lockd/clnt4xdr.c | 2 ++ fs/lockd/clntxdr.c | 2 ++ fs/lockd/svcsubs.c | 2 +- fs/lockd/xdr.c | 2 ++ fs/nfsd/nfsd.h | 1 + fs/nfsd/nfsfh.h | 59 ++++++++++++++++++++++++++++++++++++++---- fs/nfsd/state.h | 1 - include/linux/lockd/lockd.h | 2 +- include/linux/nfsd/export.h | 6 ++++- include/linux/nfsd/nfsfh.h | 63 --------------------------------------------- 10 files changed, 68 insertions(+), 72 deletions(-) delete mode 100644 include/linux/nfsd/nfsfh.h (limited to 'include/linux') diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 00ec0b9c94d1..d3e40db28930 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -14,6 +14,8 @@ #include #include +#include + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 9a55797a1cd4..3e9f7874b975 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -15,6 +15,8 @@ #include #include +#include + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dc5c75930f0f..7ec6b1074d8c 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -14,12 +14,12 @@ #include #include #include -#include #include #include #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 964666c68a86..9340e7e10ef6 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -16,6 +16,8 @@ #include #include +#include + #define NLMDBG_FACILITY NLMDBG_XDR diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 479eb681c27c..7d5c310678d0 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index ad67964d0bb1..2e89e70ac15c 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -1,9 +1,58 @@ -/* Copyright (C) 1995, 1996, 1997 Olaf Kirch */ +/* + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + * + * This file describes the layout of the file handles as passed + * over the wire. + */ +#ifndef _LINUX_NFSD_NFSFH_H +#define _LINUX_NFSD_NFSFH_H + +#include +#include + +static inline __u32 ino_t_to_u32(ino_t ino) +{ + return (__u32) ino; +} + +static inline ino_t u32_to_ino_t(__u32 uino) +{ + return (ino_t) uino; +} -#ifndef _LINUX_NFSD_FH_INT_H -#define _LINUX_NFSD_FH_INT_H +/* + * This is the internal representation of an NFS handle used in knfsd. + * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. + */ +typedef struct svc_fh { + struct knfsd_fh fh_handle; /* FH data */ + struct dentry * fh_dentry; /* validated dentry */ + struct svc_export * fh_export; /* export pointer */ + int fh_maxsize; /* max size for fh_handle */ + + unsigned char fh_locked; /* inode locked by us */ + unsigned char fh_want_write; /* remount protection taken */ + +#ifdef CONFIG_NFSD_V3 + unsigned char fh_post_saved; /* post-op attrs saved */ + unsigned char fh_pre_saved; /* pre-op attrs saved */ + + /* Pre-op attributes saved during fh_lock */ + __u64 fh_pre_size; /* size before operation */ + struct timespec fh_pre_mtime; /* mtime before oper */ + struct timespec fh_pre_ctime; /* ctime before oper */ + /* + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) + * to find out if it is valid.
+ */ + u64 fh_pre_change; + + /* Post-op attributes saved in fh_unlock */ + struct kstat fh_post_attr; /* full attrs after operation */ + u64 fh_post_change; /* nfsv4 change; see above */ +#endif /* CONFIG_NFSD_V3 */ -#include +} svc_fh; enum nfsd_fsid { FSID_DEV = 0, @@ -215,4 +264,4 @@ fh_unlock(struct svc_fh *fhp) } } -#endif /* _LINUX_NFSD_FH_INT_H */ +#endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 424d8f5f2317..5b3bbf24097c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -37,7 +37,6 @@ #include #include -#include #include "nfsfh.h" typedef struct { diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index dcaad79f54ed..219d79627c05 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -17,13 +17,13 @@ #include #include #include -#include #include #include #ifdef CONFIG_LOCKD_V4 #include #endif #include +#include /* * Version string diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 7898c997dfea..b12c4e526ef2 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -9,9 +9,13 @@ #ifndef NFSD_EXPORT_H #define NFSD_EXPORT_H -# include +#include #include +struct knfsd_fh; +struct svc_fh; +struct svc_rqst; + /* * FS Locations */ diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h deleted file mode 100644 index a93593f1fa4e..000000000000 --- a/include/linux/nfsd/nfsfh.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * include/linux/nfsd/nfsfh.h - * - * This file describes the layout of the file handles as passed - * over the wire. - * - * Earlier versions of knfsd used to sign file handles using keyed MD5 - * or SHA. I've removed this code, because it doesn't give you more - * security than blocking external access to port 2049 on your firewall. - * - * Copyright (C) 1995, 1996, 1997 Olaf Kirch - */ -#ifndef _LINUX_NFSD_FH_H -#define _LINUX_NFSD_FH_H - -# include -#include - -static inline __u32 ino_t_to_u32(ino_t ino) -{ - return (__u32) ino; -} - -static inline ino_t u32_to_ino_t(__u32 uino) -{ - return (ino_t) uino; -} - -/* - * This is the internal representation of an NFS handle used in knfsd. - * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. - */ -typedef struct svc_fh { - struct knfsd_fh fh_handle; /* FH data */ - struct dentry * fh_dentry; /* validated dentry */ - struct svc_export * fh_export; /* export pointer */ - int fh_maxsize; /* max size for fh_handle */ - - unsigned char fh_locked; /* inode locked by us */ - unsigned char fh_want_write; /* remount protection taken */ - -#ifdef CONFIG_NFSD_V3 - unsigned char fh_post_saved; /* post-op attrs saved */ - unsigned char fh_pre_saved; /* pre-op attrs saved */ - - /* Pre-op attributes saved during fh_lock */ - __u64 fh_pre_size; /* size before operation */ - struct timespec fh_pre_mtime; /* mtime before oper */ - struct timespec fh_pre_ctime; /* ctime before oper */ - /* - * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) - * to find out if it is valid. 
- */ - u64 fh_pre_change; - - /* Post-op attributes saved in fh_unlock */ - struct kstat fh_post_attr; /* full attrs after operation */ - u64 fh_post_change; /* nfsv4 change; see above */ -#endif /* CONFIG_NFSD_V3 */ - -} svc_fh; - -#endif /* _LINUX_NFSD_FH_H */ -- cgit v1.2.3-71-gd317 From d430e8d530e900c923bf77718d72478b1c280592 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2014 19:37:14 +0200 Subject: nfsd: move to fs/nfsd There are no legitimate users outside of fs/nfsd, so move it there. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/lockd/svcsubs.c | 1 - fs/nfsd/export.h | 109 ++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfsd.h | 3 +- include/linux/nfsd/export.h | 114 -------------------------------------------- 4 files changed, 111 insertions(+), 116 deletions(-) create mode 100644 fs/nfsd/export.h delete mode 100644 include/linux/nfsd/export.h (limited to 'include/linux') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 7ec6b1074d8c..b6f3b84b6e99 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h new file mode 100644 index 000000000000..d7939a62a0ae --- /dev/null +++ b/fs/nfsd/export.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch + */ +#ifndef NFSD_EXPORT_H +#define NFSD_EXPORT_H + +#include +#include + +struct knfsd_fh; +struct svc_fh; +struct svc_rqst; + +/* + * FS Locations + */ + +#define MAX_FS_LOCATIONS 128 + +struct nfsd4_fs_location { + char *hosts; /* colon separated list of hosts */ + char *path; /* slash separated list of path components */ +}; + +struct nfsd4_fs_locations { + uint32_t locations_count; + struct nfsd4_fs_location *locations; +/* If we're not actually serving this data ourselves (only providing a + * list of replicas that do serve it) then we set "migrated": */ + int migrated; +}; + +/* + * We keep an array of pseudoflavors with the export, in order from most + * to least preferred. For the foreseeable future, we don't expect more + * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, + * spkm3i, and spkm3p (and using all 8 at once should be rare). + */ +#define MAX_SECINFO_LIST 8 + +struct exp_flavor_info { + u32 pseudoflavor; + u32 flags; +}; + +struct svc_export { + struct cache_head h; + struct auth_domain * ex_client; + int ex_flags; + struct path ex_path; + kuid_t ex_anon_uid; + kgid_t ex_anon_gid; + int ex_fsid; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; + struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + struct cache_detail *cd; +}; + +/* an "export key" (expkey) maps a filehandlefragement to an + * svc_export for a given client. There can be several per export, + * for the different fsid types. 
+ */ +struct svc_expkey { + struct cache_head h; + + struct auth_domain * ek_client; + int ek_fsidtype; + u32 ek_fsid[6]; + + struct path ek_path; +}; + +#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) +#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) +#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp); +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); + +/* + * Function declarations + */ +int nfsd_export_init(struct net *); +void nfsd_export_shutdown(struct net *); +void nfsd_export_flush(struct net *); +struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, + struct path *); +struct svc_export * rqst_exp_parent(struct svc_rqst *, + struct path *); +struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); +int exp_rootfh(struct net *, struct auth_domain *, + char *path, struct knfsd_fh *, int maxsize); +__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); +__be32 nfserrno(int errno); + +static inline void exp_put(struct svc_export *exp) +{ + cache_put(&exp->h, exp->cd); +} + +static inline void exp_get(struct svc_export *exp) +{ + cache_get(&exp->h); +} +struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); + +#endif /* NFSD_EXPORT_H */ diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7d5c310678d0..72004caad718 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -19,9 +19,10 @@ #include #include -#include #include +#include "export.h" + /* * nfsd version */ diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h deleted file mode 100644 index b12c4e526ef2..000000000000 --- a/include/linux/nfsd/export.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * include/linux/nfsd/export.h - * - * Public declarations for NFS exports. The definitions for the - * syscall interface are in nfsctl.h - * - * Copyright (C) 1995-1997 Olaf Kirch - */ -#ifndef NFSD_EXPORT_H -#define NFSD_EXPORT_H - -#include -#include - -struct knfsd_fh; -struct svc_fh; -struct svc_rqst; - -/* - * FS Locations - */ - -#define MAX_FS_LOCATIONS 128 - -struct nfsd4_fs_location { - char *hosts; /* colon separated list of hosts */ - char *path; /* slash separated list of path components */ -}; - -struct nfsd4_fs_locations { - uint32_t locations_count; - struct nfsd4_fs_location *locations; -/* If we're not actually serving this data ourselves (only providing a - * list of replicas that do serve it) then we set "migrated": */ - int migrated; -}; - -/* - * We keep an array of pseudoflavors with the export, in order from most - * to least preferred. For the foreseeable future, we don't expect more - * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, - * spkm3i, and spkm3p (and using all 8 at once should be rare). - */ -#define MAX_SECINFO_LIST 8 - -struct exp_flavor_info { - u32 pseudoflavor; - u32 flags; -}; - -struct svc_export { - struct cache_head h; - struct auth_domain * ex_client; - int ex_flags; - struct path ex_path; - kuid_t ex_anon_uid; - kgid_t ex_anon_gid; - int ex_fsid; - unsigned char * ex_uuid; /* 16 byte fsid */ - struct nfsd4_fs_locations ex_fslocs; - int ex_nflavors; - struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; - struct cache_detail *cd; -}; - -/* an "export key" (expkey) maps a filehandlefragement to an - * svc_export for a given client. There can be several per export, - * for the different fsid types. 
- */ -struct svc_expkey { - struct cache_head h; - - struct auth_domain * ek_client; - int ek_fsidtype; - u32 ek_fsid[6]; - - struct path ek_path; -}; - -#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) -#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) -#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) - -int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp); -__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); - -/* - * Function declarations - */ -int nfsd_export_init(struct net *); -void nfsd_export_shutdown(struct net *); -void nfsd_export_flush(struct net *); -struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct path *); -struct svc_export * rqst_exp_parent(struct svc_rqst *, - struct path *); -struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); -int exp_rootfh(struct net *, struct auth_domain *, - char *path, struct knfsd_fh *, int maxsize); -__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -__be32 nfserrno(int errno); - -static inline void exp_put(struct svc_export *exp) -{ - cache_put(&exp->h, exp->cd); -} - -static inline void exp_get(struct svc_export *exp) -{ - cache_get(&exp->h); -} -struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); - -#endif /* NFSD_EXPORT_H */ -- cgit v1.2.3-71-gd317 From 7f94423e8fcc1e0f3416be76d3da0982f586d565 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2014 19:37:15 +0200 Subject: nfsd: move to fs/nfsd There are no legitimate users outside of fs/nfsd, so move it there. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsd.h | 2 +- fs/nfsd/stats.c | 1 - fs/nfsd/stats.h | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/nfsd/stats.h | 45 --------------------------------------------- 4 files changed, 44 insertions(+), 47 deletions(-) create mode 100644 fs/nfsd/stats.h delete mode 100644 include/linux/nfsd/stats.h (limited to 'include/linux') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 72004caad718..7a07f9c6ee78 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -19,8 +19,8 @@ #include #include -#include +#include "stats.h" #include "export.h" /* diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 6d4521feb6e3..cd90878a76aa 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include "nfsd.h" diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h new file mode 100644 index 000000000000..a5c944b771c6 --- /dev/null +++ b/fs/nfsd/stats.h @@ -0,0 +1,43 @@ +/* + * Statistics for NFS server. 
+ * + * Copyright (C) 1995, 1996 Olaf Kirch + */ +#ifndef _NFSD_STATS_H +#define _NFSD_STATS_H + +#include + + +struct nfsd_stats { + unsigned int rchits; /* repcache hits */ + unsigned int rcmisses; /* repcache hits */ + unsigned int rcnocache; /* uncached reqs */ + unsigned int fh_stale; /* FH stale error */ + unsigned int fh_lookup; /* dentry cached */ + unsigned int fh_anon; /* anon file dentry returned */ + unsigned int fh_nocache_dir; /* filehandle not found in dcache */ + unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ + unsigned int io_read; /* bytes returned to read requests */ + unsigned int io_write; /* bytes passed in write requests */ + unsigned int th_cnt; /* number of available threads */ + unsigned int th_usage[10]; /* number of ticks during which n perdeciles + * of available threads were in use */ + unsigned int th_fullcnt; /* number of times last free thread was used */ + unsigned int ra_size; /* size of ra cache */ + unsigned int ra_depth[11]; /* number of times ra entry was found that deep + * in the cache (10percentiles). [10] = not found */ +#ifdef CONFIG_NFSD_V4 + unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ +#endif + +}; + + +extern struct nfsd_stats nfsdstats; +extern struct svc_stat nfsd_svcstats; + +void nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +#endif /* _NFSD_STATS_H */ diff --git a/include/linux/nfsd/stats.h b/include/linux/nfsd/stats.h deleted file mode 100644 index e75b2544ff12..000000000000 --- a/include/linux/nfsd/stats.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * linux/include/linux/nfsd/stats.h - * - * Statistics for NFS server. - * - * Copyright (C) 1995, 1996 Olaf Kirch - */ -#ifndef LINUX_NFSD_STATS_H -#define LINUX_NFSD_STATS_H - -#include - - -struct nfsd_stats { - unsigned int rchits; /* repcache hits */ - unsigned int rcmisses; /* repcache hits */ - unsigned int rcnocache; /* uncached reqs */ - unsigned int fh_stale; /* FH stale error */ - unsigned int fh_lookup; /* dentry cached */ - unsigned int fh_anon; /* anon file dentry returned */ - unsigned int fh_nocache_dir; /* filehandle not found in dcache */ - unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ - unsigned int io_read; /* bytes returned to read requests */ - unsigned int io_write; /* bytes passed in write requests */ - unsigned int th_cnt; /* number of available threads */ - unsigned int th_usage[10]; /* number of ticks during which n perdeciles - * of available threads were in use */ - unsigned int th_fullcnt; /* number of times last free thread was used */ - unsigned int ra_size; /* size of ra cache */ - unsigned int ra_depth[11]; /* number of times ra entry was found that deep - * in the cache (10percentiles). [10] = not found */ -#ifdef CONFIG_NFSD_V4 - unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ -#endif - -}; - - -extern struct nfsd_stats nfsdstats; -extern struct svc_stat nfsd_svcstats; - -void nfsd_stat_init(void); -void nfsd_stat_shutdown(void); - -#endif /* LINUX_NFSD_STATS_H */ -- cgit v1.2.3-71-gd317 From 6f226e2ab1b895c8685e868af0a5f797fcaaaf57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2014 19:37:16 +0200 Subject: nfsd: remove There is almost nothing left in it, just merge it into the only file that includes it. Signed-off-by: Christoph Hellwig Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfsd.h | 9 ++++++++- include/linux/nfsd/debug.h | 19 ------------------- 2 files changed, 8 insertions(+), 20 deletions(-) delete mode 100644 include/linux/nfsd/debug.h (limited to 'include/linux') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7a07f9c6ee78..e9f2fd42d184 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -18,11 +18,18 @@ #include #include -#include +#include #include "stats.h" #include "export.h" +#undef ifdebug +#ifdef NFSD_DEBUG +# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) +#else +# define ifdebug(flag) if (0) +#endif + /* * nfsd version */ diff --git a/include/linux/nfsd/debug.h b/include/linux/nfsd/debug.h deleted file mode 100644 index 19ef8375b577..000000000000 --- a/include/linux/nfsd/debug.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * linux/include/linux/nfsd/debug.h - * - * Debugging-related stuff for nfsd - * - * Copyright (C) 1995 Olaf Kirch - */ -#ifndef LINUX_NFSD_DEBUG_H -#define LINUX_NFSD_DEBUG_H - -#include - -# undef ifdebug - # ifdef NFSD_DEBUG -# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) -# else -# define ifdebug(flag) if (0) -# endif -#endif /* LINUX_NFSD_DEBUG_H */ -- cgit v1.2.3-71-gd317 From 6403eb1f646a49cc92f25c08f8716f8870a4a865 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Apr 2014 19:59:52 +0800 Subject: f2fs: introduce helper macro ADDRS_PER_PAGE() Introduce helper macro ADDRS_PER_PAGE() to get the number of address pointers in a direct node or inode. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++---- fs/f2fs/file.c | 5 +---- fs/f2fs/recovery.c | 5 +---- include/linux/f2fs_fs.h | 3 +++ 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0147de7e3973..273fe1631af9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -652,8 +652,7 @@ static int get_data_block(struct inode *inode, sector_t iblock, goto put_out; } - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); bh_result->b_size = (((size_t)1) << blkbits); dn.ofs_in_node++; pgofs++; @@ -675,8 +674,7 @@ get_next: if (dn.data_blkaddr == NEW_ADDR) goto put_out; - end_offset = IS_INODE(dn.node_page) ?
- ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } if (maxblocks > (bh_result->b_size >> blkbits)) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 60e7d5448a1d..bb365c932555 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -288,10 +288,7 @@ int truncate_blocks(struct inode *inode, u64 from) return err; } - if (IS_INODE(dn.node_page)) - count = ADDRS_PER_INODE(F2FS_I(inode)); - else - count = ADDRS_PER_BLOCK; + count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); count -= dn.ofs_in_node; f2fs_bug_on(count < 0); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9eb6487f383d..be1e3e881725 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -301,10 +301,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; start = start_bidx_of_node(ofs_of_node(page), fi); - if (IS_INODE(page)) - end = start + ADDRS_PER_INODE(fi); - else - end = start + ADDRS_PER_BLOCK; + end = start + ADDRS_PER_PAGE(page, fi); f2fs_lock_op(sbi); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index df53e1753a76..8c03f71307c6 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -146,6 +146,9 @@ struct f2fs_extent { #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ +#define ADDRS_PER_PAGE(page, fi) \ + (IS_INODE(page) ? ADDRS_PER_INODE(fi) : ADDRS_PER_BLOCK) + #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) #define NODE_IND1_BLOCK (DEF_ADDRS_PER_INODE + 3) -- cgit v1.2.3-71-gd317 From 257462dbf3ed233de0dc2e489dcc58579535b33f Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Thu, 27 Feb 2014 14:53:34 +0900 Subject: pwm-backlight: switch to gpiod interface Switch to the new gpiod interface, which allows GPIO properties such as active low to be handled transparently and removes a whole bunch of code. There are still a couple of users of this driver that rely on passing the enable GPIO number through platform data, so a fallback mechanism using a GPIO number is still available to avoid breaking them. It will be removed once current users have switched to the GPIO lookup tables provided by the gpiod interface. Signed-off-by: Alexandre Courbot Signed-off-by: Thierry Reding --- drivers/video/backlight/pwm_bl.c | 69 +++++++++++++++++----------------------- include/linux/pwm_backlight.h | 5 +-- 2 files changed, 30 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index fa7f5c35b7fb..cc49347bbcc0 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -10,8 +10,8 @@ * published by the Free Software Foundation.
*/ +#include #include -#include #include #include #include @@ -32,8 +32,7 @@ struct pwm_bl_data { unsigned int *levels; bool enabled; struct regulator *power_supply; - int enable_gpio; - unsigned long enable_gpio_flags; + struct gpio_desc *enable_gpio; unsigned int scale; int (*notify)(struct device *, int brightness); @@ -54,12 +53,8 @@ static void pwm_backlight_power_on(struct pwm_bl_data *pb, int brightness) if (err < 0) dev_err(pb->dev, "failed to enable power supply\n"); - if (gpio_is_valid(pb->enable_gpio)) { - if (pb->enable_gpio_flags & PWM_BACKLIGHT_GPIO_ACTIVE_LOW) - gpio_set_value(pb->enable_gpio, 0); - else - gpio_set_value(pb->enable_gpio, 1); - } + if (pb->enable_gpio) + gpiod_set_value(pb->enable_gpio, 1); pwm_enable(pb->pwm); pb->enabled = true; @@ -73,12 +68,8 @@ static void pwm_backlight_power_off(struct pwm_bl_data *pb) pwm_config(pb->pwm, 0, pb->period); pwm_disable(pb->pwm); - if (gpio_is_valid(pb->enable_gpio)) { - if (pb->enable_gpio_flags & PWM_BACKLIGHT_GPIO_ACTIVE_LOW) - gpio_set_value(pb->enable_gpio, 1); - else - gpio_set_value(pb->enable_gpio, 0); - } + if (pb->enable_gpio) + gpiod_set_value(pb->enable_gpio, 0); regulator_disable(pb->power_supply); pb->enabled = false; @@ -148,7 +139,6 @@ static int pwm_backlight_parse_dt(struct device *dev, struct platform_pwm_backlight_data *data) { struct device_node *node = dev->of_node; - enum of_gpio_flags flags; struct property *prop; int length; u32 value; @@ -189,14 +179,6 @@ static int pwm_backlight_parse_dt(struct device *dev, data->max_brightness--; } - data->enable_gpio = of_get_named_gpio_flags(node, "enable-gpios", 0, - &flags); - if (data->enable_gpio == -EPROBE_DEFER) - return -EPROBE_DEFER; - - if (gpio_is_valid(data->enable_gpio) && (flags & OF_GPIO_ACTIVE_LOW)) - data->enable_gpio_flags |= PWM_BACKLIGHT_GPIO_ACTIVE_LOW; - return 0; } @@ -256,8 +238,6 @@ static int pwm_backlight_probe(struct platform_device *pdev) } else pb->scale = data->max_brightness; - pb->enable_gpio = data->enable_gpio; - pb->enable_gpio_flags = data->enable_gpio_flags; pb->notify = data->notify; pb->notify_after = data->notify_after; pb->check_fb = data->check_fb; @@ -265,26 +245,38 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->dev = &pdev->dev; pb->enabled = false; - if (gpio_is_valid(pb->enable_gpio)) { - unsigned long flags; - - if (pb->enable_gpio_flags & PWM_BACKLIGHT_GPIO_ACTIVE_LOW) - flags = GPIOF_OUT_INIT_HIGH; + pb->enable_gpio = devm_gpiod_get(&pdev->dev, "enable"); + if (IS_ERR(pb->enable_gpio)) { + ret = PTR_ERR(pb->enable_gpio); + if (ret == -ENOENT) + pb->enable_gpio = NULL; else - flags = GPIOF_OUT_INIT_LOW; + goto err_alloc; + } - ret = gpio_request_one(pb->enable_gpio, flags, "enable"); + /* + * Compatibility fallback for drivers still using the integer GPIO + * platform data. Must go away soon. 
+ */ + if (!pb->enable_gpio && gpio_is_valid(data->enable_gpio)) { + ret = devm_gpio_request_one(&pdev->dev, data->enable_gpio, + GPIOF_OUT_INIT_HIGH, "enable"); if (ret < 0) { dev_err(&pdev->dev, "failed to request GPIO#%d: %d\n", - pb->enable_gpio, ret); + data->enable_gpio, ret); goto err_alloc; } + + pb->enable_gpio = gpio_to_desc(data->enable_gpio); } + if (pb->enable_gpio) + gpiod_direction_output(pb->enable_gpio, 1); + pb->power_supply = devm_regulator_get(&pdev->dev, "power"); if (IS_ERR(pb->power_supply)) { ret = PTR_ERR(pb->power_supply); - goto err_gpio; + goto err_alloc; } pb->pwm = devm_pwm_get(&pdev->dev, NULL); @@ -295,7 +287,7 @@ static int pwm_backlight_probe(struct platform_device *pdev) if (IS_ERR(pb->pwm)) { dev_err(&pdev->dev, "unable to request legacy PWM\n"); ret = PTR_ERR(pb->pwm); - goto err_gpio; + goto err_alloc; } } @@ -320,7 +312,7 @@ static int pwm_backlight_probe(struct platform_device *pdev) if (IS_ERR(bl)) { dev_err(&pdev->dev, "failed to register backlight\n"); ret = PTR_ERR(bl); - goto err_gpio; + goto err_alloc; } if (data->dft_brightness > data->max_brightness) { @@ -336,9 +328,6 @@ static int pwm_backlight_probe(struct platform_device *pdev) platform_set_drvdata(pdev, bl); return 0; -err_gpio: - if (gpio_is_valid(pb->enable_gpio)) - gpio_free(pb->enable_gpio); err_alloc: if (data->exit) data->exit(&pdev->dev); diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h index 2de2e275b2cb..efdd9227a49c 100644 --- a/include/linux/pwm_backlight.h +++ b/include/linux/pwm_backlight.h @@ -6,9 +6,6 @@ #include -/* TODO: convert to gpiod_*() API once it has been merged */ -#define PWM_BACKLIGHT_GPIO_ACTIVE_LOW (1 << 0) - struct platform_pwm_backlight_data { int pwm_id; unsigned int max_brightness; @@ -16,8 +13,8 @@ struct platform_pwm_backlight_data { unsigned int lth_brightness; unsigned int pwm_period_ns; unsigned int *levels; + /* TODO remove once all users are switched to gpiod_* API */ int enable_gpio; - unsigned long enable_gpio_flags; int (*init)(struct device *dev); int (*notify)(struct device *dev, int brightness); void (*notify_after)(struct device *dev, int brightness); -- cgit v1.2.3-71-gd317 From 2b53f41fa8604845f4f7c538723694a453088b15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 May 2014 09:21:56 -0400 Subject: cgroup: remove unused CGRP_SANE_BEHAVIOR This cgroup flag has never been used. Only CGRP_ROOT_SANE_BEHAVIOR is used. Remove it. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2dfabb3b749a..f482f95c2c72 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -140,8 +140,6 @@ enum { * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, - /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */ - CGRP_SANE_BEHAVIOR, }; struct cgroup { -- cgit v1.2.3-71-gd317 From 7c65bbc7dcface00b295bbd18bce82fe1db3d633 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 09:26:30 -0400 Subject: tracing: Add trace_<tracepoint>_enabled() function There are some code paths in the kernel that need to do some preparation before calling a tracepoint. As that code is worthless overhead when the tracepoint is not enabled, it would be prudent to have that code only run when the tracepoint is active.
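For illustration, a minimal sketch of the kind of preparation this refers to (compute_stats(), trace_foo_complete() and dev are invented names, not taken from this patch):

	/*
	 * compute_stats() may be expensive, and its result is only
	 * consumed by the tracepoint, so running it unconditionally is
	 * wasted work whenever trace_foo_complete() is disabled.
	 */
	u64 stats = compute_stats(dev);
	trace_foo_complete(dev, stats);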
To accomplish this, all tracepoints now get a static inline function called "trace_<tracepoint>_enabled()" which returns true when the tracepoint is enabled and false otherwise. As an added bonus, that function uses the static_key of the tracepoint such that no branch is needed. if (trace_mytracepoint_enabled()) { arg = process_tp_arg(); trace_mytracepoint(arg); } This will keep the "process_tp_arg()" (which may be expensive to run) from being executed when the tracepoint isn't enabled. It's best to encapsulate the tracepoint itself in the if statement just to avoid races. For example, if you had: if (trace_mytracepoint_enabled()) arg = process_tp_arg(); trace_mytracepoint(arg); There's a chance that the tracepoint could be enabled just after the if statement, and arg will be undefined when calling the tracepoint. Link: http://lkml.kernel.org/r/20140506094407.507b6435@gandalf.local.home Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- Documentation/trace/tracepoints.txt | 24 ++++++++++++++++++++++++ include/linux/tracepoint.h | 10 ++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/Documentation/trace/tracepoints.txt b/Documentation/trace/tracepoints.txt index 6b018b53177a..a3efac621c5a 100644 --- a/Documentation/trace/tracepoints.txt +++ b/Documentation/trace/tracepoints.txt @@ -115,6 +115,30 @@ If the tracepoint has to be used in kernel modules, an EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be used to export the defined tracepoints. +If you need to do a bit of work for a tracepoint parameter, and +that work is only used for the tracepoint, that work can be encapsulated +within an if statement with the following: + + if (trace_foo_bar_enabled()) { + int i; + int tot = 0; + + for (i = 0; i < count; i++) + tot += calculate_nuggets(); + + trace_foo_bar(tot); + } + +All trace_<tracepoint>() calls have a matching trace_<tracepoint>_enabled() +function defined that returns true if the tracepoint is enabled and +false otherwise. The trace_<tracepoint>() should always be within the +block of the if (trace_<tracepoint>_enabled()) to prevent races between +the tracepoint being enabled and the check being seen. + +The advantage of using the trace_<tracepoint>_enabled() is that it uses +the static_key of the tracepoint to allow the if statement to be implemented +with jump labels and avoid conditional branches. + Note: The convenience macro TRACE_EVENT provides an alternative way to define tracepoints.
Check http://lwn.net/Articles/379903, http://lwn.net/Articles/381064 and http://lwn.net/Articles/383362 diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 9d30ee469c2a..2e2a5f7717e5 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -185,6 +185,11 @@ extern void syscall_unregfunc(void); static inline void \ check_trace_callback_type_##name(void (*cb)(data_proto)) \ { \ + } \ + static inline bool \ + trace_##name##_enabled(void) \ + { \ + return static_key_false(&__tracepoint_##name.key); \ } /* @@ -230,6 +235,11 @@ extern void syscall_unregfunc(void); } \ static inline void check_trace_callback_type_##name(void (*cb)(data_proto)) \ { \ + } \ + static inline bool \ + trace_##name##_enabled(void) \ + { \ + return false; \ } #define DEFINE_TRACE_FN(name, reg, unreg) -- cgit v1.2.3-71-gd317 From 552a515707a5caf9fa9f3620d68fc28146c6b1b9 Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Wed, 7 May 2014 09:28:31 +0200 Subject: ath9k: Allow platform override without EEPROM override Add a new platform data flag "use_eeprom" that indicates that the eeprom found on the card itself should be used instead of the one present in the platform data. This allows overriding the MAC address of a PCI card while preserving the eeprom data from the card itself. The default behavior is preserved. Signed-off-by: Helmut Schaa Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/init.c | 2 +- include/linux/ath9k_platform.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index a6e273a2c44b..bcc7cfb1866d 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -508,7 +508,7 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc, sc->tx99_power = MAX_RATE_POWER + 1; init_waitqueue_head(&sc->tx_wait); - if (!pdata) { + if (!pdata || pdata->use_eeprom) { ah->ah_flags |= AH_USE_EEPROM; sc->sc_ah->led_pin = -1; } else { diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h index 8598f8eacb20..a495a959e8a7 100644 --- a/include/linux/ath9k_platform.h +++ b/include/linux/ath9k_platform.h @@ -36,6 +36,8 @@ struct ath9k_platform_data { int (*get_mac_revision)(void); int (*external_reset)(void); + + bool use_eeprom; }; #endif /* _LINUX_ATH9K_PLATFORM_H */ -- cgit v1.2.3-71-gd317 From db6d8cc00773d8ef5a8b421b42a5ded235307b10 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Wed, 26 Mar 2014 02:27:02 -0700 Subject: dell-led: add mic mute led interface This patch provides LED functionality similar to that of commit 420f973 ("thinkpad-acpi: Add mute and mic-mute LED functionality"). Signed-off-by: Alex Hung Signed-off-by: Bryan Wu --- drivers/leds/dell-led.c | 171 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/dell-led.h | 10 +++ 2 files changed, 174 insertions(+), 7 deletions(-) create mode 100644 include/linux/dell-led.h (limited to 'include/linux') diff --git a/drivers/leds/dell-led.c b/drivers/leds/dell-led.c index e5c57389efd6..c36acaf566a6 100644 --- a/drivers/leds/dell-led.c +++ b/drivers/leds/dell-led.c @@ -15,12 +15,15 @@ #include #include #include +#include +#include MODULE_AUTHOR("Louis Davis/Jim Dailey"); MODULE_DESCRIPTION("Dell LED Control Driver"); MODULE_LICENSE("GPL"); #define DELL_LED_BIOS_GUID "F6E4FE6E-909D-47cb-8BAB-C9F6F2F8D396" +#define DELL_APP_GUID "A80593CE-A997-11DA-B012-B622A1EF5492" MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID); /* Error Result Codes: */ @@ -39,6
+42,149 @@ MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID); #define CMD_LED_OFF 17 #define CMD_LED_BLINK 18 +struct app_wmi_args { + u16 class; + u16 selector; + u32 arg1; + u32 arg2; + u32 arg3; + u32 arg4; + u32 res1; + u32 res2; + u32 res3; + u32 res4; + char dummy[92]; +}; + +#define GLOBAL_MIC_MUTE_ENABLE 0x364 +#define GLOBAL_MIC_MUTE_DISABLE 0x365 + +struct dell_bios_data_token { + u16 tokenid; + u16 location; + u16 value; +}; + +struct __attribute__ ((__packed__)) dell_bios_calling_interface { + struct dmi_header header; + u16 cmd_io_addr; + u8 cmd_io_code; + u32 supported_cmds; + struct dell_bios_data_token damap[]; +}; + +static struct dell_bios_data_token dell_mic_tokens[2]; + +static int dell_wmi_perform_query(struct app_wmi_args *args) +{ + struct app_wmi_args *bios_return; + union acpi_object *obj; + struct acpi_buffer input; + struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL }; + acpi_status status; + u32 rc = -EINVAL; + + input.length = 128; + input.pointer = args; + + status = wmi_evaluate_method(DELL_APP_GUID, 0, 1, &input, &output); + if (!ACPI_SUCCESS(status)) + goto err_out0; + + obj = output.pointer; + if (!obj) + goto err_out0; + + if (obj->type != ACPI_TYPE_BUFFER) + goto err_out1; + + bios_return = (struct app_wmi_args *)obj->buffer.pointer; + rc = bios_return->res1; + if (rc) + goto err_out1; + + memcpy(args, bios_return, sizeof(struct app_wmi_args)); + rc = 0; + + err_out1: + kfree(obj); + err_out0: + return rc; +} + +static void __init find_micmute_tokens(const struct dmi_header *dm, void *dummy) +{ + struct dell_bios_calling_interface *calling_interface; + struct dell_bios_data_token *token; + int token_size = sizeof(struct dell_bios_data_token); + int i = 0; + + if (dm->type == 0xda && dm->length > 17) { + calling_interface = container_of(dm, + struct dell_bios_calling_interface, header); + + token = &calling_interface->damap[i]; + while (token->tokenid != 0xffff) { + if (token->tokenid == GLOBAL_MIC_MUTE_DISABLE) + memcpy(&dell_mic_tokens[0], token, token_size); + else if (token->tokenid == GLOBAL_MIC_MUTE_ENABLE) + memcpy(&dell_mic_tokens[1], token, token_size); + + i++; + token = &calling_interface->damap[i]; + } + } +} + +static int dell_micmute_led_set(int state) +{ + struct app_wmi_args args; + struct dell_bios_data_token *token; + + if (!wmi_has_guid(DELL_APP_GUID)) + return -ENODEV; + + if (state == 0 || state == 1) + token = &dell_mic_tokens[state]; + else + return -EINVAL; + + memset(&args, 0, sizeof(struct app_wmi_args)); + + args.class = 1; + args.arg1 = token->location; + args.arg2 = token->value; + + dell_wmi_perform_query(&args); + + return state; +} + +int dell_app_wmi_led_set(int whichled, int on) +{ + int state = 0; + + switch (whichled) { + case DELL_LED_MICMUTE: + state = dell_micmute_led_set(on); + break; + default: + pr_warn("led type %x is not supported\n", whichled); + break; + } + + return state; +} +EXPORT_SYMBOL_GPL(dell_app_wmi_led_set); + +static int __init dell_micmute_led_init(void) +{ + memset(dell_mic_tokens, 0, sizeof(struct dell_bios_data_token) * 2); + dmi_walk(find_micmute_tokens, NULL); + + return 0; +} + struct bios_args { unsigned char length; unsigned char result_code; @@ -181,21 +327,32 @@ static int __init dell_led_init(void) { int error = 0; - if (!wmi_has_guid(DELL_LED_BIOS_GUID)) + if (!wmi_has_guid(DELL_LED_BIOS_GUID) && !wmi_has_guid(DELL_APP_GUID)) return -ENODEV; - error = led_off(); - if (error != 0) - return -ENODEV; + if (wmi_has_guid(DELL_APP_GUID)) + error = dell_micmute_led_init(); - return 
led_classdev_register(NULL, &dell_led); + if (wmi_has_guid(DELL_LED_BIOS_GUID)) { + error = led_off(); + if (error != 0) + return -ENODEV; + + error = led_classdev_register(NULL, &dell_led); + } + + return error; } static void __exit dell_led_exit(void) { - led_classdev_unregister(&dell_led); + int error = 0; - led_off(); + if (wmi_has_guid(DELL_LED_BIOS_GUID)) { + error = led_off(); + if (error == 0) + led_classdev_unregister(&dell_led); + } } module_init(dell_led_init); diff --git a/include/linux/dell-led.h b/include/linux/dell-led.h new file mode 100644 index 000000000000..7009b8bec77b --- /dev/null +++ b/include/linux/dell-led.h @@ -0,0 +1,10 @@ +#ifndef __DELL_LED_H__ +#define __DELL_LED_H__ + +enum { + DELL_LED_MICMUTE, +}; + +int dell_app_wmi_led_set(int whichled, int on); + +#endif -- cgit v1.2.3-71-gd317 From 2ce112f1e788a695630e2c275f47d57a801673d3 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sat, 5 Apr 2014 20:16:34 -0700 Subject: leds: pca9685: Remove leds-pca9685 driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This driver is replaced by the pwm-pca9685 driver, and there are no remaining users of this driver in the current tree. So remove it. Signed-off-by: Axel Lin Acked-by: Steffen Trumtrar Acked-by: Maximilian Güntner Signed-off-by: Bryan Wu --- drivers/leds/Kconfig | 10 -- drivers/leds/Makefile | 1 - drivers/leds/leds-pca9685.c | 213 ----------------------------- include/linux/platform_data/leds-pca9685.h | 35 ----- 4 files changed, 259 deletions(-) delete mode 100644 drivers/leds/leds-pca9685.c delete mode 100644 include/linux/platform_data/leds-pca9685.h (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 6de9dfbf61c1..6713dbb6bfda 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -300,16 +300,6 @@ config LEDS_PCA963X LED driver chip accessed via the I2C bus. Supported devices include PCA9633 and PCA9634 -config LEDS_PCA9685 - tristate "LED support for PCA9685 I2C chip" - depends on LEDS_CLASS - depends on I2C - help - This option enables support for LEDs connected to the PCA9685 - LED driver chip accessed via the I2C bus. - The PCA9685 offers 12-bit PWM (4095 levels of brightness) on - 16 individual channels. - config LEDS_WM831X_STATUS tristate "LED support for status LEDs on WM831x PMICs" depends on LEDS_CLASS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 3cd76dbd9be2..8979b0b2c85e 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -36,7 +36,6 @@ obj-$(CONFIG_LEDS_OT200) += leds-ot200.o obj-$(CONFIG_LEDS_FSG) += leds-fsg.o obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o obj-$(CONFIG_LEDS_PCA963X) += leds-pca963x.o -obj-$(CONFIG_LEDS_PCA9685) += leds-pca9685.o obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o obj-$(CONFIG_LEDS_DA9052) += leds-da9052.o obj-$(CONFIG_LEDS_WM831X_STATUS) += leds-wm831x-status.o diff --git a/drivers/leds/leds-pca9685.c b/drivers/leds/leds-pca9685.c deleted file mode 100644 index 6e1ef3a9d6ef..000000000000 --- a/drivers/leds/leds-pca9685.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright 2013 Maximilian Güntner - * - * This file is subject to the terms and conditions of version 2 of - * the GNU General Public License. See the file COPYING in the main - * directory of this archive for more details. - * - * Based on leds-pca963x.c driver by - * Peter Meerwald - * - * Driver for the NXP PCA9685 12-Bit PWM LED driver chip.
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* Register Addresses */ -#define PCA9685_MODE1 0x00 -#define PCA9685_MODE2 0x01 -#define PCA9685_LED0_ON_L 0x06 -#define PCA9685_ALL_LED_ON_L 0xFA - -/* MODE1 Register */ -#define PCA9685_ALLCALL 0x00 -#define PCA9685_SLEEP 0x04 -#define PCA9685_AI 0x05 - -/* MODE2 Register */ -#define PCA9685_INVRT 0x04 -#define PCA9685_OUTDRV 0x02 - -static const struct i2c_device_id pca9685_id[] = { - { "pca9685", 0 }, - { } -}; -MODULE_DEVICE_TABLE(i2c, pca9685_id); - -struct pca9685_led { - struct i2c_client *client; - struct work_struct work; - u16 brightness; - struct led_classdev led_cdev; - int led_num; /* 0-15 */ - char name[32]; -}; - -static void pca9685_write_msg(struct i2c_client *client, u8 *buf, u8 len) -{ - struct i2c_msg msg = { - .addr = client->addr, - .flags = 0x00, - .len = len, - .buf = buf - }; - i2c_transfer(client->adapter, &msg, 1); -} - -static void pca9685_all_off(struct i2c_client *client) -{ - u8 i2c_buffer[5] = {PCA9685_ALL_LED_ON_L, 0x00, 0x00, 0x00, 0x10}; - pca9685_write_msg(client, i2c_buffer, 5); -} - -static void pca9685_led_work(struct work_struct *work) -{ - struct pca9685_led *pca9685; - u8 i2c_buffer[5]; - - pca9685 = container_of(work, struct pca9685_led, work); - i2c_buffer[0] = PCA9685_LED0_ON_L + 4 * pca9685->led_num; - /* - * 4095 is the maximum brightness, so we set the ON time to 0x1000 - * which disables the PWM generator for that LED - */ - if (pca9685->brightness == 4095) - *((__le16 *)(i2c_buffer+1)) = cpu_to_le16(0x1000); - else - *((__le16 *)(i2c_buffer+1)) = 0x0000; - - if (pca9685->brightness == 0) - *((__le16 *)(i2c_buffer+3)) = cpu_to_le16(0x1000); - else if (pca9685->brightness == 4095) - *((__le16 *)(i2c_buffer+3)) = 0x0000; - else - *((__le16 *)(i2c_buffer+3)) = cpu_to_le16(pca9685->brightness); - - pca9685_write_msg(pca9685->client, i2c_buffer, 5); -} - -static void pca9685_led_set(struct led_classdev *led_cdev, - enum led_brightness value) -{ - struct pca9685_led *pca9685; - pca9685 = container_of(led_cdev, struct pca9685_led, led_cdev); - pca9685->brightness = value; - - schedule_work(&pca9685->work); -} - -static int pca9685_probe(struct i2c_client *client, - const struct i2c_device_id *id) -{ - struct pca9685_led *pca9685; - struct pca9685_platform_data *pdata; - int err; - u8 i; - - pdata = dev_get_platdata(&client->dev); - if (pdata) { - if (pdata->leds.num_leds < 1 || pdata->leds.num_leds > 15) { - dev_err(&client->dev, "board info must claim 1-16 LEDs"); - return -EINVAL; - } - } - - pca9685 = devm_kzalloc(&client->dev, 16 * sizeof(*pca9685), GFP_KERNEL); - if (!pca9685) - return -ENOMEM; - - i2c_set_clientdata(client, pca9685); - pca9685_all_off(client); - - for (i = 0; i < 16; i++) { - pca9685[i].client = client; - pca9685[i].led_num = i; - pca9685[i].name[0] = '\0'; - if (pdata && i < pdata->leds.num_leds) { - if (pdata->leds.leds[i].name) - strncpy(pca9685[i].name, - pdata->leds.leds[i].name, - sizeof(pca9685[i].name)-1); - if (pdata->leds.leds[i].default_trigger) - pca9685[i].led_cdev.default_trigger = - pdata->leds.leds[i].default_trigger; - } - if (strlen(pca9685[i].name) == 0) { - /* - * Write adapter and address to the name as well. - * Otherwise multiple chips attached to one host would - * not work. 
- */ - snprintf(pca9685[i].name, sizeof(pca9685[i].name), - "pca9685:%d:x%.2x:%d", - client->adapter->nr, client->addr, i); - } - pca9685[i].led_cdev.name = pca9685[i].name; - pca9685[i].led_cdev.max_brightness = 0xfff; - pca9685[i].led_cdev.brightness_set = pca9685_led_set; - - INIT_WORK(&pca9685[i].work, pca9685_led_work); - err = led_classdev_register(&client->dev, &pca9685[i].led_cdev); - if (err < 0) - goto exit; - } - - if (pdata) - i2c_smbus_write_byte_data(client, PCA9685_MODE2, - pdata->outdrv << PCA9685_OUTDRV | - pdata->inverted << PCA9685_INVRT); - else - i2c_smbus_write_byte_data(client, PCA9685_MODE2, - PCA9685_TOTEM_POLE << PCA9685_OUTDRV); - /* Enable Auto-Increment, enable oscillator, ALLCALL/SUBADDR disabled */ - i2c_smbus_write_byte_data(client, PCA9685_MODE1, BIT(PCA9685_AI)); - - return 0; - -exit: - while (i--) { - led_classdev_unregister(&pca9685[i].led_cdev); - cancel_work_sync(&pca9685[i].work); - } - return err; -} - -static int pca9685_remove(struct i2c_client *client) -{ - struct pca9685_led *pca9685 = i2c_get_clientdata(client); - u8 i; - - for (i = 0; i < 16; i++) { - led_classdev_unregister(&pca9685[i].led_cdev); - cancel_work_sync(&pca9685[i].work); - } - pca9685_all_off(client); - return 0; -} - -static struct i2c_driver pca9685_driver = { - .driver = { - .name = "leds-pca9685", - .owner = THIS_MODULE, - }, - .probe = pca9685_probe, - .remove = pca9685_remove, - .id_table = pca9685_id, -}; - -module_i2c_driver(pca9685_driver); - -MODULE_AUTHOR("Maximilian Güntner "); -MODULE_DESCRIPTION("PCA9685 LED Driver"); -MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/leds-pca9685.h b/include/linux/platform_data/leds-pca9685.h deleted file mode 100644 index 778e9e4249cc..000000000000 --- a/include/linux/platform_data/leds-pca9685.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2013 Maximilian Güntner - * - * This file is subject to the terms and conditions of version 2 of - * the GNU General Public License. See the file COPYING in the main - * directory of this archive for more details. - * - * Based on leds-pca963x.h by Peter Meerwald - * - * LED driver for the NXP PCA9685 PWM chip - * - */ - -#ifndef __LINUX_PCA9685_H -#define __LINUX_PCA9685_H - -#include - -enum pca9685_outdrv { - PCA9685_OPEN_DRAIN, - PCA9685_TOTEM_POLE, -}; - -enum pca9685_inverted { - PCA9685_NOT_INVERTED, - PCA9685_INVERTED, -}; - -struct pca9685_platform_data { - struct led_platform_data leds; - enum pca9685_outdrv outdrv; - enum pca9685_inverted inverted; -}; - -#endif /* __LINUX_PCA9685_H */ -- cgit v1.2.3-71-gd317 From 2070d50e1cbe3d7f157cbf8e63279c893f375d7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 May 2014 15:11:53 -0400 Subject: percpu-refcount: rename percpu_ref_tryget() to percpu_ref_tryget_live() percpu_ref_tryget() is different from the usual tryget semantics in that it fails if the refcnt is in its dying stage even if the refcnt hasn't reached zero yet. We're about to introduce the more conventional tryget and the current one has only one user. Let's rename it to percpu_ref_tryget_live() so that it explicitly signifies the peculiarities of its semantics. This is pure rename. 
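For illustration, a minimal sketch of the semantics described above (the obj structure and the use() helper are invented; the percpu_ref calls are the real API):

	/* obj embeds a struct percpu_ref that may already be killed. */
	if (percpu_ref_tryget_live(&obj->ref)) {
		/*
		 * Succeeds only while the ref is live: once
		 * percpu_ref_kill() has been called, this fails even
		 * though outstanding references keep the count above
		 * zero. The conventional tryget introduced in the next
		 * patch keeps succeeding until the count reaches zero.
		 */
		use(obj);
		percpu_ref_put(&obj->ref);
	}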
Signed-off-by: Tejun Heo Acked-by: Kent Overstreet --- include/linux/cgroup.h | 2 +- include/linux/percpu-refcount.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c2515851c1aa..549aed8de32b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -101,7 +101,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) return true; - return percpu_ref_tryget(&css->refcnt); + return percpu_ref_tryget_live(&css->refcnt); } /** diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 95961f0bf62d..e22d15597cc3 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -118,7 +118,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) } /** - * percpu_ref_tryget - try to increment a percpu refcount + * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless it has already been killed. Returns @@ -129,7 +129,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) * used. After the confirm_kill callback is invoked, it's guaranteed that * no new reference will be given out by percpu_ref_tryget(). */ -static inline bool percpu_ref_tryget(struct percpu_ref *ref) +static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { unsigned __percpu *pcpu_count; int ret = false; -- cgit v1.2.3-71-gd317 From 4fb6e25049cb6fa0accc7f1b7c192b952fad7ac8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 May 2014 15:11:53 -0400 Subject: percpu-refcount: implement percpu_ref_tryget() Implement percpu_ref_tryget() which fails if the refcnt already reached zero. Note that this is different from the recently renamed percpu_ref_tryget_live() which fails if the refcnt has been killed and is draining the remaining references. percpu_ref_tryget() succeeds on a killed refcnt as long as its current refcnt is above zero. Signed-off-by: Tejun Heo Acked-by: Kent Overstreet --- include/linux/percpu-refcount.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index e22d15597cc3..dba35c411e8c 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -117,6 +117,36 @@ static inline void percpu_ref_get(struct percpu_ref *ref) rcu_read_unlock_sched(); } +/** + * percpu_ref_tryget - try to increment a percpu refcount + * @ref: percpu_ref to try-get + * + * Increment a percpu refcount unless its count already reached zero. + * Returns %true on success; %false on failure. + * + * The caller is responsible for ensuring that @ref stays accessible. + */ +static inline bool percpu_ref_tryget(struct percpu_ref *ref) +{ + unsigned __percpu *pcpu_count; + int ret = false; + + rcu_read_lock_sched(); + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) { + __this_cpu_inc(*pcpu_count); + ret = true; + } else { + ret = atomic_inc_not_zero(&ref->count); + } + + rcu_read_unlock_sched(); + + return ret; +} + /** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get @@ -128,6 +158,8 @@ static inline void percpu_ref_get(struct percpu_ref *ref) * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be * used. 
After the confirm_kill callback is invoked, it's guaranteed that * no new reference will be given out by percpu_ref_tryget(). + * + * The caller is responsible for ensuring that @ref stays accessible. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { -- cgit v1.2.3-71-gd317 From 49c50b97b5522a987b80fbbf9d9869deee8d23b0 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 6 May 2014 16:02:19 -0700 Subject: mtd: nand: refactor erase_cmd() to return chip status The nand_chip::erase_cmd callback previously served a dual purpose; for one, it allowed a per-flash-chip override, so that AG-AND devices could use a different erase command than other NAND. These AND devices were dropped in commit 14c6578683367b1e7af0c3c09e872b45a45183a7 (mtd: nand: remove AG-AND support). On the other hand, some drivers (denali and doc-g4) need to use this sort of callback to implement controller-specific erase operations. To make the latter operation easier for some drivers (e.g., ST's new BCH NAND driver), it helps if the command dispatch and wait functions can be lumped together, rather than called separately. This patch does two things: 1. Pull the call to chip->waitfunc() into chip->erase_cmd(), and return the status from this callback 2. Rename erase_cmd() to just erase(), since this callback does a little more than just send a command Signed-off-by: Brian Norris Tested-by: Lee Jones --- drivers/mtd/nand/denali.c | 7 +++---- drivers/mtd/nand/docg4.c | 6 ++++-- drivers/mtd/nand/nand_base.c | 14 +++++++------- include/linux/mtd/nand.h | 5 ++--- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c index c07cd573ad3a..9f2012a3e764 100644 --- a/drivers/mtd/nand/denali.c +++ b/drivers/mtd/nand/denali.c @@ -1233,7 +1233,7 @@ static int denali_waitfunc(struct mtd_info *mtd, struct nand_chip *chip) return status; } -static void denali_erase(struct mtd_info *mtd, int page) +static int denali_erase(struct mtd_info *mtd, int page) { struct denali_nand_info *denali = mtd_to_denali(mtd); @@ -1250,8 +1250,7 @@ static void denali_erase(struct mtd_info *mtd, int page) irq_status = wait_for_irq(denali, INTR_STATUS__ERASE_COMP | INTR_STATUS__ERASE_FAIL); - denali->status = (irq_status & INTR_STATUS__ERASE_FAIL) ? - NAND_STATUS_FAIL : PASS; + return (irq_status & INTR_STATUS__ERASE_FAIL) ? 
NAND_STATUS_FAIL : PASS; } static void denali_cmdfunc(struct mtd_info *mtd, unsigned int cmd, int col, @@ -1584,7 +1583,7 @@ int denali_init(struct denali_nand_info *denali) denali->nand.ecc.write_page_raw = denali_write_page_raw; denali->nand.ecc.read_oob = denali_read_oob; denali->nand.ecc.write_oob = denali_write_oob; - denali->nand.erase_cmd = denali_erase; + denali->nand.erase = denali_erase; if (nand_scan_tail(&denali->mtd)) { ret = -ENXIO; diff --git a/drivers/mtd/nand/docg4.c b/drivers/mtd/nand/docg4.c index 1b0265e85a06..ce24637e14f1 100644 --- a/drivers/mtd/nand/docg4.c +++ b/drivers/mtd/nand/docg4.c @@ -872,7 +872,7 @@ static int docg4_read_oob(struct mtd_info *mtd, struct nand_chip *nand, return 0; } -static void docg4_erase_block(struct mtd_info *mtd, int page) +static int docg4_erase_block(struct mtd_info *mtd, int page) { struct nand_chip *nand = mtd->priv; struct docg4_priv *doc = nand->priv; @@ -916,6 +916,8 @@ static void docg4_erase_block(struct mtd_info *mtd, int page) write_nop(docptr); poll_status(doc); write_nop(docptr); + + return nand->waitfunc(mtd, nand); } static int write_page(struct mtd_info *mtd, struct nand_chip *nand, @@ -1236,7 +1238,7 @@ static void __init init_mtd_structs(struct mtd_info *mtd) nand->block_markbad = docg4_block_markbad; nand->read_buf = docg4_read_buf; nand->write_buf = docg4_write_buf16; - nand->erase_cmd = docg4_erase_block; + nand->erase = docg4_erase_block; nand->ecc.read_page = docg4_read_page; nand->ecc.write_page = docg4_write_page; nand->ecc.read_page_raw = docg4_read_page_raw; diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index f6c5685b79a6..7853b9b0a05e 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2617,18 +2617,20 @@ out: } /** - * single_erase_cmd - [GENERIC] NAND standard block erase command function + * single_erase - [GENERIC] NAND standard block erase command function * @mtd: MTD device structure * @page: the page address of the block which will be erased * - * Standard erase command for NAND chips. + * Standard erase command for NAND chips. Returns NAND status. */ -static void single_erase_cmd(struct mtd_info *mtd, int page) +static int single_erase(struct mtd_info *mtd, int page) { struct nand_chip *chip = mtd->priv; /* Send commands to erase a block */ chip->cmdfunc(mtd, NAND_CMD_ERASE1, -1, page); chip->cmdfunc(mtd, NAND_CMD_ERASE2, -1, -1); + + return chip->waitfunc(mtd, chip); } /** @@ -2709,9 +2711,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, (page + pages_per_block)) chip->pagebuf = -1; - chip->erase_cmd(mtd, page & chip->pagemask); - - status = chip->waitfunc(mtd, chip); + status = chip->erase(mtd, page & chip->pagemask); /* * See if operation failed and additional status checks are @@ -3684,7 +3684,7 @@ ident_done: } chip->badblockbits = 8; - chip->erase_cmd = single_erase_cmd; + chip->erase = single_erase; /* Do not replace user supplied command function! */ if (mtd->writesize > 512 && chip->cmdfunc == nand_command) diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 450d61ec7f06..7a922e6c4e4b 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -552,8 +552,7 @@ struct nand_buffers { * @ecc: [BOARDSPECIFIC] ECC control structure * @buffers: buffer structure for read/write * @hwcontrol: platform-specific hardware control structure - * @erase_cmd: [INTERN] erase command write function, selectable due - * to AND support. 
+ * @erase: [REPLACEABLE] erase function * @scan_bbt: [REPLACEABLE] function to scan bad block table * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transferring * data from array to read regs (tR). @@ -637,7 +636,7 @@ struct nand_chip { void (*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, int page_addr); int(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this); - void (*erase_cmd)(struct mtd_info *mtd, int page); + int (*erase)(struct mtd_info *mtd, int page); int (*scan_bbt)(struct mtd_info *mtd); int (*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page); -- cgit v1.2.3-71-gd317 From 9739eef13c926645fbf88bcb77e66442fa75d688 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 8 May 2014 14:10:51 -0700 Subject: net: filter: make BPF conversion more readable Introduce BPF helper macros to define instructions (similar to old BPF_STMT/BPF_JUMP macros) Use them while converting classic BPF to internal and in BPF testsuite later. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 51 ++++++++++++++++++ net/core/filter.c | 142 +++++++++++++++++-------------------------------- 2 files changed, 101 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index ed1efab10b8f..4457b383961c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -79,6 +79,57 @@ enum { /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 +/* bpf_add|sub|...: a += x, bpf_mov: a = x */ +#define BPF_ALU64_REG(op, a, x) \ + ((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_X, a, x, 0, 0}) +#define BPF_ALU32_REG(op, a, x) \ + ((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_X, a, x, 0, 0}) + +/* bpf_add|sub|...: a += imm, bpf_mov: a = imm */ +#define BPF_ALU64_IMM(op, a, imm) \ + ((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_K, a, 0, 0, imm}) +#define BPF_ALU32_IMM(op, a, imm) \ + ((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_K, a, 0, 0, imm}) + +/* R0 = *(uint *) (skb->data + off) */ +#define BPF_LD_ABS(size, off) \ + ((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_ABS, 0, 0, 0, off}) + +/* R0 = *(uint *) (skb->data + x + off) */ +#define BPF_LD_IND(size, x, off) \ + ((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_IND, 0, x, 0, off}) + +/* a = *(uint *) (x + off) */ +#define BPF_LDX_MEM(sz, a, x, off) \ + ((struct sock_filter_int) {BPF_LDX|BPF_SIZE(sz)|BPF_MEM, a, x, off, 0}) + +/* if (a 'op' x) goto pc+off */ +#define BPF_JMP_REG(op, a, x, off) \ + ((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_X, a, x, off, 0}) + +/* if (a 'op' imm) goto pc+off */ +#define BPF_JMP_IMM(op, a, imm, off) \ + ((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_K, a, 0, off, imm}) + +#define BPF_EXIT_INSN() \ + ((struct sock_filter_int) {BPF_JMP|BPF_EXIT, 0, 0, 0, 0}) + +static inline int size_to_bpf(int size) +{ + switch (size) { + case 1: + return BPF_B; + case 2: + return BPF_H; + case 4: + return BPF_W; + case 8: + return BPF_DW; + default: + return -EINVAL; + } +} + /* Macro to invoke filter function. 
*/ #define SK_RUN_FILTER(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) diff --git a/net/core/filter.c b/net/core/filter.c index eb020a7d6f55..9aaa05ad8fe3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -668,10 +668,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - insn->code = BPF_LDX | BPF_MEM | BPF_H; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_CTX; - insn->off = offsetof(struct sk_buff, protocol); + /* A = *(u16 *) (ctx + offsetof(protocol)) */ + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, protocol)); insn++; /* A = ntohs(A) [emitting a nop or swap16] */ @@ -681,37 +680,27 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - insn->code = BPF_LDX | BPF_MEM | BPF_B; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_CTX; - insn->off = pkt_type_offset(); + *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + pkt_type_offset()); if (insn->off < 0) return false; insn++; - insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = BPF_REG_A; - insn->imm = PKT_TYPE_MAX; + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: - if (FIELD_SIZEOF(struct sk_buff, dev) == 8) - insn->code = BPF_LDX | BPF_MEM | BPF_DW; - else - insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = BPF_REG_TMP; - insn->x_reg = BPF_REG_CTX; - insn->off = offsetof(struct sk_buff, dev); + *insn = BPF_LDX_MEM(size_to_bpf(FIELD_SIZEOF(struct sk_buff, dev)), + BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, dev)); insn++; - insn->code = BPF_JMP | BPF_JNE | BPF_K; - insn->a_reg = BPF_REG_TMP; - insn->imm = 0; - insn->off = 1; + /* if (tmp != 0) goto pc+1 */ + *insn = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); insn++; - insn->code = BPF_JMP | BPF_EXIT; + *insn = BPF_EXIT_INSN(); insn++; BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); @@ -732,55 +721,45 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_MARK: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_CTX; - insn->off = offsetof(struct sk_buff, mark); + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, mark)); break; case SKF_AD_OFF + SKF_AD_RXHASH: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_CTX; - insn->off = offsetof(struct sk_buff, hash); + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, hash)); break; case SKF_AD_OFF + SKF_AD_QUEUE: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - insn->code = BPF_LDX | BPF_MEM | BPF_H; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_CTX; - insn->off = offsetof(struct sk_buff, queue_mapping); + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - insn->code = BPF_LDX | BPF_MEM | BPF_H; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_CTX; - insn->off = offsetof(struct sk_buff, vlan_tci); + /* A = *(u16 *) (ctx + offsetof(vlan_tci)) */ + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_tci)); 
insn++; BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { - insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = BPF_REG_A; - insn->imm = ~VLAN_TAG_PRESENT; + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, + ~VLAN_TAG_PRESENT); } else { - insn->code = BPF_ALU | BPF_RSH | BPF_K; - insn->a_reg = BPF_REG_A; - insn->imm = 12; + /* A >>= 12 */ + *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); insn++; - insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = BPF_REG_A; - insn->imm = 1; + /* A &= 1 */ + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); } break; @@ -790,21 +769,15 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = ctx */ - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_ARG1; - insn->x_reg = BPF_REG_CTX; + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX); insn++; /* arg2 = A */ - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_ARG2; - insn->x_reg = BPF_REG_A; + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG2, BPF_REG_A); insn++; /* arg3 = X */ - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_ARG3; - insn->x_reg = BPF_REG_X; + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X); insn++; /* Emit call(ctx, arg2=A, arg3=X) */ @@ -829,9 +802,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_ALU_XOR_X: - insn->code = BPF_ALU | BPF_XOR | BPF_X; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_X; + /* A ^= X */ + *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); break; default: @@ -897,9 +869,7 @@ do_pass: fp = prog; if (new_insn) { - new_insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - new_insn->a_reg = BPF_REG_CTX; - new_insn->x_reg = BPF_REG_ARG1; + *new_insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_CTX, BPF_REG_ARG1); } new_insn++; @@ -1027,34 +997,28 @@ do_pass: /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_TMP; - insn->x_reg = BPF_REG_A; + /* tmp = A */ + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_TMP, BPF_REG_A); insn++; - insn->code = BPF_LD | BPF_ABS | BPF_B; - insn->a_reg = BPF_REG_A; - insn->imm = fp->k; + /* A = R0 = *(u8 *) (skb->data + K) */ + *insn = BPF_LD_ABS(BPF_B, fp->k); insn++; - insn->code = BPF_ALU | BPF_AND | BPF_K; - insn->a_reg = BPF_REG_A; - insn->imm = 0xf; + /* A &= 0xf */ + *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); insn++; - insn->code = BPF_ALU | BPF_LSH | BPF_K; - insn->a_reg = BPF_REG_A; - insn->imm = 2; + /* A <<= 2 */ + *insn = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); insn++; - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_X; - insn->x_reg = BPF_REG_A; + /* X = A */ + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_X, BPF_REG_A); insn++; - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_TMP; + /* A = tmp */ + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_A, BPF_REG_TMP); break; /* RET_K, RET_A are remaped into 2 insns. */ @@ -1068,7 +1032,7 @@ do_pass: insn->imm = fp->k; insn++; - insn->code = BPF_JMP | BPF_EXIT; + *insn = BPF_EXIT_INSN(); break; /* Store to stack. 
*/ @@ -1102,16 +1066,12 @@ do_pass: /* X = A */ case BPF_MISC | BPF_TAX: - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_X; - insn->x_reg = BPF_REG_A; + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: - insn->code = BPF_ALU64 | BPF_MOV | BPF_X; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_X; + *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ @@ -1126,10 +1086,8 @@ do_pass: /* access seccomp_data fields */ case BPF_LDX | BPF_ABS | BPF_W: - insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_CTX; - insn->off = fp->k; + /* A = *(u32 *) (ctx + K) */ + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; default: -- cgit v1.2.3-71-gd317 From 60ff746739bf805a912484643c720b6124826140 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sun, 4 May 2014 16:39:18 -0700 Subject: net: rename local_df to ignore_df MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As suggested by several people, rename local_df to ignore_df, since it means "ignore df bit if it is set". Cc: Maciej Żenczykowski Cc: Florian Westphal Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- include/net/ip.h | 6 +++--- include/net/ip6_route.h | 2 +- net/core/skbuff.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 12 ++++++------ net/ipv4/netfilter/nf_defrag_ipv4.c | 2 +- net/ipv4/xfrm4_output.c | 2 +- net/ipv6/ip6_output.c | 12 ++++++------ net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/xfrm6_output.c | 6 +++--- net/l2tp/l2tp_core.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 20 ++++++++++---------- net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/output.c | 2 +- 17 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ca0dda5a42e..7a9beeb1c458 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -426,7 +426,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority - * @local_df: allow local fragmentation + * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header @@ -514,7 +514,7 @@ struct sk_buff { }; __u32 priority; kmemcheck_bitfield_begin(flags1); - __u8 local_df:1, + __u8 ignore_df:1, cloned:1, ip_summed:2, nohdr:1, diff --git a/include/net/ip.h b/include/net/ip.h index 16146b667ddb..55752985c144 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -269,7 +269,7 @@ static inline bool ip_sk_use_pmtu(const struct sock *sk) return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE; } -static inline bool ip_sk_local_df(const struct sock *sk) +static inline bool ip_sk_ignore_df(const struct sock *sk) { return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO || inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT; @@ -304,7 +304,7 @@ static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, s { struct iphdr *iph = ip_hdr(skb); - if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { + if ((iph->frag_off & htons(IP_DF)) && 
!skb->ignore_df) { /* This is only to work around buggy Windows95/2000 * VJ compression implementations. If the ID field * does not change, they drop every other packet in @@ -320,7 +320,7 @@ static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *d { struct iphdr *iph = ip_hdr(skb); - if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { + if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { if (sk && inet_sk(sk)->inet_daddr) { iph->id = htons(inet_sk(sk)->inet_id); inet_sk(sk)->inet_id += 1 + more; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 6c4f5eac98e7..38e41e4d0998 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -185,7 +185,7 @@ static inline bool ip6_sk_accept_pmtu(const struct sock *sk) inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT; } -static inline bool ip6_sk_local_df(const struct sock *sk) +static inline bool ip6_sk_ignore_df(const struct sock *sk) { return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO || inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1b62343f5837..3d74530ae82b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -694,7 +694,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif memcpy(new->cb, old->cb, sizeof(old->cb)); new->csum = old->csum; - new->local_df = old->local_df; + new->ignore_df = old->ignore_df; new->pkt_type = old->pkt_type; new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); @@ -3913,7 +3913,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; - skb->local_df = 0; + skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; secpath_reset(skb); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 6f111e48e11c..3a83ce5efa80 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -42,7 +42,7 @@ static bool ip_may_fragment(const struct sk_buff *skb) { return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - skb->local_df; + skb->ignore_df; } static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a52f50187b54..6aa4380fde1a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -415,7 +415,7 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; @@ -501,7 +501,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); @@ -866,7 +866,7 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; + maxnonfragsize = ip_sk_ignore_df(sk) ? 
0xFFFF : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1189,7 +1189,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; if (cork->length + size > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1350,10 +1350,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - skb->local_df = ip_sk_local_df(sk); + skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. - * If local_df is set too, we still allow to fragment this frame + * If ignore_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc == IP_PMTUDISC_DO || inet->pmtudisc == IP_PMTUDISC_PROBE || diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index f40f321b41fc..b8f6381c7d0b 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -34,7 +34,7 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) if (!err) { ip_send_check(ip_hdr(skb)); - skb->local_df = 1; + skb->ignore_df = 1; } return err; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 40e701f2e1e0..8e8c018d9d2d 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) goto out; mtu = dst_mtu(skb_dst(skb)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 31a38bde69ef..ab0cc57f779c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -219,7 +219,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, skb->mark = sk->sk_mark; mtu = dst_mtu(dst); - if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { + if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, @@ -347,11 +347,11 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - /* ipv6 conntrack defrag sets max_frag_size + local_df */ + /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) return true; - if (skb->local_df) + if (skb->ignore_df) return false; if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) @@ -559,7 +559,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. 
*/ - if (unlikely(!skb->local_df && skb->len > mtu) || + if (unlikely(!skb->ignore_df && skb->len > mtu) || (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { if (skb->sk && dst_allfrag(skb_dst(skb))) @@ -1234,7 +1234,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; - if (ip6_sk_local_df(sk)) + if (ip6_sk_ignore_df(sk)) maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; else maxnonfragsize = mtu; @@ -1544,7 +1544,7 @@ int ip6_push_pending_frames(struct sock *sk) } /* Allow local fragmentation. */ - skb->local_df = ip6_sk_local_df(sk); + skb->ignore_df = ip6_sk_ignore_df(sk); *final_dst = fl6->daddr; __skb_pull(skb, skb_network_header_len(skb)); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 767ab8da8218..0d5279fd852a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -451,7 +451,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) } sub_frag_mem_limit(&fq->q, head->truesize); - head->local_df = 1; + head->ignore_df = 1; head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 19ef329bdbf8..f47c8b153dd3 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -78,7 +78,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->local_df && skb->len > mtu) { + if (!skb->ignore_df && skb->len > mtu) { skb->dev = dst->dev; if (xfrm6_local_dontfrag(skb)) @@ -120,7 +120,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) #endif skb->protocol = htons(ETH_P_IPV6); - skb->local_df = 1; + skb->ignore_df = 1; return x->outer_mode->output2(x, skb); } @@ -150,7 +150,7 @@ static int __xfrm6_output(struct sk_buff *skb) if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; - } else if (!skb->local_df && skb->len > mtu && skb->sk) { + } else if (!skb->ignore_df && skb->len > mtu && skb->sk) { xfrm_local_error(skb, mtu); return -EMSGSIZE; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index aa1a9d44c107..ed0716a075ba 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1073,7 +1073,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, } /* Queue the packet to IP for output */ - skb->local_df = 1; + skb->ignore_df = 1; #if IS_ENABLED(CONFIG_IPV6) if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped) error = inet6_csk_xmit(tunnel->sock, skb, NULL); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index c47444e4cf8c..487b55e04337 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -562,7 +562,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -590,7 +590,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -684,7 +684,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. 
*/ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -774,7 +774,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); @@ -886,7 +886,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_select_ident(skb, &rt->dst, NULL); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -974,7 +974,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -1023,7 +1023,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(ip_hdr(skb)); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -1060,7 +1060,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -1157,7 +1157,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -1249,7 +1249,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp_v6(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index ebb6e2442554..0856f014d7a9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -172,7 +172,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 21cceb3bdf78..a93efa3f64c3 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -170,7 +170,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; inet_get_local_port_range(net, &port_min, &port_max); src_port = vxlan_src_port(port_min, port_max, skb); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 2b1738ef9394..4dc5d9e08311 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -216,7 +216,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) - skb->local_df = 1; + skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); diff --git a/net/sctp/output.c b/net/sctp/output.c index 0f4d15fc2627..01ab8e0723f0 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -591,7 +591,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len); - nskb->local_df = packet->ipfragok; + nskb->ignore_df = packet->ipfragok; tp->af_specific->sctp_xmit(nskb, tp); out: -- cgit v1.2.3-71-gd317 From f06c7f9f92295faf701a9628b383156c4efb6119 Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Fri, 9 May 2014 14:58:05 +0800 Subject: vlan: rename __vlan_find_dev_deep() to __vlan_find_dev_deep_rcu() __vlan_find_dev_deep() should always be called under RCU; according to David's suggestion, renaming it to __vlan_find_dev_deep_rcu() makes that requirement explicit. Signed-off-by: Ding Tianhong Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- drivers/net/usb/cdc_mbim.c | 4 ++-- drivers/s390/net/qeth_l3_main.c | 10 +++++----- include/linux/if_vlan.h | 4 ++-- net/8021q/vlan_core.c | 6 +++--- net/bridge/br_netfilter.c | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c index c0a9dd55f4e5..b0cbb2b7fd48 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c @@ -185,7 +185,7 @@ static struct net_device *get_iff_from_mac(struct adapter *adapter, if (ether_addr_equal(dev->dev_addr, mac)) { rcu_read_lock(); if (vlan && vlan != VLAN_VID_MASK) { - dev = __vlan_find_dev_deep(dev, htons(ETH_P_8021Q), vlan); + dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), vlan); } else if (netif_is_bond_slave(dev)) { struct net_device *upper_dev; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 8efeed3325b5..0f1e886d89e3 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4068,7 +4068,7 @@ static int update_root_dev_clip(struct net_device *dev) /* Parse all bond and vlan devices layered on top of the physical dev */ for (i = 0; i < VLAN_N_VID; i++) { - root_dev = __vlan_find_dev_deep(dev, htons(ETH_P_8021Q), i); + root_dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), i); if (!root_dev) continue; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index d2e18b52caba..8a2aeb85e320 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -4146,7 +4146,7 @@ void qlcnic_restore_indev_addr(struct net_device *netdev, unsigned long event) rcu_read_lock(); for_each_set_bit(vid, adapter->vlans, VLAN_N_VID) { - dev =
__vlan_find_dev_deep(netdev, htons(ETH_P_8021Q), vid); + dev = __vlan_find_dev_deep_rcu(netdev, htons(ETH_P_8021Q), vid); if (!dev) continue; qlcnic_config_indev_addr(adapter, dev, event); diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 2e025ddcef21..0ab79fca822c 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -223,8 +223,8 @@ static void do_neigh_solicit(struct usbnet *dev, u8 *buf, u16 tci) /* need to send the NA on the VLAN dev, if any */ rcu_read_lock(); if (tci) { - netdev = __vlan_find_dev_deep(dev->net, htons(ETH_P_8021Q), - tci); + netdev = __vlan_find_dev_deep_rcu(dev->net, htons(ETH_P_8021Q), + tci); if (!netdev) { rcu_read_unlock(); return; diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index c8d91d797ec8..bc2499a24884 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1659,7 +1659,7 @@ static void qeth_l3_add_vlan_mc(struct qeth_card *card) for_each_set_bit(vid, card->active_vlans, VLAN_N_VID) { struct net_device *netdev; - netdev = __vlan_find_dev_deep(card->dev, htons(ETH_P_8021Q), + netdev = __vlan_find_dev_deep_rcu(card->dev, htons(ETH_P_8021Q), vid); if (netdev == NULL || !(netdev->flags & IFF_UP)) @@ -1721,7 +1721,7 @@ static void qeth_l3_add_vlan_mc6(struct qeth_card *card) for_each_set_bit(vid, card->active_vlans, VLAN_N_VID) { struct net_device *netdev; - netdev = __vlan_find_dev_deep(card->dev, htons(ETH_P_8021Q), + netdev = __vlan_find_dev_deep_rcu(card->dev, htons(ETH_P_8021Q), vid); if (netdev == NULL || !(netdev->flags & IFF_UP)) @@ -1766,7 +1766,7 @@ static void qeth_l3_free_vlan_addresses4(struct qeth_card *card, QETH_CARD_TEXT(card, 4, "frvaddr4"); - netdev = __vlan_find_dev_deep(card->dev, htons(ETH_P_8021Q), vid); + netdev = __vlan_find_dev_deep_rcu(card->dev, htons(ETH_P_8021Q), vid); if (!netdev) return; in_dev = in_dev_get(netdev); @@ -1796,7 +1796,7 @@ static void qeth_l3_free_vlan_addresses6(struct qeth_card *card, QETH_CARD_TEXT(card, 4, "frvaddr6"); - netdev = __vlan_find_dev_deep(card->dev, htons(ETH_P_8021Q), vid); + netdev = __vlan_find_dev_deep_rcu(card->dev, htons(ETH_P_8021Q), vid); if (!netdev) return; in6_dev = in6_dev_get(netdev); @@ -2089,7 +2089,7 @@ static int qeth_l3_verify_vlan_dev(struct net_device *dev, struct net_device *netdev; rcu_read_lock(); - netdev = __vlan_find_dev_deep(card->dev, htons(ETH_P_8021Q), + netdev = __vlan_find_dev_deep_rcu(card->dev, htons(ETH_P_8021Q), vid); rcu_read_unlock(); if (netdev == dev) { diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 13bbbde00e68..8c0fb7f3a9a5 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -106,7 +106,7 @@ struct vlan_pcpu_stats { #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) -extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev, +extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id); extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); @@ -199,7 +199,7 @@ extern void vlan_vids_del_by_dev(struct net_device *dev, extern bool vlan_uses_dev(const struct net_device *dev); #else static inline struct net_device * -__vlan_find_dev_deep(struct net_device *real_dev, +__vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { return NULL; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 
3c32bd257b73..9012b1c922b6 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -63,7 +63,7 @@ bool vlan_do_receive(struct sk_buff **skbp) } /* Must be invoked with rcu_read_lock. */ -struct net_device *__vlan_find_dev_deep(struct net_device *dev, +struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info); @@ -81,13 +81,13 @@ struct net_device *__vlan_find_dev_deep(struct net_device *dev, upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) - return __vlan_find_dev_deep(upper_dev, + return __vlan_find_dev_deep_rcu(upper_dev, vlan_proto, vlan_id); } return NULL; } -EXPORT_SYMBOL(__vlan_find_dev_deep); +EXPORT_SYMBOL(__vlan_find_dev_deep_rcu); struct net_device *vlan_dev_real_dev(const struct net_device *dev) { diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 2acf7fa1fec6..a615264cf01a 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -535,7 +535,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb)) return br; - vlan = __vlan_find_dev_deep(br, skb->vlan_proto, + vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, vlan_tx_tag_get(skb) & VLAN_VID_MASK); return vlan ? vlan : br; -- cgit v1.2.3-71-gd317 From 907abd51012b9b0b0b687e97d5ebadbddbc682fe Mon Sep 17 00:00:00 2001 From: Jaehoon Chung Date: Mon, 3 Mar 2014 11:36:43 +0900 Subject: mmc: dw_mmc: remove unused member variable. Since the move to device tree, these callback pointers have been unused, so remove them. If a set_power callback is ever needed, it can be added back in the future. Signed-off-by: Jaehoon Chung Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/dw_mmc.c | 33 +++------------------------------ include/linux/mmc/dw_mmc.h | 14 -------------- 2 files changed, 3 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 55c34cb702d4..81991eca5671 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -850,8 +850,6 @@ static void __dw_mci_start_request(struct dw_mci *host, u32 cmdflags; mrq = slot->mrq; - if (host->pdata->select_slot) - host->pdata->select_slot(slot->id); host->cur_slot = slot; host->mrq = mrq; @@ -985,17 +983,11 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) switch (ios->power_mode) { case MMC_POWER_UP: set_bit(DW_MMC_CARD_NEED_INIT, &slot->flags); - /* Power up slot */ - if (slot->host->pdata->setpower) - slot->host->pdata->setpower(slot->id, mmc->ocr_avail); regs = mci_readl(slot->host, PWREN); regs |= (1 << slot->id); mci_writel(slot->host, PWREN, regs); break; case MMC_POWER_OFF: - /* Power down slot */ - if (slot->host->pdata->setpower) - slot->host->pdata->setpower(slot->id, 0); regs = mci_readl(slot->host, PWREN); regs &= ~(1 << slot->id); mci_writel(slot->host, PWREN, regs); @@ -1009,13 +1001,10 @@ static int dw_mci_get_ro(struct mmc_host *mmc) { int read_only; struct dw_mci_slot *slot = mmc_priv(mmc); - struct dw_mci_board *brd = slot->host->pdata; /* Use platform get_ro function, else try on board write protect */ if (slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) read_only = 0; - else if (brd->get_ro) - read_only = brd->get_ro(slot->id); else if (gpio_is_valid(slot->wp_gpio)) read_only = gpio_get_value(slot->wp_gpio); else @@ -1039,8 +1028,6 @@ static int dw_mci_get_cd(struct
mmc_host *mmc) /* Use platform get_cd function, else try onboard card detect */ if (brd->quirks & DW_MCI_QUIRK_BROKEN_CARD_DETECTION) present = 1; - else if (brd->get_cd) - present = !brd->get_cd(slot->id); else if (!IS_ERR_VALUE(gpio_cd)) present = gpio_cd; else @@ -2138,17 +2125,7 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) mmc->f_max = freq[1]; } - if (host->pdata->get_ocr) - mmc->ocr_avail = host->pdata->get_ocr(id); - else - mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; - - /* - * Start with slot power disabled, it will be enabled when a card - * is detected. - */ - if (host->pdata->setpower) - host->pdata->setpower(id, 0); + mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; if (host->pdata->caps) mmc->caps = host->pdata->caps; @@ -2217,10 +2194,6 @@ err_setup_bus: static void dw_mci_cleanup_slot(struct dw_mci_slot *slot, unsigned int id) { - /* Shutdown detect IRQ */ - if (slot->host->pdata->exit) - slot->host->pdata->exit(id); - /* Debugfs stuff is cleaned up by mmc core */ mmc_remove_host(slot->mmc); slot->host->slot[id] = NULL; @@ -2395,9 +2368,9 @@ int dw_mci_probe(struct dw_mci *host) } } - if (!host->pdata->select_slot && host->pdata->num_slots > 1) { + if (host->pdata->num_slots > 1) { dev_err(host->dev, - "Platform data must supply select_slot function\n"); + "Platform data must supply num_slots.\n"); return -ENODEV; } diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 6ce7d2cd3c7a..babaea93bca6 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -248,20 +248,6 @@ struct dw_mci_board { /* delay in mS before detecting cards after interrupt */ u32 detect_delay_ms; - int (*init)(u32 slot_id, irq_handler_t , void *); - int (*get_ro)(u32 slot_id); - int (*get_cd)(u32 slot_id); - int (*get_ocr)(u32 slot_id); - int (*get_bus_wd)(u32 slot_id); - /* - * Enable power to selected slot and set voltage to desired level. - * Voltage levels are specified using MMC_VDD_xxx defines defined - * in linux/mmc/host.h file. - */ - void (*setpower)(u32 slot_id, u32 volt); - void (*exit)(u32 slot_id); - void (*select_slot)(u32 slot_id); - struct dw_mci_dma_ops *dma_ops; struct dma_pdata *data; struct block_settings *blk_settings; -- cgit v1.2.3-71-gd317 From cdc991790c51c693d0c347a5286af017826a5d01 Mon Sep 17 00:00:00 2001 From: Seungwon Jeon Date: Wed, 23 Apr 2014 17:07:35 +0900 Subject: mmc: drop the speed mode of card's state The timing mode identifier has the same role and can take the place of the speed mode, so this change removes all of the related speed-mode state. Signed-off-by: Seungwon Jeon Tested-by: Jaehoon Chung Acked-by: Jaehoon Chung Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/bus.c | 8 ++++---- drivers/mmc/core/core.c | 3 +-- drivers/mmc/core/mmc.c | 11 +++-------- drivers/mmc/core/sd.c | 16 +++------------- drivers/mmc/core/sd.h | 1 - drivers/mmc/core/sdio.c | 8 ++------ drivers/net/wireless/rsi/rsi_91x_sdio.c | 4 +--- include/linux/mmc/card.h | 23 ++++++----------------- include/linux/mmc/host.h | 23 +++++++++++++++++++++++ 9 files changed, 43 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 824644875d41..f37e9d6af84a 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -341,16 +341,16 @@ int mmc_add_card(struct mmc_card *card) if (mmc_host_is_spi(card->host)) { pr_info("%s: new %s%s%s card on SPI\n", mmc_hostname(card->host), - mmc_card_highspeed(card) ? "high speed " : "", - mmc_card_ddr_mode(card) ?
"DDR " : "", + mmc_card_hs(card) ? "high speed " : "", + mmc_card_ddr52(card) ? "DDR " : "", type); } else { pr_info("%s: new %s%s%s%s%s card at address %04x\n", mmc_hostname(card->host), mmc_card_uhs(card) ? "ultra high speed " : - (mmc_card_highspeed(card) ? "high speed " : ""), + (mmc_card_hs(card) ? "high speed " : ""), (mmc_card_hs200(card) ? "HS200 " : ""), - mmc_card_ddr_mode(card) ? "DDR " : "", + mmc_card_ddr52(card) ? "DDR " : "", uhs_bus_speed_mode, type, card->rca); } diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index d97dff5fab62..02baa30653fa 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2192,7 +2192,7 @@ int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen) { struct mmc_command cmd = {0}; - if (mmc_card_blockaddr(card) || mmc_card_ddr_mode(card)) + if (mmc_card_blockaddr(card) || mmc_card_ddr52(card)) return 0; cmd.opcode = MMC_SET_BLOCKLEN; @@ -2272,7 +2272,6 @@ static int mmc_do_hw_reset(struct mmc_host *host, int check) } } - host->card->state &= ~(MMC_STATE_HIGHSPEED | MMC_STATE_HIGHSPEED_DDR); if (mmc_host_is_spi(host)) { host->ios.chip_select = MMC_CS_HIGH; host->ios.bus_mode = MMC_BUSMODE_PUSHPULL; diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index fbcf93d81858..31220529e171 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1083,11 +1083,9 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, } else { if (card->ext_csd.hs_max_dtr > 52000000 && host->caps2 & MMC_CAP2_HS200) { - mmc_card_set_hs200(card); mmc_set_timing(card->host, MMC_TIMING_MMC_HS200); } else { - mmc_card_set_highspeed(card); mmc_set_timing(card->host, MMC_TIMING_MMC_HS); } } @@ -1098,10 +1096,10 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, */ max_dtr = (unsigned int)-1; - if (mmc_card_highspeed(card) || mmc_card_hs200(card)) { + if (mmc_card_hs(card) || mmc_card_hs200(card)) { if (max_dtr > card->ext_csd.hs_max_dtr) max_dtr = card->ext_csd.hs_max_dtr; - if (mmc_card_highspeed(card) && (max_dtr > 52000000)) + if (mmc_card_hs(card) && (max_dtr > 52000000)) max_dtr = 52000000; } else if (max_dtr > card->csd.max_dtr) { max_dtr = card->csd.max_dtr; @@ -1112,7 +1110,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, /* * Indicate DDR mode (if supported). 
*/ - if (mmc_card_highspeed(card)) { + if (mmc_card_hs(card)) { if ((card->ext_csd.card_type & EXT_CSD_CARD_TYPE_DDR_1_8V) && (host->caps & MMC_CAP_1_8V_DDR)) ddr = MMC_1_8V_DDR_MODE; @@ -1255,7 +1253,6 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, if (err) goto err; } - mmc_card_set_ddr_mode(card); mmc_set_timing(card->host, MMC_TIMING_MMC_DDR52); mmc_set_bus_width(card->host, bus_width); } @@ -1499,7 +1496,6 @@ static int _mmc_suspend(struct mmc_host *host, bool is_suspend) err = mmc_sleep(host); else if (!mmc_host_is_spi(host)) err = mmc_deselect_cards(host); - host->card->state &= ~(MMC_STATE_HIGHSPEED | MMC_STATE_HIGHSPEED_200); if (!err) { mmc_power_off(host); @@ -1629,7 +1625,6 @@ static int mmc_power_restore(struct mmc_host *host) { int ret; - host->card->state &= ~(MMC_STATE_HIGHSPEED | MMC_STATE_HIGHSPEED_200); mmc_claim_host(host); ret = mmc_init_card(host, host->card->ocr, host->card); mmc_release_host(host); diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index aef515755e5b..0c44510bf717 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -887,7 +887,7 @@ unsigned mmc_sd_get_max_clock(struct mmc_card *card) { unsigned max_dtr = (unsigned int)-1; - if (mmc_card_highspeed(card)) { + if (mmc_card_hs(card)) { if (max_dtr > card->sw_caps.hs_max_dtr) max_dtr = card->sw_caps.hs_max_dtr; } else if (max_dtr > card->csd.max_dtr) { @@ -897,12 +897,6 @@ unsigned mmc_sd_get_max_clock(struct mmc_card *card) return max_dtr; } -void mmc_sd_go_highspeed(struct mmc_card *card) -{ - mmc_card_set_highspeed(card); - mmc_set_timing(card->host, MMC_TIMING_SD_HS); -} - /* * Handle the detection and initialisation of a card. * @@ -977,16 +971,13 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr, err = mmc_sd_init_uhs_card(card); if (err) goto free_card; - - /* Card is an ultra-high-speed card */ - mmc_card_set_uhs(card); } else { /* * Attempt to change to high-speed (if supported) */ err = mmc_sd_switch_hs(card); if (err > 0) - mmc_sd_go_highspeed(card); + mmc_set_timing(card->host, MMC_TIMING_SD_HS); else if (err) goto free_card; @@ -1081,7 +1072,7 @@ static int _mmc_sd_suspend(struct mmc_host *host) if (!mmc_host_is_spi(host)) err = mmc_deselect_cards(host); - host->card->state &= ~MMC_STATE_HIGHSPEED; + if (!err) { mmc_power_off(host); mmc_card_set_suspended(host->card); @@ -1190,7 +1181,6 @@ static int mmc_sd_power_restore(struct mmc_host *host) { int ret; - host->card->state &= ~MMC_STATE_HIGHSPEED; mmc_claim_host(host); ret = mmc_sd_init_card(host, host->card->ocr, host->card); mmc_release_host(host); diff --git a/drivers/mmc/core/sd.h b/drivers/mmc/core/sd.h index 4b34b24f3f76..aab824a9a7f3 100644 --- a/drivers/mmc/core/sd.h +++ b/drivers/mmc/core/sd.h @@ -12,6 +12,5 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card, bool reinit); unsigned mmc_sd_get_max_clock(struct mmc_card *card); int mmc_sd_switch_hs(struct mmc_card *card); -void mmc_sd_go_highspeed(struct mmc_card *card); #endif diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index 9933e426bc36..e636d9e99e4a 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -363,7 +363,7 @@ static unsigned mmc_sdio_get_max_clock(struct mmc_card *card) { unsigned max_dtr; - if (mmc_card_highspeed(card)) { + if (mmc_card_hs(card)) { /* * The SDIO specification doesn't mention how * the CIS transfer speed register relates to @@ -733,7 +733,6 @@ try_again: mmc_set_clock(host, card->cis.max_dtr); if (card->cccr.high_speed) { - mmc_card_set_highspeed(card); 
mmc_set_timing(card->host, MMC_TIMING_SD_HS); } @@ -792,16 +791,13 @@ try_again: err = mmc_sdio_init_uhs_card(card); if (err) goto remove; - - /* Card is an ultra-high-speed card */ - mmc_card_set_uhs(card); } else { /* * Switch to high-speed (if supported). */ err = sdio_enable_hs(card); if (err > 0) - mmc_sd_go_highspeed(card); + mmc_set_timing(card->host, MMC_TIMING_SD_HS); else if (err) goto remove; diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio.c b/drivers/net/wireless/rsi/rsi_91x_sdio.c index 2e39d38d6a9e..46e7af446f01 100644 --- a/drivers/net/wireless/rsi/rsi_91x_sdio.c +++ b/drivers/net/wireless/rsi/rsi_91x_sdio.c @@ -285,7 +285,6 @@ static void rsi_reset_card(struct sdio_func *pfunction) if (err) { rsi_dbg(ERR_ZONE, "%s: CCCR speed reg read failed: %d\n", __func__, err); - card->state &= ~MMC_STATE_HIGHSPEED; } else { err = rsi_cmd52writebyte(card, SDIO_CCCR_SPEED, @@ -296,14 +295,13 @@ static void rsi_reset_card(struct sdio_func *pfunction) __func__, err); return; } - mmc_card_set_highspeed(card); host->ios.timing = MMC_TIMING_SD_HS; host->ops->set_ios(host, &host->ios); } } /* Set clock */ - if (mmc_card_highspeed(card)) + if (mmc_card_hs(card)) clock = 50000000; else clock = card->cis.max_dtr; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index aa7e57f60fb2..aadeaf155d0e 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -194,6 +194,7 @@ struct sdio_cis { }; struct mmc_host; +struct mmc_ios; struct sdio_func; struct sdio_func_tuple; @@ -250,15 +251,11 @@ struct mmc_card { unsigned int state; /* (our) card state */ #define MMC_STATE_PRESENT (1<<0) /* present in sysfs */ #define MMC_STATE_READONLY (1<<1) /* card is read-only */ -#define MMC_STATE_HIGHSPEED (1<<2) /* card is in high speed mode */ -#define MMC_STATE_BLOCKADDR (1<<3) /* card uses block-addressing */ -#define MMC_STATE_HIGHSPEED_DDR (1<<4) /* card is in high speed mode */ -#define MMC_STATE_ULTRAHIGHSPEED (1<<5) /* card is in ultra high speed mode */ -#define MMC_CARD_SDXC (1<<6) /* card is SDXC */ -#define MMC_CARD_REMOVED (1<<7) /* card has been removed */ -#define MMC_STATE_HIGHSPEED_200 (1<<8) /* card is in HS200 mode */ -#define MMC_STATE_DOING_BKOPS (1<<10) /* card is doing BKOPS */ -#define MMC_STATE_SUSPENDED (1<<11) /* card is suspended */ +#define MMC_STATE_BLOCKADDR (1<<2) /* card uses block-addressing */ +#define MMC_CARD_SDXC (1<<3) /* card is SDXC */ +#define MMC_CARD_REMOVED (1<<4) /* card has been removed */ +#define MMC_STATE_DOING_BKOPS (1<<5) /* card is doing BKOPS */ +#define MMC_STATE_SUSPENDED (1<<6) /* card is suspended */ unsigned int quirks; /* card quirks */ #define MMC_QUIRK_LENIENT_FN0 (1<<0) /* allow SDIO FN0 writes outside of the VS CCCR range */ #define MMC_QUIRK_BLKSZ_FOR_BYTE_MODE (1<<1) /* use func->cur_blksize */ @@ -418,11 +415,7 @@ static inline void __maybe_unused remove_quirk(struct mmc_card *card, int data) #define mmc_card_present(c) ((c)->state & MMC_STATE_PRESENT) #define mmc_card_readonly(c) ((c)->state & MMC_STATE_READONLY) -#define mmc_card_highspeed(c) ((c)->state & MMC_STATE_HIGHSPEED) -#define mmc_card_hs200(c) ((c)->state & MMC_STATE_HIGHSPEED_200) #define mmc_card_blockaddr(c) ((c)->state & MMC_STATE_BLOCKADDR) -#define mmc_card_ddr_mode(c) ((c)->state & MMC_STATE_HIGHSPEED_DDR) -#define mmc_card_uhs(c) ((c)->state & MMC_STATE_ULTRAHIGHSPEED) #define mmc_card_ext_capacity(c) ((c)->state & MMC_CARD_SDXC) #define mmc_card_removed(c) ((c) && ((c)->state & MMC_CARD_REMOVED)) #define mmc_card_doing_bkops(c) ((c)->state & 
MMC_STATE_DOING_BKOPS) @@ -430,11 +423,7 @@ static inline void __maybe_unused remove_quirk(struct mmc_card *card, int data) #define mmc_card_set_present(c) ((c)->state |= MMC_STATE_PRESENT) #define mmc_card_set_readonly(c) ((c)->state |= MMC_STATE_READONLY) -#define mmc_card_set_highspeed(c) ((c)->state |= MMC_STATE_HIGHSPEED) -#define mmc_card_set_hs200(c) ((c)->state |= MMC_STATE_HIGHSPEED_200) #define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR) -#define mmc_card_set_ddr_mode(c) ((c)->state |= MMC_STATE_HIGHSPEED_DDR) -#define mmc_card_set_uhs(c) ((c)->state |= MMC_STATE_ULTRAHIGHSPEED) #define mmc_card_set_ext_capacity(c) ((c)->state |= MMC_CARD_SDXC) #define mmc_card_set_removed(c) ((c)->state |= MMC_CARD_REMOVED) #define mmc_card_set_doing_bkops(c) ((c)->state |= MMC_STATE_DOING_BKOPS) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0cf705c83998..a43853779799 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -17,6 +17,7 @@ #include #include +#include #include struct mmc_ios { @@ -478,4 +479,26 @@ static inline unsigned int mmc_host_clk_rate(struct mmc_host *host) return host->ios.clock; } #endif + +static inline int mmc_card_hs(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_SD_HS || + card->host->ios.timing == MMC_TIMING_MMC_HS; +} + +static inline int mmc_card_uhs(struct mmc_card *card) +{ + return card->host->ios.timing >= MMC_TIMING_UHS_SDR12 && + card->host->ios.timing <= MMC_TIMING_UHS_DDR50; +} + +static inline bool mmc_card_hs200(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_MMC_HS200; +} + +static inline bool mmc_card_ddr52(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_MMC_DDR52; +} #endif /* LINUX_MMC_HOST_H */ -- cgit v1.2.3-71-gd317 From 2415c0ef618b3cd95581c7f633cbab78b29b7ab0 Mon Sep 17 00:00:00 2001 From: Seungwon Jeon Date: Wed, 23 Apr 2014 17:07:58 +0900 Subject: mmc: identify available device type to select Device types which are supported by both host and device can be identified when EXT_CSD is read. There is no need to check host's capability anymore. 
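[Editor's note: a minimal sketch of the approach described above, not the driver code verbatim; it condenses mmc_select_card_type() and its caller from the diff below, and the "avail" local is illustrative.]

	/* Once, while parsing EXT_CSD: keep only the modes both sides support. */
	unsigned int avail = 0;
	if ((host->caps2 & MMC_CAP2_HS200_1_8V_SDR) &&
	    (card->ext_csd.raw_card_type & EXT_CSD_CARD_TYPE_HS200_1_8V))
		avail |= EXT_CSD_CARD_TYPE_HS200_1_8V;
	/* ...and likewise for the other EXT_CSD_CARD_TYPE_* bits... */
	card->mmc_avail_type = avail;

	/* Later, bus speed selection tests only the combined mask: */
	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200)
		err = mmc_select_hs200(card);
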
Signed-off-by: Seungwon Jeon Tested-by: Jaehoon Chung Acked-by: Jaehoon Chung Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/mmc.c | 72 ++++++++++++++++++++++++++---------------------- include/linux/mmc/card.h | 2 +- include/linux/mmc/host.h | 6 ---- include/linux/mmc/mmc.h | 12 +++++--- 4 files changed, 48 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 31220529e171..b5691fee9629 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -243,28 +243,46 @@ static void mmc_select_card_type(struct mmc_card *card) u8 card_type = card->ext_csd.raw_card_type & EXT_CSD_CARD_TYPE_MASK; u32 caps = host->caps, caps2 = host->caps2; unsigned int hs_max_dtr = 0; + unsigned int avail_type = 0; - if (card_type & EXT_CSD_CARD_TYPE_26) + if (caps & MMC_CAP_MMC_HIGHSPEED && + card_type & EXT_CSD_CARD_TYPE_HS_26) { hs_max_dtr = MMC_HIGH_26_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_HS_26; + } if (caps & MMC_CAP_MMC_HIGHSPEED && - card_type & EXT_CSD_CARD_TYPE_52) + card_type & EXT_CSD_CARD_TYPE_HS_52) { hs_max_dtr = MMC_HIGH_52_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_HS_52; + } - if ((caps & MMC_CAP_1_8V_DDR && - card_type & EXT_CSD_CARD_TYPE_DDR_1_8V) || - (caps & MMC_CAP_1_2V_DDR && - card_type & EXT_CSD_CARD_TYPE_DDR_1_2V)) + if (caps & MMC_CAP_1_8V_DDR && + card_type & EXT_CSD_CARD_TYPE_DDR_1_8V) { hs_max_dtr = MMC_HIGH_DDR_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_DDR_1_8V; + } + + if (caps & MMC_CAP_1_2V_DDR && + card_type & EXT_CSD_CARD_TYPE_DDR_1_2V) { + hs_max_dtr = MMC_HIGH_DDR_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_DDR_1_2V; + } - if ((caps2 & MMC_CAP2_HS200_1_8V_SDR && - card_type & EXT_CSD_CARD_TYPE_SDR_1_8V) || - (caps2 & MMC_CAP2_HS200_1_2V_SDR && - card_type & EXT_CSD_CARD_TYPE_SDR_1_2V)) + if (caps2 & MMC_CAP2_HS200_1_8V_SDR && + card_type & EXT_CSD_CARD_TYPE_HS200_1_8V) { hs_max_dtr = MMC_HS200_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_HS200_1_8V; + } + + if (caps2 & MMC_CAP2_HS200_1_2V_SDR && + card_type & EXT_CSD_CARD_TYPE_HS200_1_2V) { + hs_max_dtr = MMC_HS200_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_HS200_1_2V; + } card->ext_csd.hs_max_dtr = hs_max_dtr; - card->ext_csd.card_type = card_type; + card->mmc_avail_type = avail_type; } /* @@ -800,12 +818,10 @@ static int mmc_select_hs200(struct mmc_card *card) host = card->host; - if (card->ext_csd.card_type & EXT_CSD_CARD_TYPE_SDR_1_2V && - host->caps2 & MMC_CAP2_HS200_1_2V_SDR) + if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_2V) err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120); - if (err && card->ext_csd.card_type & EXT_CSD_CARD_TYPE_SDR_1_8V && - host->caps2 & MMC_CAP2_HS200_1_8V_SDR) + if (err && card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_8V) err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180); /* If fails try again during next card power cycle */ @@ -1064,10 +1080,9 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, */ if (card->ext_csd.hs_max_dtr != 0) { err = 0; - if (card->ext_csd.hs_max_dtr > 52000000 && - host->caps2 & MMC_CAP2_HS200) + if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200) err = mmc_select_hs200(card); - else if (host->caps & MMC_CAP_MMC_HIGHSPEED) + else if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS) err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, EXT_CSD_HS_TIMING, 1, card->ext_csd.generic_cmd6_time, @@ -1081,13 +1096,11 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, mmc_hostname(card->host)); err = 0; } else { - if 
(card->ext_csd.hs_max_dtr > 52000000 && - host->caps2 & MMC_CAP2_HS200) { + if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200) mmc_set_timing(card->host, MMC_TIMING_MMC_HS200); - } else { + else mmc_set_timing(card->host, MMC_TIMING_MMC_HS); - } } } @@ -1110,14 +1123,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, /* * Indicate DDR mode (if supported). */ - if (mmc_card_hs(card)) { - if ((card->ext_csd.card_type & EXT_CSD_CARD_TYPE_DDR_1_8V) - && (host->caps & MMC_CAP_1_8V_DDR)) - ddr = MMC_1_8V_DDR_MODE; - else if ((card->ext_csd.card_type & EXT_CSD_CARD_TYPE_DDR_1_2V) - && (host->caps & MMC_CAP_1_2V_DDR)) - ddr = MMC_1_2V_DDR_MODE; - } + if (mmc_card_hs(card)) + ddr = card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_52; /* * Indicate HS200 SDR mode (if supported). @@ -1137,8 +1144,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, * 3. set the clock to > 52Mhz <=200MHz and * 4. execute tuning for HS200 */ - if ((host->caps2 & MMC_CAP2_HS200) && - card->host->ops->execute_tuning) { + if (card->host->ops->execute_tuning) { mmc_host_clk_hold(card->host); err = card->host->ops->execute_tuning(card->host, MMC_SEND_TUNING_BLOCK_HS200); @@ -1247,7 +1253,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, * * WARNING: eMMC rules are NOT the same as SD DDR */ - if (ddr == MMC_1_2V_DDR_MODE) { + if (ddr & EXT_CSD_CARD_TYPE_DDR_1_2V) { err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120); if (err) diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index aadeaf155d0e..fe31f8d89a03 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -68,7 +68,6 @@ struct mmc_ext_csd { #define MMC_HIGH_DDR_MAX_DTR 52000000 #define MMC_HS200_MAX_DTR 200000000 unsigned int sectors; - unsigned int card_type; unsigned int hc_erase_size; /* In sectors */ unsigned int hc_erase_timeout; /* In milliseconds */ unsigned int sec_trim_mult; /* Secure trim multiplier */ @@ -298,6 +297,7 @@ struct mmc_card { struct sdio_func_tuple *tuples; /* unknown common tuples */ unsigned int sd_bus_speed; /* Bus Speed Mode set for the card */ + unsigned int mmc_avail_type; /* supported device type by both host and card */ struct dentry *debugfs_root; struct mmc_part part[MMC_NUM_PHY_PARTITION]; /* physical partitions */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index a43853779799..6b1e9ee6ca10 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -62,12 +62,6 @@ struct mmc_ios { #define MMC_TIMING_MMC_DDR52 8 #define MMC_TIMING_MMC_HS200 9 -#define MMC_SDR_MODE 0 -#define MMC_1_2V_DDR_MODE 1 -#define MMC_1_8V_DDR_MODE 2 -#define MMC_1_2V_SDR_MODE 3 -#define MMC_1_8V_SDR_MODE 4 - unsigned char signal_voltage; /* signalling voltage (1.8V or 3.3V) */ #define MMC_SIGNAL_VOLTAGE_330 0 diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 50bcde3677ca..f734c0c64575 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -354,18 +354,22 @@ struct _mmc_csd { #define EXT_CSD_CMD_SET_SECURE (1<<1) #define EXT_CSD_CMD_SET_CPSECURE (1<<2) -#define EXT_CSD_CARD_TYPE_26 (1<<0) /* Card can run at 26MHz */ -#define EXT_CSD_CARD_TYPE_52 (1<<1) /* Card can run at 52MHz */ #define EXT_CSD_CARD_TYPE_MASK 0x3F /* Mask out reserved bits */ +#define EXT_CSD_CARD_TYPE_HS_26 (1<<0) /* Card can run at 26MHz */ +#define EXT_CSD_CARD_TYPE_HS_52 (1<<1) /* Card can run at 52MHz */ +#define EXT_CSD_CARD_TYPE_HS (EXT_CSD_CARD_TYPE_HS_26 | \ + EXT_CSD_CARD_TYPE_HS_52) #define EXT_CSD_CARD_TYPE_DDR_1_8V (1<<2) /* Card can run at 52MHz 
*/ /* DDR mode @1.8V or 3V I/O */ #define EXT_CSD_CARD_TYPE_DDR_1_2V (1<<3) /* Card can run at 52MHz */ /* DDR mode @1.2V I/O */ #define EXT_CSD_CARD_TYPE_DDR_52 (EXT_CSD_CARD_TYPE_DDR_1_8V \ | EXT_CSD_CARD_TYPE_DDR_1_2V) -#define EXT_CSD_CARD_TYPE_SDR_1_8V (1<<4) /* Card can run at 200MHz */ -#define EXT_CSD_CARD_TYPE_SDR_1_2V (1<<5) /* Card can run at 200MHz */ +#define EXT_CSD_CARD_TYPE_HS200_1_8V (1<<4) /* Card can run at 200MHz */ +#define EXT_CSD_CARD_TYPE_HS200_1_2V (1<<5) /* Card can run at 200MHz */ /* SDR mode @1.2V I/O */ +#define EXT_CSD_CARD_TYPE_HS200 (EXT_CSD_CARD_TYPE_HS200_1_8V | \ + EXT_CSD_CARD_TYPE_HS200_1_2V) #define EXT_CSD_BUS_WIDTH_1 0 /* Card is in 1 bit mode */ #define EXT_CSD_BUS_WIDTH_4 1 /* Card is in 4 bit mode */ -- cgit v1.2.3-71-gd317 From 577fb13199b11d8cd75609183649be4b5561243f Mon Sep 17 00:00:00 2001 From: Seungwon Jeon Date: Wed, 23 Apr 2014 17:08:44 +0900 Subject: mmc: rework selection of bus speed mode The current implementation of bus speed mode selection is too complicated. This patch simplifies the code and removes some duplicated parts. The changes include the following: * Add a function for each mode selection (HS, HS-DDR, HS200, etc.) * Rearrange the mode selection sequence according to the supported device type * Add a maximum speed for HS200 mode (hs200_max_dtr) * Add field definitions for HS_TIMING of EXT_CSD Signed-off-by: Seungwon Jeon Tested-by: Jaehoon Chung Acked-by: Jaehoon Chung Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/debugfs.c | 2 +- drivers/mmc/core/mmc.c | 431 ++++++++++++++++++++++++--------------------- include/linux/mmc/card.h | 1 + include/linux/mmc/mmc.h | 4 + 4 files changed, 238 insertions(+), 200 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 509229b48b55..1f730dbfaeea 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -139,7 +139,7 @@ static int mmc_ios_show(struct seq_file *s, void *data) str = "mmc DDR52"; break; case MMC_TIMING_MMC_HS200: - str = "mmc high-speed SDR200"; + str = "mmc HS200"; break; default: str = "invalid"; break; diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 4538541ac5ab..bec6786efd19 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -242,7 +242,7 @@ static void mmc_select_card_type(struct mmc_card *card) struct mmc_host *host = card->host; u8 card_type = card->ext_csd.raw_card_type & EXT_CSD_CARD_TYPE_MASK; u32 caps = host->caps, caps2 = host->caps2; - unsigned int hs_max_dtr = 0; + unsigned int hs_max_dtr = 0, hs200_max_dtr = 0; unsigned int avail_type = 0; if (caps & MMC_CAP_MMC_HIGHSPEED && @@ -271,17 +271,18 @@ static void mmc_select_card_type(struct mmc_card *card) if (caps2 & MMC_CAP2_HS200_1_8V_SDR && card_type & EXT_CSD_CARD_TYPE_HS200_1_8V) { - hs_max_dtr = MMC_HS200_MAX_DTR; + hs200_max_dtr = MMC_HS200_MAX_DTR; avail_type |= EXT_CSD_CARD_TYPE_HS200_1_8V; } if (caps2 & MMC_CAP2_HS200_1_2V_SDR && card_type & EXT_CSD_CARD_TYPE_HS200_1_2V) { - hs_max_dtr = MMC_HS200_MAX_DTR; + hs200_max_dtr = MMC_HS200_MAX_DTR; avail_type |= EXT_CSD_CARD_TYPE_HS200_1_2V; } card->ext_csd.hs_max_dtr = hs_max_dtr; + card->ext_csd.hs200_max_dtr = hs200_max_dtr; card->mmc_avail_type = avail_type; } @@ -825,37 +826,46 @@ static int mmc_select_powerclass(struct mmc_card *card) } /* - * Selects the desired buswidth and switch to the HS200 mode - * if bus width set without error + * Set the bus speed for the selected speed mode.
*/ -static int mmc_select_hs200(struct mmc_card *card) +static void mmc_set_bus_speed(struct mmc_card *card) +{ + unsigned int max_dtr = (unsigned int)-1; + + if (mmc_card_hs200(card) && max_dtr > card->ext_csd.hs200_max_dtr) + max_dtr = card->ext_csd.hs200_max_dtr; + else if (mmc_card_hs(card) && max_dtr > card->ext_csd.hs_max_dtr) + max_dtr = card->ext_csd.hs_max_dtr; + else if (max_dtr > card->csd.max_dtr) + max_dtr = card->csd.max_dtr; + + mmc_set_clock(card->host, max_dtr); +} + +/* + * Select the bus width amoung 4-bit and 8-bit(SDR). + * If the bus width is changed successfully, return the selected width value. + * Zero is returned instead of error value if the wide width is not supported. + */ +static int mmc_select_bus_width(struct mmc_card *card) { - int idx, err = -EINVAL; - struct mmc_host *host; static unsigned ext_csd_bits[] = { - EXT_CSD_BUS_WIDTH_4, EXT_CSD_BUS_WIDTH_8, + EXT_CSD_BUS_WIDTH_4, }; static unsigned bus_widths[] = { - MMC_BUS_WIDTH_4, MMC_BUS_WIDTH_8, + MMC_BUS_WIDTH_4, }; + struct mmc_host *host = card->host; + unsigned idx, bus_width = 0; + int err = 0; - BUG_ON(!card); - - host = card->host; - - if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_2V) - err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120); - - if (err && card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_8V) - err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180); - - /* If fails try again during next card power cycle */ - if (err) - goto err; + if ((card->csd.mmca_vsn < CSD_SPEC_VER_4) && + !(host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA))) + return 0; - idx = (host->caps & MMC_CAP_8_BIT_DATA) ? 1 : 0; + idx = (host->caps & MMC_CAP_8_BIT_DATA) ? 0 : 1; /* * Unlike SD, MMC cards dont have a configuration register to notify @@ -863,8 +873,7 @@ static int mmc_select_hs200(struct mmc_card *card) * the supported bus width or compare the ext csd values of current * bus width and ext csd values of 1 bit mode read earlier. */ - for (; idx >= 0; idx--) { - + for (; idx < ARRAY_SIZE(bus_widths); idx++) { /* * Host is capable of 8bit transfer, then switch * the device to work in 8bit transfer mode. If the @@ -879,26 +888,201 @@ static int mmc_select_hs200(struct mmc_card *card) if (err) continue; - mmc_set_bus_width(card->host, bus_widths[idx]); + bus_width = bus_widths[idx]; + mmc_set_bus_width(host, bus_width); + /* + * If controller can't handle bus width test, + * compare ext_csd previously read in 1 bit mode + * against ext_csd at new bus width + */ if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST)) - err = mmc_compare_ext_csds(card, bus_widths[idx]); + err = mmc_compare_ext_csds(card, bus_width); else - err = mmc_bus_test(card, bus_widths[idx]); - if (!err) + err = mmc_bus_test(card, bus_width); + + if (!err) { + err = bus_width; break; + } else { + pr_warn("%s: switch to bus width %d failed\n", + mmc_hostname(host), ext_csd_bits[idx]); + } } - /* switch to HS200 mode if bus width set successfully */ + return err; +} + +/* + * Switch to the high-speed mode + */ +static int mmc_select_hs(struct mmc_card *card) +{ + int err; + + err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS, + card->ext_csd.generic_cmd6_time, + true, true, true); if (!err) + mmc_set_timing(card->host, MMC_TIMING_MMC_HS); + + return err; +} + +/* + * Activate wide bus and DDR if supported. 
+ */ +static int mmc_select_hs_ddr(struct mmc_card *card) +{ + struct mmc_host *host = card->host; + u32 bus_width, ext_csd_bits; + int err = 0; + + if (!(card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_52)) + return 0; + + bus_width = host->ios.bus_width; + if (bus_width == MMC_BUS_WIDTH_1) + return 0; + + ext_csd_bits = (bus_width == MMC_BUS_WIDTH_8) ? + EXT_CSD_DDR_BUS_WIDTH_8 : EXT_CSD_DDR_BUS_WIDTH_4; + + err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_BUS_WIDTH, + ext_csd_bits, + card->ext_csd.generic_cmd6_time); + if (err) { + pr_warn("%s: switch to bus width %d ddr failed\n", + mmc_hostname(host), 1 << bus_width); + return err; + } + + /* + * eMMC cards can support 3.3V to 1.2V i/o (vccq) + * signaling. + * + * EXT_CSD_CARD_TYPE_DDR_1_8V means 3.3V or 1.8V vccq. + * + * 1.8V vccq at 3.3V core voltage (vcc) is not required + * in the JEDEC spec for DDR. + * + * Do not force change in vccq since we are obviously + * working and no change to vccq is needed. + * + * WARNING: eMMC rules are NOT the same as SD DDR + */ + if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_1_2V) { + err = __mmc_set_signal_voltage(host, + MMC_SIGNAL_VOLTAGE_120); + if (err) + return err; + } + + mmc_set_timing(host, MMC_TIMING_MMC_DDR52); + + return err; +} + +/* + * For device supporting HS200 mode, the following sequence + * should be done before executing the tuning process. + * 1. set the desired bus width(4-bit or 8-bit, 1-bit is not supported) + * 2. switch to HS200 mode + * 3. set the clock to > 52Mhz and <=200MHz + */ +static int mmc_select_hs200(struct mmc_card *card) +{ + struct mmc_host *host = card->host; + int err = -EINVAL; + + if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_2V) + err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120); + + if (err && card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_8V) + err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180); + + /* If fails try again during next card power cycle */ + if (err) + goto err; + + /* + * Set the bus width(4 or 8) with host's support and + * switch to HS200 mode if bus width is set successfully. + */ + err = mmc_select_bus_width(card); + if (!IS_ERR_VALUE(err)) { err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_HS_TIMING, 2, - card->ext_csd.generic_cmd6_time, - true, true, true); + EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS200, + card->ext_csd.generic_cmd6_time, + true, true, true); + if (!err) + mmc_set_timing(host, MMC_TIMING_MMC_HS200); + } err: return err; } +/* + * Activate High Speed or HS200 mode if supported. + */ +static int mmc_select_timing(struct mmc_card *card) +{ + int err = 0; + + if ((card->csd.mmca_vsn < CSD_SPEC_VER_4 && + card->ext_csd.hs_max_dtr == 0)) + goto bus_speed; + + if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200) + err = mmc_select_hs200(card); + else if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS) + err = mmc_select_hs(card); + + if (err && err != -EBADMSG) + return err; + + if (err) { + pr_warn("%s: switch to %s failed\n", + mmc_card_hs(card) ? "high-speed" : + (mmc_card_hs200(card) ? "hs200" : ""), + mmc_hostname(card->host)); + err = 0; + } + +bus_speed: + /* + * Set the bus speed to the selected bus timing. + * If timing is not selected, backward compatible is the default. + */ + mmc_set_bus_speed(card); + return err; +} + +/* + * Execute tuning sequence to seek the proper bus operating + * conditions for HS200, which sends CMD21 to the device. 
+ */ +static int mmc_hs200_tuning(struct mmc_card *card) +{ + struct mmc_host *host = card->host; + int err = 0; + + if (host->ops->execute_tuning) { + mmc_host_clk_hold(host); + err = host->ops->execute_tuning(host, + MMC_SEND_TUNING_BLOCK_HS200); + mmc_host_clk_release(host); + + if (err) + pr_warn("%s: tuning execution failed\n", + mmc_hostname(host)); + } + + return err; +} + /* * Handle the detection and initialisation of a card. * @@ -909,9 +1093,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, struct mmc_card *oldcard) { struct mmc_card *card; - int err, ddr = 0; + int err; u32 cid[4]; - unsigned int max_dtr; u32 rocr; u8 *ext_csd = NULL; @@ -1103,173 +1286,23 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, } /* - * Activate high speed (if supported) - */ - if (card->ext_csd.hs_max_dtr != 0) { - err = 0; - if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200) - err = mmc_select_hs200(card); - else if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS) - err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_HS_TIMING, 1, - card->ext_csd.generic_cmd6_time, - true, true, true); - - if (err && err != -EBADMSG) - goto free_card; - - if (err) { - pr_warning("%s: switch to highspeed failed\n", - mmc_hostname(card->host)); - err = 0; - } else { - if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200) - mmc_set_timing(card->host, - MMC_TIMING_MMC_HS200); - else - mmc_set_timing(card->host, MMC_TIMING_MMC_HS); - } - } - - /* - * Compute bus speed. - */ - max_dtr = (unsigned int)-1; - - if (mmc_card_hs(card) || mmc_card_hs200(card)) { - if (max_dtr > card->ext_csd.hs_max_dtr) - max_dtr = card->ext_csd.hs_max_dtr; - if (mmc_card_hs(card) && (max_dtr > 52000000)) - max_dtr = 52000000; - } else if (max_dtr > card->csd.max_dtr) { - max_dtr = card->csd.max_dtr; - } - - mmc_set_clock(host, max_dtr); - - /* - * Indicate DDR mode (if supported). + * Select timing interface */ - if (mmc_card_hs(card)) - ddr = card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_52; + err = mmc_select_timing(card); + if (err) + goto free_card; - /* - * Indicate HS200 SDR mode (if supported). - */ if (mmc_card_hs200(card)) { - u32 ext_csd_bits; - u32 bus_width = card->host->ios.bus_width; - - /* - * For devices supporting HS200 mode, the bus width has - * to be set before executing the tuning function. If - * set before tuning, then device will respond with CRC - * errors for responses on CMD line. So for HS200 the - * sequence will be - * 1. set bus width 4bit / 8 bit (1 bit not supported) - * 2. switch to HS200 mode - * 3. set the clock to > 52Mhz <=200MHz and - * 4. execute tuning for HS200 - */ - if (card->host->ops->execute_tuning) { - mmc_host_clk_hold(card->host); - err = card->host->ops->execute_tuning(card->host, - MMC_SEND_TUNING_BLOCK_HS200); - mmc_host_clk_release(card->host); - } - if (err) { - pr_warning("%s: tuning execution failed\n", - mmc_hostname(card->host)); + err = mmc_hs200_tuning(card); + if (err) goto err; - } - - ext_csd_bits = (bus_width == MMC_BUS_WIDTH_8) ? - EXT_CSD_BUS_WIDTH_8 : EXT_CSD_BUS_WIDTH_4; - } - - /* - * Activate wide bus and DDR (if supported). 
- */ - if (!mmc_card_hs200(card) && - (card->csd.mmca_vsn >= CSD_SPEC_VER_4) && - (host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA))) { - static unsigned ext_csd_bits[][2] = { - { EXT_CSD_BUS_WIDTH_8, EXT_CSD_DDR_BUS_WIDTH_8 }, - { EXT_CSD_BUS_WIDTH_4, EXT_CSD_DDR_BUS_WIDTH_4 }, - { EXT_CSD_BUS_WIDTH_1, EXT_CSD_BUS_WIDTH_1 }, - }; - static unsigned bus_widths[] = { - MMC_BUS_WIDTH_8, - MMC_BUS_WIDTH_4, - MMC_BUS_WIDTH_1 - }; - unsigned idx, bus_width = 0; - - if (host->caps & MMC_CAP_8_BIT_DATA) - idx = 0; - else - idx = 1; - for (; idx < ARRAY_SIZE(bus_widths); idx++) { - bus_width = bus_widths[idx]; - if (bus_width == MMC_BUS_WIDTH_1) - ddr = 0; /* no DDR for 1-bit width */ - - err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_BUS_WIDTH, - ext_csd_bits[idx][0], - card->ext_csd.generic_cmd6_time); - if (!err) { - mmc_set_bus_width(card->host, bus_width); - - /* - * If controller can't handle bus width test, - * compare ext_csd previously read in 1 bit mode - * against ext_csd at new bus width - */ - if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST)) - err = mmc_compare_ext_csds(card, - bus_width); - else - err = mmc_bus_test(card, bus_width); - if (!err) - break; - } - } - - if (!err && ddr) { - err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_BUS_WIDTH, - ext_csd_bits[idx][1], - card->ext_csd.generic_cmd6_time); - } - if (err) { - pr_warning("%s: switch to bus width %d ddr %d " - "failed\n", mmc_hostname(card->host), - 1 << bus_width, ddr); - goto free_card; - } else if (ddr) { - /* - * eMMC cards can support 3.3V to 1.2V i/o (vccq) - * signaling. - * - * EXT_CSD_CARD_TYPE_DDR_1_8V means 3.3V or 1.8V vccq. - * - * 1.8V vccq at 3.3V core voltage (vcc) is not required - * in the JEDEC spec for DDR. - * - * Do not force change in vccq since we are obviously - * working and no change to vccq is needed. 
- * - * WARNING: eMMC rules are NOT the same as SD DDR - */ - if (ddr & EXT_CSD_CARD_TYPE_DDR_1_2V) { - err = __mmc_set_signal_voltage(host, - MMC_SIGNAL_VOLTAGE_120); - if (err) - goto err; - } - mmc_set_timing(card->host, MMC_TIMING_MMC_DDR52); - mmc_set_bus_width(card->host, bus_width); + } else if (mmc_card_hs(card)) { + /* Select the desired bus width optionally */ + err = mmc_select_bus_width(card); + if (!IS_ERR_VALUE(err)) { + err = mmc_select_hs_ddr(card); + if (err) + goto err; } } diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index fe31f8d89a03..176073692872 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -63,6 +63,7 @@ struct mmc_ext_csd { unsigned int power_off_longtime; /* Units: ms */ u8 power_off_notification; /* state */ unsigned int hs_max_dtr; + unsigned int hs200_max_dtr; #define MMC_HIGH_26_MAX_DTR 26000000 #define MMC_HIGH_52_MAX_DTR 52000000 #define MMC_HIGH_DDR_MAX_DTR 52000000 diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index f734c0c64575..f429f13be433 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -377,6 +377,10 @@ struct _mmc_csd { #define EXT_CSD_DDR_BUS_WIDTH_4 5 /* Card is in 4 bit DDR mode */ #define EXT_CSD_DDR_BUS_WIDTH_8 6 /* Card is in 8 bit DDR mode */ +#define EXT_CSD_TIMING_BC 0 /* Backwards compatility */ +#define EXT_CSD_TIMING_HS 1 /* High speed */ +#define EXT_CSD_TIMING_HS200 2 /* HS200 */ #define EXT_CSD_SEC_ER_EN BIT(0) #define EXT_CSD_SEC_BD_BLK_EN BIT(2) #define EXT_CSD_SEC_GB_CL_EN BIT(4) -- cgit v1.2.3-71-gd317 From 0a5b6438ee482696360bb013e67b8488f63d3e9e Mon Sep 17 00:00:00 2001 From: Seungwon Jeon Date: Wed, 23 Apr 2014 17:14:58 +0900 Subject: mmc: add support for HS400 mode of eMMC5.0 This patch adds HS400 mode support for eMMC 5.0 devices. HS400 mode is a high-speed DDR interface timing derived from HS200. The clock frequency is up to 200MHz and only an 8-bit bus width is supported. In addition, the HS200 tuning process is required to synchronize the command response on the CMD line, because the CMD input timing for HS400 mode is the same as for HS200 mode. Signed-off-by: Seungwon Jeon Reviewed-by: Jackey Shen Tested-by: Jaehoon Chung Acked-by: Jaehoon Chung Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/bus.c | 1 + drivers/mmc/core/debugfs.c | 3 ++ drivers/mmc/core/mmc.c | 98 +++++++++++++++++++++++++++++++++++++++++++--- include/linux/mmc/card.h | 1 + include/linux/mmc/host.h | 14 +++++++ include/linux/mmc/mmc.h | 7 +++- 6 files changed, 118 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index f37e9d6af84a..d2dbf02022bd 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -349,6 +349,7 @@ int mmc_add_card(struct mmc_card *card) mmc_hostname(card->host), mmc_card_uhs(card) ? "ultra high speed " : (mmc_card_hs(card) ? "high speed " : ""), + mmc_card_hs400(card) ? "HS400 " : (mmc_card_hs200(card) ? "HS200 " : ""), mmc_card_ddr52(card) ?
"DDR " : "", uhs_bus_speed_mode, type, card->rca); diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 1f730dbfaeea..91eb16223246 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -141,6 +141,9 @@ static int mmc_ios_show(struct seq_file *s, void *data) case MMC_TIMING_MMC_HS200: str = "mmc HS200"; break; + case MMC_TIMING_MMC_HS400: + str = "mmc HS400"; + break; default: str = "invalid"; break; diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index bec6786efd19..793c6f7ddb04 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -240,7 +240,7 @@ static int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd) static void mmc_select_card_type(struct mmc_card *card) { struct mmc_host *host = card->host; - u8 card_type = card->ext_csd.raw_card_type & EXT_CSD_CARD_TYPE_MASK; + u8 card_type = card->ext_csd.raw_card_type; u32 caps = host->caps, caps2 = host->caps2; unsigned int hs_max_dtr = 0, hs200_max_dtr = 0; unsigned int avail_type = 0; @@ -281,6 +281,18 @@ static void mmc_select_card_type(struct mmc_card *card) avail_type |= EXT_CSD_CARD_TYPE_HS200_1_2V; } + if (caps2 & MMC_CAP2_HS400_1_8V && + card_type & EXT_CSD_CARD_TYPE_HS400_1_8V) { + hs200_max_dtr = MMC_HS200_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_HS400_1_8V; + } + + if (caps2 & MMC_CAP2_HS400_1_2V && + card_type & EXT_CSD_CARD_TYPE_HS400_1_2V) { + hs200_max_dtr = MMC_HS200_MAX_DTR; + avail_type |= EXT_CSD_CARD_TYPE_HS400_1_2V; + } + card->ext_csd.hs_max_dtr = hs_max_dtr; card->ext_csd.hs200_max_dtr = hs200_max_dtr; card->mmc_avail_type = avail_type; @@ -499,6 +511,8 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd) ext_csd[EXT_CSD_PWR_CL_DDR_52_195]; card->ext_csd.raw_pwr_cl_ddr_52_360 = ext_csd[EXT_CSD_PWR_CL_DDR_52_360]; + card->ext_csd.raw_pwr_cl_ddr_200_360 = + ext_csd[EXT_CSD_PWR_CL_DDR_200_360]; } if (card->ext_csd.rev >= 5) { @@ -665,7 +679,10 @@ static int mmc_compare_ext_csds(struct mmc_card *card, unsigned bus_width) (card->ext_csd.raw_pwr_cl_ddr_52_195 == bw_ext_csd[EXT_CSD_PWR_CL_DDR_52_195]) && (card->ext_csd.raw_pwr_cl_ddr_52_360 == - bw_ext_csd[EXT_CSD_PWR_CL_DDR_52_360])); + bw_ext_csd[EXT_CSD_PWR_CL_DDR_52_360]) && + (card->ext_csd.raw_pwr_cl_ddr_200_360 == + bw_ext_csd[EXT_CSD_PWR_CL_DDR_200_360])); + if (err) err = -EINVAL; @@ -768,7 +785,9 @@ static int __mmc_select_powerclass(struct mmc_card *card, ext_csd->raw_pwr_cl_52_360 : ext_csd->raw_pwr_cl_ddr_52_360; else if (host->ios.clock <= MMC_HS200_MAX_DTR) - pwrclass_val = ext_csd->raw_pwr_cl_200_360; + pwrclass_val = (bus_width == EXT_CSD_DDR_BUS_WIDTH_8) ? 
+ ext_csd->raw_pwr_cl_ddr_200_360 : + ext_csd->raw_pwr_cl_200_360; break; default: pr_warning("%s: Voltage range not supported " @@ -832,7 +851,8 @@ static void mmc_set_bus_speed(struct mmc_card *card) { unsigned int max_dtr = (unsigned int)-1; - if (mmc_card_hs200(card) && max_dtr > card->ext_csd.hs200_max_dtr) + if ((mmc_card_hs200(card) || mmc_card_hs400(card)) && + max_dtr > card->ext_csd.hs200_max_dtr) max_dtr = card->ext_csd.hs200_max_dtr; else if (mmc_card_hs(card) && max_dtr > card->ext_csd.hs_max_dtr) max_dtr = card->ext_csd.hs_max_dtr; @@ -985,6 +1005,61 @@ static int mmc_select_hs_ddr(struct mmc_card *card) return err; } +static int mmc_select_hs400(struct mmc_card *card) +{ + struct mmc_host *host = card->host; + int err = 0; + + /* + * HS400 mode requires 8-bit bus width + */ + if (!(card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS400 && + host->ios.bus_width == MMC_BUS_WIDTH_8)) + return 0; + + /* + * Before switching to dual data rate operation for HS400, + * it is required to convert from HS200 mode to HS mode. + */ + mmc_set_timing(card->host, MMC_TIMING_MMC_HS); + mmc_set_bus_speed(card); + + err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS, + card->ext_csd.generic_cmd6_time, + true, true, true); + if (err) { + pr_warn("%s: switch to high-speed from hs200 failed, err:%d\n", + mmc_hostname(host), err); + return err; + } + + err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_BUS_WIDTH, + EXT_CSD_DDR_BUS_WIDTH_8, + card->ext_csd.generic_cmd6_time); + if (err) { + pr_warn("%s: switch to bus width for hs400 failed, err:%d\n", + mmc_hostname(host), err); + return err; + } + + err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS400, + card->ext_csd.generic_cmd6_time, + true, true, true); + if (err) { + pr_warn("%s: switch to hs400 failed, err:%d\n", + mmc_hostname(host), err); + return err; + } + + mmc_set_timing(host, MMC_TIMING_MMC_HS400); + mmc_set_bus_speed(card); + + return 0; +} + /* * For device supporting HS200 mode, the following sequence * should be done before executing the tuning process. @@ -1062,13 +1137,22 @@ bus_speed: /* * Execute tuning sequence to seek the proper bus operating - * conditions for HS200, which sends CMD21 to the device. + * conditions for HS200 and HS400, which sends CMD21 to the device. 
*/ static int mmc_hs200_tuning(struct mmc_card *card) { struct mmc_host *host = card->host; int err = 0; + /* + * Timing should be adjusted to the HS400 target + * operation frequency for tuning process + */ + if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS400 && + host->ios.bus_width == MMC_BUS_WIDTH_8) + if (host->ops->prepare_hs400_tuning) + host->ops->prepare_hs400_tuning(host, &host->ios); + if (host->ops->execute_tuning) { mmc_host_clk_hold(host); err = host->ops->execute_tuning(host, @@ -1296,6 +1380,10 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, err = mmc_hs200_tuning(card); if (err) goto err; + + err = mmc_select_hs400(card); + if (err) + goto err; } else if (mmc_card_hs(card)) { /* Select the desired bus width optionally */ err = mmc_select_bus_width(card); diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 176073692872..d424b9de3aff 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -110,6 +110,7 @@ struct mmc_ext_csd { u8 raw_pwr_cl_200_360; /* 237 */ u8 raw_pwr_cl_ddr_52_195; /* 238 */ u8 raw_pwr_cl_ddr_52_360; /* 239 */ + u8 raw_pwr_cl_ddr_200_360; /* 253 */ u8 raw_bkops_status; /* 246 */ u8 raw_sectors[4]; /* 212 - 4 bytes */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 6b1e9ee6ca10..183087374215 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -61,6 +61,7 @@ struct mmc_ios { #define MMC_TIMING_UHS_DDR50 7 #define MMC_TIMING_MMC_DDR52 8 #define MMC_TIMING_MMC_HS200 9 +#define MMC_TIMING_MMC_HS400 10 unsigned char signal_voltage; /* signalling voltage (1.8V or 3.3V) */ @@ -132,6 +133,9 @@ struct mmc_host_ops { /* The tuning command opcode value is different for SD and eMMC cards */ int (*execute_tuning)(struct mmc_host *host, u32 opcode); + + /* Prepare HS400 target operating frequency depending host driver */ + int (*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios); int (*select_drive_strength)(unsigned int max_dtr, int host_drv, int card_drv); void (*hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); @@ -274,6 +278,10 @@ struct mmc_host { #define MMC_CAP2_PACKED_CMD (MMC_CAP2_PACKED_RD | \ MMC_CAP2_PACKED_WR) #define MMC_CAP2_NO_PRESCAN_POWERUP (1 << 14) /* Don't power up before scan */ +#define MMC_CAP2_HS400_1_8V (1 << 15) /* Can support HS400 1.8V */ +#define MMC_CAP2_HS400_1_2V (1 << 16) /* Can support HS400 1.2V */ +#define MMC_CAP2_HS400 (MMC_CAP2_HS400_1_8V | \ + MMC_CAP2_HS400_1_2V) mmc_pm_flag_t pm_caps; /* supported pm features */ @@ -495,4 +503,10 @@ static inline bool mmc_card_ddr52(struct mmc_card *card) { return card->host->ios.timing == MMC_TIMING_MMC_DDR52; } + +static inline bool mmc_card_hs400(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_MMC_HS400; +} + #endif /* LINUX_MMC_HOST_H */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index f429f13be433..64ec963ed347 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -325,6 +325,7 @@ struct _mmc_csd { #define EXT_CSD_POWER_OFF_LONG_TIME 247 /* RO */ #define EXT_CSD_GENERIC_CMD6_TIME 248 /* RO */ #define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */ +#define EXT_CSD_PWR_CL_DDR_200_360 253 /* RO */ #define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */ #define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */ #define EXT_CSD_MAX_PACKED_WRITES 500 /* RO */ @@ -354,7 +355,6 @@ struct _mmc_csd { #define EXT_CSD_CMD_SET_SECURE (1<<1) #define EXT_CSD_CMD_SET_CPSECURE (1<<2) -#define EXT_CSD_CARD_TYPE_MASK 0x3F /* Mask out reserved bits */ 
#define EXT_CSD_CARD_TYPE_HS_26 (1<<0) /* Card can run at 26MHz */ #define EXT_CSD_CARD_TYPE_HS_52 (1<<1) /* Card can run at 52MHz */ #define EXT_CSD_CARD_TYPE_HS (EXT_CSD_CARD_TYPE_HS_26 | \ @@ -370,6 +370,10 @@ struct _mmc_csd { /* SDR mode @1.2V I/O */ #define EXT_CSD_CARD_TYPE_HS200 (EXT_CSD_CARD_TYPE_HS200_1_8V | \ EXT_CSD_CARD_TYPE_HS200_1_2V) +#define EXT_CSD_CARD_TYPE_HS400_1_8V (1<<6) /* Card can run at 200MHz DDR, 1.8V */ +#define EXT_CSD_CARD_TYPE_HS400_1_2V (1<<7) /* Card can run at 200MHz DDR, 1.2V */ +#define EXT_CSD_CARD_TYPE_HS400 (EXT_CSD_CARD_TYPE_HS400_1_8V | \ + EXT_CSD_CARD_TYPE_HS400_1_2V) #define EXT_CSD_BUS_WIDTH_1 0 /* Card is in 1 bit mode */ #define EXT_CSD_BUS_WIDTH_4 1 /* Card is in 4 bit mode */ @@ -380,6 +384,7 @@ struct _mmc_csd { #define EXT_CSD_TIMING_BC 0 /* Backwards compatility */ #define EXT_CSD_TIMING_HS 1 /* High speed */ #define EXT_CSD_TIMING_HS200 2 /* HS200 */ +#define EXT_CSD_TIMING_HS400 3 /* HS400 */ #define EXT_CSD_SEC_ER_EN BIT(0) #define EXT_CSD_SEC_BD_BLK_EN BIT(2) -- cgit v1.2.3-71-gd317 From 4d1f52f9a9f9a63371dba589093b3ae90fc80c3d Mon Sep 17 00:00:00 2001 From: Tim Kryger Date: Tue, 6 May 2014 15:57:01 -0700 Subject: mmc: core: Improve support for deferred regulators Callers of mmc_regulator_get_supply could benefit from knowing if either of the regulators are present but not yet available. Since callers do not currently examine the return value, modify this function to return zero or -EPROBE_DEFER if either regulator get returns the same. Furthermore, since callers check vmmc/vqmmc using IS_ERR and can deal with absent regulators, switch to devm_regulator_get_optional. This has the added benefit of allowing this function to behave correctly even in the !CONFIG_REGULATOR case such that the stub can be removed. 
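For illustration, a host controller probe could consume the new return value like this (a minimal sketch, not part of this patch: foo_probe() is a hypothetical driver and error handling is reduced to the regulator path):

#include <linux/err.h>
#include <linux/mmc/host.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	struct mmc_host *mmc;
	int ret;

	mmc = mmc_alloc_host(0, &pdev->dev);
	if (!mmc)
		return -ENOMEM;

	/*
	 * Returns 0 or -EPROBE_DEFER after this patch; an absent
	 * regulator is reported via IS_ERR(mmc->supply.vmmc/vqmmc)
	 * rather than through the return value.
	 */
	ret = mmc_regulator_get_supply(mmc);
	if (ret) {
		mmc_free_host(mmc);
		return ret;
	}

	if (IS_ERR(mmc->supply.vqmmc))
		dev_dbg(&pdev->dev, "no vqmmc, skipping signal voltage switch\n");

	return mmc_add_host(mmc);
}
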
Signed-off-by: Tim Kryger Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/core.c | 31 +++++++++++++++++++------------ include/linux/mmc/host.h | 8 ++------ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 02baa30653fa..7dc0c85fdb60 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1314,31 +1314,38 @@ int mmc_regulator_set_ocr(struct mmc_host *mmc, } EXPORT_SYMBOL_GPL(mmc_regulator_set_ocr); +#endif /* CONFIG_REGULATOR */ + int mmc_regulator_get_supply(struct mmc_host *mmc) { struct device *dev = mmc_dev(mmc); - struct regulator *supply; int ret; - supply = devm_regulator_get(dev, "vmmc"); - mmc->supply.vmmc = supply; + mmc->supply.vmmc = devm_regulator_get_optional(dev, "vmmc"); mmc->supply.vqmmc = devm_regulator_get_optional(dev, "vqmmc"); - if (IS_ERR(supply)) - return PTR_ERR(supply); + if (IS_ERR(mmc->supply.vmmc)) { + if (PTR_ERR(mmc->supply.vmmc) == -EPROBE_DEFER) + return -EPROBE_DEFER; + dev_info(dev, "No vmmc regulator found\n"); + } else { + ret = mmc_regulator_get_ocrmask(mmc->supply.vmmc); + if (ret > 0) + mmc->ocr_avail = ret; + else + dev_warn(dev, "Failed getting OCR mask: %d\n", ret); + } - ret = mmc_regulator_get_ocrmask(supply); - if (ret > 0) - mmc->ocr_avail = ret; - else - dev_warn(mmc_dev(mmc), "Failed getting OCR mask: %d\n", ret); + if (IS_ERR(mmc->supply.vqmmc)) { + if (PTR_ERR(mmc->supply.vqmmc) == -EPROBE_DEFER) + return -EPROBE_DEFER; + dev_info(dev, "No vqmmc regulator found\n"); + } return 0; } EXPORT_SYMBOL_GPL(mmc_regulator_get_supply); -#endif /* CONFIG_REGULATOR */ - /* * Mask off any voltages we don't support and select * the lowest voltage diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 183087374215..cd595275e118 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -402,7 +402,6 @@ int mmc_regulator_get_ocrmask(struct regulator *supply); int mmc_regulator_set_ocr(struct mmc_host *mmc, struct regulator *supply, unsigned short vdd_bit); -int mmc_regulator_get_supply(struct mmc_host *mmc); #else static inline int mmc_regulator_get_ocrmask(struct regulator *supply) { @@ -415,13 +414,10 @@ static inline int mmc_regulator_set_ocr(struct mmc_host *mmc, { return 0; } - -static inline int mmc_regulator_get_supply(struct mmc_host *mmc) -{ - return 0; -} #endif +int mmc_regulator_get_supply(struct mmc_host *mmc); + int mmc_pm_notify(struct notifier_block *notify_block, unsigned long, void *); static inline int mmc_card_is_removable(struct mmc_host *host) -- cgit v1.2.3-71-gd317 From ec903c0c858e4963a9e0724bdcadfa837253341c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:01 -0400 Subject: cgroup: rename css_tryget*() to css_tryget_online*() Unlike the more usual refcnting, what css_tryget() provides is the distinction between online and offline csses instead of protection against upping a refcnt which already reached zero. cgroup is planning to provide actual tryget which fails if the refcnt already reached zero. Let's rename the existing trygets so that they clearly indicate that they're onliness. I thought about keeping the existing names as-are and introducing new names for the planned actual tryget; however, given that each controller participates in the synchronization of the online state, it seems worthwhile to make it explicit that these functions are about on/offline state. 
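The caller pattern being renamed is the usual RCU lookup followed by a pin that succeeds only while the css is online; condensed into a sketch (the wrapper function below is illustrative, not part of this patch):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static struct cgroup_subsys_state *pin_current_css(int subsys_id)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	/* RCU makes the css safe to dereference here... */
	css = task_css(current, subsys_id);
	/* ...but only a successful tryget pins it, and only while online. */
	if (css && !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();

	return css;	/* caller must css_put() a non-NULL result */
}
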
Rename css_tryget() to css_tryget_online() and css_tryget_from_dir() to css_tryget_online_from_dir(). This is pure rename. v2: cgroup_freezer grew new usages of css_tryget(). Update accordingly. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- block/blk-cgroup.c | 2 +- fs/bio.c | 2 +- include/linux/cgroup.h | 14 +++++++------- kernel/cgroup.c | 40 ++++++++++++++++++++-------------------- kernel/cgroup_freezer.c | 4 ++-- kernel/cpuset.c | 6 +++--- kernel/events/core.c | 3 ++- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 46 ++++++++++++++++++++++++---------------------- 9 files changed, 61 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1039fb9ff5f5..9f5bce33e6fe 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -185,7 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, lockdep_assert_held(q->queue_lock); /* blkg holds a reference to blkcg */ - if (!css_tryget(&blkcg->css)) { + if (!css_tryget_online(&blkcg->css)) { ret = -EINVAL; goto err_free_blkg; } diff --git a/fs/bio.c b/fs/bio.c index 6f0362b77806..0608ef9422bd 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1970,7 +1970,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); css = task_css(current, blkio_cgrp_id); - if (css && css_tryget(css)) + if (css && css_tryget_online(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bde44618d8c2..c5f3684ef557 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -95,16 +95,16 @@ static inline void css_get(struct cgroup_subsys_state *css) } /** - * css_tryget - try to obtain a reference on the specified css + * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * - * Obtain a reference on @css if it's alive. The caller naturally needs to - * ensure that @css is accessible but doesn't have to be holding a + * Obtain a reference on @css if it's online. The caller naturally needs + * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ -static inline bool css_tryget(struct cgroup_subsys_state *css) +static inline bool css_tryget_online(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) return true; @@ -115,7 +115,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) * css_put - put a css reference * @css: target css * - * Put a reference obtained via css_get() and css_tryget(). + * Put a reference obtained via css_get() and css_tryget_online(). 
*/ static inline void css_put(struct cgroup_subsys_state *css) { @@ -905,8 +905,8 @@ void css_task_iter_end(struct css_task_iter *it); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7633703e9baf..671d8a6dae37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3771,7 +3771,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) /* * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); @@ -4060,9 +4060,9 @@ err: * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget() is guaranteed to fail, the css can be offlined - * by invoking offline_css(). After offlining, the base ref is put. - * Implemented in css_killed_work_fn(). + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the @@ -4386,7 +4386,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. */ static void css_killed_work_fn(struct work_struct *work) { @@ -4398,8 +4398,8 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. + * css_tryget_online() is guaranteed to fail now. Tell subsystems + * to initate destruction. */ offline_css(css); @@ -4440,8 +4440,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { @@ -4462,7 +4462,7 @@ static void kill_css(struct cgroup_subsys_state *css) /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via - * css_tryget(). We can't simply call percpu_ref_kill() and + * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * @@ -4478,9 +4478,9 @@ static void kill_css(struct cgroup_subsys_state *css) * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. 
Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked. To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of @@ -4574,9 +4574,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * There are two control paths which try to determine cgroup from * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_from_dir(). Those are supported by RCU protecting - * clearing of cgrp->kn->priv backpointer, which should happen - * after all files under it have been removed. + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. */ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); @@ -5173,7 +5173,7 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * @@ -5181,8 +5181,8 @@ __setup("cgroup_disable=", cgroup_disable); * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup_subsys_state *css = NULL; @@ -5204,7 +5204,7 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, if (cgrp) css = cgroup_css(cgrp, ss); - if (!css || !css_tryget(css)) + if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 345628c78b5b..0398f7e9ac81 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -304,7 +304,7 @@ static int freezer_read(struct seq_file *m, void *v) /* update states bottom-up */ css_for_each_descendant_post(pos, css) { - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); @@ -404,7 +404,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7c0e8da59e26..37ca0a5c226d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -872,7 +872,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -1108,7 +1108,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -2153,7 +2153,7 @@ static 
void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (cs == &top_cpuset || !css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..077968d19b8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -607,7 +607,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); + css = css_tryget_online_from_dir(f.file->f_dentry, + &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 595d7fd795e1..372f1adca491 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -181,7 +181,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget(&h_cg->css)) { + if (!css_tryget_online(&h_cg->css)) { rcu_read_unlock(); goto again; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3f82f69ef58..5cf3246314a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -567,7 +567,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); if (!mem_cgroup_is_root(memcg) && - memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { + memcg_proto_active(cg_proto) && + css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -834,7 +835,7 @@ retry: */ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) + !css_tryget_online(&mz->memcg->css)) goto retry; done: return mz; @@ -1076,7 +1077,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) memcg = root_mem_cgroup; - } while (!css_tryget(&memcg->css)); + } while (!css_tryget_online(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -1113,7 +1114,8 @@ skip_node: */ if (next_css) { if ((next_css == &root->css) || - ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) + ((next_css->flags & CSS_ONLINE) && + css_tryget_online(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1159,7 +1161,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, * would be returned all the time. */ if (position && position != root && - !css_tryget(&position->css)) + !css_tryget_online(&position->css)) position = NULL; } return position; @@ -2785,9 +2787,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling css_tryget if - * the mem_cgroup is used for charging. (dropping refcnt from swap can be - * called against removed memcg.) + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) 
*/ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { @@ -2810,14 +2812,14 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); } @@ -3473,7 +3475,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, } /* The corresponding put will be done in the workqueue. */ - if (!css_tryget(&memcg->css)) + if (!css_tryget_online(&memcg->css)) goto out; rcu_read_unlock(); @@ -4246,8 +4248,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) memcg = mem_cgroup_lookup(id); if (memcg) { /* - * We uncharge this because swap is freed. - * This memcg can be obsolete one. We avoid calling css_tryget + * We uncharge this because swap is freed. This memcg can + * be obsolete one. We avoid calling css_tryget_online(). */ if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); @@ -5840,10 +5842,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) * which is then paired with css_put during uncharge resp. here. * * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 - * and shouldn't be incremented anymore (css_tryget would fail) - * we do not have other options because of the kmem allocations - * lifetime. + * css_offline() when the referencemight have dropped down to 0 and + * shouldn't be incremented anymore (css_tryget_online() would + * fail) we do not have other options because of the kmem + * allocations lifetime. */ css_get(&memcg->css); @@ -6051,8 +6053,8 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, - &memory_cgrp_subsys); + cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) goto out_put_cfile; @@ -6496,7 +6498,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) /* * XXX: css_offline() would be where we should reparent all * memory to prepare the cgroup for destruction. However, - * memcg does not do css_tryget() and res_counter charging + * memcg does not do css_tryget_online() and res_counter charging * under the same RCU lock region, which means that charging * could race with offlining. 
Offlining only happens to * cgroups with no tasks in them but charges can show up @@ -6510,9 +6512,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) * lookup_swap_cgroup_id() * rcu_read_lock() * mem_cgroup_lookup() - * css_tryget() + * css_tryget_online() * rcu_read_unlock() - * disable css_tryget() + * disable css_tryget_online() * call_rcu() * offline_css() * reparent_charges() -- cgit v1.2.3-71-gd317 From b41686401e501430ffe93b575ef7959d2ecc6f2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: implement cftype->write() During the recent conversion to kernfs, cftype's seq_file operations are updated so that they are directly mapped to kernfs operations and thus can fully access the associated kernfs and cgroup contexts; however, write path hasn't seen similar updates and none of the existing write operations has access to, for example, the associated kernfs_open_file. Let's introduce a new operation cftype->write() which maps directly to the kernfs write operation and has access to all the arguments and contexts. This will replace ->write_string() and ->trigger() and ease manipulation of kernfs active protection from cgroup file operations. Two accessors - of_cft() and of_css() - are introduced to enable accessing the associated cgroup context from cftype->write() which only takes kernfs_open_file for the context information. The accessors for seq_file operations - seq_cft() and seq_css() - are rewritten to wrap the of_ accessors. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 27 +++++++++++++++++++++++---- kernel/cgroup.c | 14 ++++++++------ 2 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c5f3684ef557..c5a170ca4a48 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -515,6 +515,15 @@ struct cftype { */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. Use + * of_css/cft() to access the associated css and cft. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif @@ -552,14 +561,24 @@ static inline ino_t cgroup_ino(struct cgroup *cgrp) return 0; } -static inline struct cftype *seq_cft(struct seq_file *seq) +/* cft/css accessors for cftype->write() operation */ +static inline struct cftype *of_cft(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; - return of->kn->priv; } -struct cgroup_subsys_state *seq_css(struct seq_file *seq); +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); + +/* cft/css accessors for cftype->seq_*() operations */ +static inline struct cftype *seq_cft(struct seq_file *seq) +{ + return of_cft(seq->private); +} + +static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + return of_css(seq->private); +} /* * Name / path handling functions. 
All are thin wrappers around the kernfs diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671d8a6dae37..a16f91d12f4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -283,11 +283,10 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } -struct cgroup_subsys_state *seq_css(struct seq_file *seq) +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = seq_cft(seq); + struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). @@ -302,7 +301,7 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) else return &cgrp->dummy_css; } -EXPORT_SYMBOL_GPL(seq_css); +EXPORT_SYMBOL_GPL(of_css); /** * cgroup_is_descendant - test ancestry @@ -1035,8 +1034,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write || + cft->write_string || cft->trigger) mode |= S_IWUSR; return mode; @@ -2726,6 +2725,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (cft->write) + return cft->write(of, buf, nbytes, off); + /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and -- cgit v1.2.3-71-gd317 From 451af504df0c62f695a69b83c250486e77c66378 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->write_string() with cftype->write() Convert all cftype->write_string() users to the new cftype->write() which maps directly to kernfs write operation and has full access to kernfs and cgroup contexts. The conversions are mostly mechanical. * @css and @cft are accessed using of_css() and of_cft() accessors respectively instead of being specified as arguments. * Should return @nbytes on success instead of 0. * @buf is not trimmed automatically. Trim if necessary. Note that blkcg and netprio don't need this as the parsers already handle whitespaces. cftype->write_string() has no user left after the conversions and removed. While at it, remove unnecessary local variable @p in cgroup_subtree_control_write() and stale comment about CGROUP_LOCAL_BUFFER_SIZE in cgroup_freezer.c. This patch doesn't introduce any visible behavior changes. v2: netprio was missing from conversion. Converted. Signed-off-by: Tejun Heo Acked-by: Aristeu Rozanski Acked-by: Vivek Goyal Acked-by: Li Zefan Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Neil Horman Cc: "David S. 
Miller" --- block/blk-throttle.c | 32 ++++++++++++++++---------------- block/cfq-iosched.c | 28 ++++++++++++++-------------- include/linux/cgroup.h | 10 +--------- kernel/cgroup.c | 38 +++++++++++++++++++------------------- kernel/cgroup_freezer.c | 20 +++++++++----------- kernel/cpuset.c | 16 +++++++++------- mm/hugetlb_cgroup.c | 17 +++++++++-------- mm/memcontrol.c | 46 +++++++++++++++++++++++++--------------------- net/core/netprio_cgroup.c | 12 ++++++------ net/ipv4/tcp_memcontrol.c | 16 +++++++++------- security/device_cgroup.c | 14 +++++++------- 11 files changed, 124 insertions(+), 125 deletions(-) (limited to 'include/linux') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745cd7fba..5e8fd1bace98 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1346,10 +1346,10 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buf, bool is_u64) +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) { - struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; struct throtl_service_queue *sq; @@ -1368,9 +1368,9 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, ctx.v = -1; if (is_u64) - *(u64 *)((void *)tg + cft->private) = ctx.v; + *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; else - *(unsigned int *)((void *)tg + cft->private) = ctx.v; + *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1404,19 +1404,19 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, } blkg_conf_finish(&ctx); - return 0; + return nbytes; } -static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, - char *buf) +static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return tg_set_conf(css, cft, buf, true); + return tg_set_conf(of, buf, nbytes, off, true); } -static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, - char *buf) +static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return tg_set_conf(css, cft, buf, false); + return tg_set_conf(of, buf, nbytes, off, false); } static struct cftype throtl_files[] = { @@ -1424,25 +1424,25 @@ static struct cftype throtl_files[] = { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, - .write_string = tg_set_conf_u64, + .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, - .write_string = tg_set_conf_u64, + .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, - .write_string = tg_set_conf_uint, + .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, - .write_string = tg_set_conf_uint, + .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e0985f1955e7..a73020b8c9af 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1670,11 +1670,11 @@ static int 
cfq_print_leaf_weight(struct seq_file *sf, void *v) return 0; } -static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf, - bool is_leaf_weight) +static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + bool is_leaf_weight) { - struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; int ret; @@ -1697,19 +1697,19 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, } blkg_conf_finish(&ctx); - return ret; + return ret ?: nbytes; } -static int cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(css, cft, buf, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false); } -static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(css, cft, buf, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, true); } static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, @@ -1837,7 +1837,7 @@ static struct cftype cfq_blkcg_files[] = { .name = "weight_device", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cfqg_print_leaf_weight_device, - .write_string = cfqg_set_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, }, { .name = "weight", @@ -1851,7 +1851,7 @@ static struct cftype cfq_blkcg_files[] = { .name = "weight_device", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cfqg_print_weight_device, - .write_string = cfqg_set_weight_device, + .write = cfqg_set_weight_device, }, { .name = "weight", @@ -1863,7 +1863,7 @@ static struct cftype cfq_blkcg_files[] = { { .name = "leaf_weight_device", .seq_show = cfqg_print_leaf_weight_device, - .write_string = cfqg_set_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, }, { .name = "leaf_weight", diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c5a170ca4a48..aecdc84fe128 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -453,8 +453,7 @@ struct cftype { /* * The maximum length of string, excluding trailing nul, that can - * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is - * assumed. + * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. */ size_t max_write_len; @@ -500,13 +499,6 @@ struct cftype { int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); - /* - * write_string() is passed a nul-terminated kernelspace - * buffer of maximum length determined by max_write_len. - * Returns 0 or -ve error code. 
- */ - int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer); /* * trigger() callback can be used to get some kick from the * userspace, when the actual string written is not important diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a16f91d12f4e..2a88ce7b24b6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,7 +1035,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write || - cft->write_string || cft->trigger) + cft->trigger) mode |= S_IWUSR; return mode; @@ -2352,20 +2352,21 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cgroup_root *root = css->cgroup->root; + struct cgroup *cgrp = of_css(of)->cgroup; + struct cgroup_root *root = cgrp->root; BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(css->cgroup)) + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, buffer, + strlcpy(root->release_agent_path, strstrip(buf), sizeof(root->release_agent_path)); spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); - return 0; + return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) @@ -2530,21 +2531,22 @@ out_finish: } /* change the enabled child controllers for a cgroup in the default hierarchy */ -static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { unsigned int enable = 0, disable = 0; - struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup *cgrp = of_css(of)->cgroup, *child; struct cgroup_subsys *ss; - char *tok, *p; + char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. 
*/ - p = buffer; - while ((tok = strsep(&p, " "))) { + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { @@ -2692,7 +2694,7 @@ out_unlock_tree: out_unbreak: kernfs_unbreak_active_protection(cgrp->control_kn); cgroup_put(cgrp); - return ret; + return ret ?: nbytes; err_undo_css: cgrp->child_subsys_mask &= ~enable; @@ -2738,9 +2740,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); - if (cft->write_string) { - ret = cft->write_string(css, cft, strstrip(buf)); - } else if (cft->write_u64) { + if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) @@ -3984,7 +3984,7 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.subtree_control", .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, - .write_string = cgroup_subtree_control_write, + .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", @@ -4018,7 +4018,7 @@ static struct cftype cgroup_base_files[] = { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, + .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 0398f7e9ac81..6b4e60e33a9a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -73,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task) return ret; } -/* - * cgroups_write_string() limits the size of freezer state strings to - * CGROUP_LOCAL_BUFFER_SIZE - */ static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) @@ -423,20 +419,22 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) mutex_unlock(&freezer_mutex); } -static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { bool freeze; - if (strcmp(buffer, freezer_state_strs(0)) == 0) + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; - freezer_change_state(css_freezer(css), freeze); - return 0; + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, @@ -460,7 +458,7 @@ static struct cftype files[] = { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, - .write_string = freezer_write, + .write = freezer_write, }, { .name = "self_freezing", diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 37ca0a5c226d..2f4b08b8db24 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1603,13 +1603,15 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. 
*/ -static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cpuset *cs = css_cs(css); + struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; + buf = strstrip(buf); + /* * CPU or memory hotunplug may leave @cs w/o any execution * resources, in which case the hotplug code asynchronously updates @@ -1633,7 +1635,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, goto out_unlock; } - switch (cft->private) { + switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; @@ -1648,7 +1650,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); - return retval; + return retval ?: nbytes; } /* @@ -1750,7 +1752,7 @@ static struct cftype files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, @@ -1758,7 +1760,7 @@ static struct cftype files[] = { { .name = "mems", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 372f1adca491..191de26b0148 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -253,15 +253,16 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, return res_counter_read_u64(&h_cg->hugepage[idx], name); } -static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret; unsigned long long val; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -271,7 +272,7 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = res_counter_set_limit(&h_cg->hugepage[idx], val); @@ -280,7 +281,7 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, @@ -331,7 +332,7 @@ static void __init __hugetlb_cgroup_file_init(int idx) snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; - cft->write_string = hugetlb_cgroup_write; + cft->write = hugetlb_cgroup_write; /* Add the usage file */ cft = &h->cgroup_files[1]; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5cf3246314a2..7098a43f7447 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5143,17 +5143,18 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * The user of this function is... * RES_LIMIT. 
*/ -static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); enum res_type type; int name; unsigned long long val; int ret; - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -5162,7 +5163,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; if (type == _MEM) @@ -5175,7 +5176,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; break; case RES_SOFT_LIMIT: - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; /* @@ -5192,7 +5193,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; /* should be BUG() ? */ break; } - return ret; + return ret ?: nbytes; } static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, @@ -5964,9 +5965,10 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { + struct cgroup_subsys_state *css = of_css(of); struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; @@ -5977,15 +5979,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, char *endp; int ret; - efd = simple_strtoul(buffer, &endp, 10); + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); if (*endp != ' ') return -EINVAL; - buffer = endp + 1; + buf = endp + 1; - cfd = simple_strtoul(buffer, &endp, 10); + cfd = simple_strtoul(buf, &endp, 10); if ((*endp != ' ') && (*endp != '\0')) return -EINVAL; - buffer = endp + 1; + buf = endp + 1; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) @@ -6063,7 +6067,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->register_event(memcg, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buf); if (ret) goto out_put_css; @@ -6076,7 +6080,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, fdput(cfile); fdput(efile); - return 0; + return nbytes; out_put_css: css_put(css); @@ -6107,13 +6111,13 @@ static struct cftype mem_cgroup_files[] = { { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { @@ -6138,7 +6142,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "cgroup.event_control", /* XXX: for compat 
*/ - .write_string = memcg_write_event_control, + .write = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6171,7 +6175,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { @@ -6217,7 +6221,7 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3825f669147b..b990cefd906b 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -185,15 +185,15 @@ static int read_priomap(struct seq_file *sf, void *v) return 0; } -static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t write_priomap(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; - if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); @@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(css, dev, prio); + ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); - return ret; + return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) @@ -239,7 +239,7 @@ static struct cftype ss_files[] = { { .name = "ifpriomap", .seq_show = read_priomap, - .write_string = write_priomap, + .write = write_priomap, }, { } /* terminate */ }; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index d4f015ad6c84..841fd3fa937a 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -102,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned long long val; int ret = 0; - switch (cft->private) { + buf = strstrip(buf); + + switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = tcp_update_limit(memcg, val); @@ -121,7 +123,7 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) @@ -193,7 +195,7 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", - .write_string = tcp_cgroup_write, + .write = tcp_cgroup_write, .read_u64 = tcp_cgroup_read, .private = RES_LIMIT, }, diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 9134dbf70d3e..7dbac4061b1c 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -767,27 +767,27 @@ static int 
devcgroup_update_access(struct dev_cgroup *devcgroup, return rc; } -static int devcgroup_access_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t devcgroup_access_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); - retval = devcgroup_update_access(css_to_devcgroup(css), - cft->private, buffer); + retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), + of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); - return retval; + return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", - .write_string = devcgroup_access_write, + .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", - .write_string = devcgroup_access_write, + .write = devcgroup_access_write, .private = DEVCG_DENY, }, { -- cgit v1.2.3-71-gd317 From 6770c64e5c8da4705d1f0973bdeb5c2bf4f3a404 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->trigger() with cftype->write() cftype->trigger() is pointless. It's trivial to ignore the input buffer from a regular ->write() operation. Convert all ->trigger() users to ->write() and remove ->trigger(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko --- include/linux/cgroup.h | 8 -------- kernel/cgroup.c | 5 +---- mm/hugetlb_cgroup.c | 16 ++++++++-------- mm/memcontrol.c | 34 ++++++++++++++++++---------------- net/ipv4/tcp_memcontrol.c | 15 ++++++++------- 5 files changed, 35 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index aecdc84fe128..08eb71ee600b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -499,14 +499,6 @@ struct cftype { int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); - /* - * trigger() callback can be used to get some kick from the - * userspace, when the actual string written is not important - * at all. The private field can be used to determine the - * kick type for multiplexing. - */ - int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); - /* * write() is the generic write callback which maps directly to * kernfs write operation and overrides all other operations. 
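For reference, a trigger-style handler after this conversion reduces to the following shape (an illustrative sketch, not part of this patch: struct foo and its "reset" file are hypothetical):

#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/kernfs.h>

struct foo {
	struct cgroup_subsys_state css;	/* embedded css; see css_foo() */
	atomic_t count;
};

static inline struct foo *css_foo(struct cgroup_subsys_state *css)
{
	return container_of(css, struct foo, css);
}

static ssize_t foo_reset_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct foo *foo = css_foo(of_css(of));

	/* Trigger-style operation: the written string is irrelevant. */
	atomic_set(&foo->count, 0);
	return nbytes;	/* success reports the bytes consumed */
}

static struct cftype foo_files[] = {
	{
		.name = "reset",
		.write = foo_reset_write,
	},
	{ }	/* terminate */
};
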
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a88ce7b24b6..2f16aab03493 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1034,8 +1034,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write) mode |= S_IWUSR; return mode; @@ -2750,8 +2749,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); - } else if (cft->trigger) { - ret = cft->trigger(css, (unsigned int)cft->private); } else { ret = -EINVAL; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 191de26b0148..a380681ab3cf 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -284,14 +284,14 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret ?: nbytes; } -static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret = 0; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(event); - name = MEMFILE_ATTR(event); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -304,7 +304,7 @@ static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) @@ -344,14 +344,14 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7098a43f7447..b638a79209ee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4887,14 +4887,15 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return 0; } -static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); if (mem_cgroup_is_root(memcg)) return -EINVAL; - return mem_cgroup_force_empty(memcg); + return mem_cgroup_force_empty(memcg) ?: nbytes; } static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, @@ -5220,14 +5221,15 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); int name; enum res_type type; - type = 
MEMFILE_TYPE(event); - name = MEMFILE_ATTR(event); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -5252,7 +5254,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, @@ -6105,7 +6107,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6123,7 +6125,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6132,7 +6134,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "force_empty", - .trigger = mem_cgroup_force_empty_write, + .write = mem_cgroup_force_empty_write, }, { .name = "use_hierarchy", @@ -6186,13 +6188,13 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO @@ -6215,7 +6217,7 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6227,7 +6229,7 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 841fd3fa937a..f7a2ec3ac584 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -170,17 +170,18 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_css(css); + memcg = mem_cgroup_from_css(of_css(of)); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; + return nbytes; - switch (event) { + switch (of_cft(of)->private) { case RES_MAX_USAGE: res_counter_reset_max(&cg_proto->memory_allocated); break; @@ -189,7 +190,7 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static struct cftype tcp_files[] = { @@ -207,13 +208,13 @@ static struct cftype tcp_files[] = { { .name = "kmem.tcp.failcnt", .private = RES_FAILCNT, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { .name = "kmem.tcp.max_usage_in_bytes", .private = RES_MAX_USAGE, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { } /* terminate */ -- cgit v1.2.3-71-gd317 From b7fc5ad235936379fae67a9f7b50bb53487a1a3a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 
2014 12:16:22 -0400 Subject: cgroup: remove cgroup->control_kn Now that cgroup_subtree_control_write() has access to the associated kernfs_open_file and thus the kernfs_node, there's no need to cache it in cgroup->control_kn on creation. Remove cgroup->control_kn and use @of->kn directly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 08eb71ee600b..aa7353deaaf3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -177,7 +177,6 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a48c117ebf1..94d259bcd2b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2580,7 +2580,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * active_ref protection. */ cgroup_get(cgrp); - kernfs_break_active_protection(cgrp->control_kn); + kernfs_break_active_protection(of->kn); mutex_lock(&cgroup_tree_mutex); @@ -2697,7 +2697,7 @@ out_unlock: out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); out_unbreak: - kernfs_unbreak_active_protection(cgrp->control_kn); + kernfs_unbreak_active_protection(of->kn); cgroup_put(cgrp); return ret ?: nbytes; @@ -2887,9 +2887,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->seq_show == cgroup_subtree_control_show) - cgrp->control_kn = kn; - else if (cft->seq_show == cgroup_populated_show) + if (cft->seq_show == cgroup_populated_show) cgrp->populated_kn = kn; return 0; } -- cgit v1.2.3-71-gd317 From 7ad24ea4bf620a32631d7b3069c3e30c078b0c3e Mon Sep 17 00:00:00 2001 From: Wilfried Klaebe Date: Sun, 11 May 2014 00:12:32 +0000 Subject: net: get rid of SET_ETHTOOL_OPS Dave Miller mentioned he'd like to see SET_ETHTOOL_OPS gone. This does that. Mostly done via coccinelle script: @@ struct ethtool_ops *ops; struct net_device *dev; @@ - SET_ETHTOOL_OPS(dev, ops); + dev->ethtool_ops = ops; Compile tested only, but I'd seriously wonder if this broke anything. Suggested-by: Dave Miller Signed-off-by: Wilfried Klaebe Acked-by: Felipe Balbi Signed-off-by: David S. 
Miller --- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 2 +- drivers/net/ethernet/3com/3c509.c | 2 +- drivers/net/ethernet/3com/3c589_cs.c | 2 +- drivers/net/ethernet/3com/typhoon.c | 2 +- drivers/net/ethernet/adaptec/starfire.c | 2 +- drivers/net/ethernet/alteon/acenic.c | 2 +- drivers/net/ethernet/altera/altera_tse_ethtool.c | 2 +- drivers/net/ethernet/amd/amd8111e.c | 2 +- drivers/net/ethernet/amd/au1000_eth.c | 2 +- drivers/net/ethernet/amd/nmclan_cs.c | 2 +- drivers/net/ethernet/atheros/alx/main.c | 2 +- drivers/net/ethernet/atheros/atl1c/atl1c_ethtool.c | 2 +- drivers/net/ethernet/atheros/atl1e/atl1e_ethtool.c | 2 +- drivers/net/ethernet/atheros/atlx/atl2.c | 2 +- drivers/net/ethernet/broadcom/b44.c | 2 +- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 4 ++-- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- drivers/net/ethernet/broadcom/bgmac.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 6 ++---- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 2 +- drivers/net/ethernet/brocade/bna/bnad_ethtool.c | 2 +- drivers/net/ethernet/calxeda/xgmac.c | 2 +- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 2 +- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 2 +- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 2 +- drivers/net/ethernet/dec/tulip/tulip_core.c | 2 +- drivers/net/ethernet/dlink/dl2k.c | 2 +- drivers/net/ethernet/dlink/sundance.c | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- drivers/net/ethernet/faraday/ftgmac100.c | 2 +- drivers/net/ethernet/faraday/ftmac100.c | 2 +- drivers/net/ethernet/freescale/ucc_geth_ethtool.c | 2 +- drivers/net/ethernet/fujitsu/fmvj18x_cs.c | 2 +- drivers/net/ethernet/ibm/ehea/ehea_ethtool.c | 2 +- drivers/net/ethernet/ibm/emac/core.c | 2 +- drivers/net/ethernet/icplus/ipg.c | 2 +- drivers/net/ethernet/intel/e100.c | 2 +- drivers/net/ethernet/intel/e1000/e1000_ethtool.c | 2 +- drivers/net/ethernet/intel/e1000e/ethtool.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 2 +- drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 2 +- drivers/net/ethernet/intel/igb/igb_ethtool.c | 2 +- drivers/net/ethernet/intel/igbvf/ethtool.c | 2 +- drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 2 +- drivers/net/ethernet/intel/ixgbevf/ethtool.c | 2 +- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- drivers/net/ethernet/marvell/pxa168_eth.c | 2 +- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/micrel/ks8695net.c | 6 +++--- drivers/net/ethernet/micrel/ks8851.c | 2 +- drivers/net/ethernet/micrel/ksz884x.c | 2 +- drivers/net/ethernet/microchip/enc28j60.c | 2 +- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 2 +- drivers/net/ethernet/natsemi/natsemi.c | 2 +- drivers/net/ethernet/natsemi/ns83820.c | 2 +- drivers/net/ethernet/neterion/s2io.c | 2 +- drivers/net/ethernet/neterion/vxge/vxge-ethtool.c | 2 +- drivers/net/ethernet/nvidia/forcedeth.c | 2 +- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c | 2 +- drivers/net/ethernet/packetengines/hamachi.c | 6 ++---- drivers/net/ethernet/packetengines/yellowfin.c | 2 +- drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 2 +- drivers/net/ethernet/qlogic/qla3xxx.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 8 +++----- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 2 +- 
drivers/net/ethernet/realtek/r8169.c | 2 +- drivers/net/ethernet/renesas/sh_eth.c | 2 +- drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c | 2 +- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/sis/sis190.c | 2 +- drivers/net/ethernet/smsc/smc91c92_cs.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 2 +- drivers/net/ethernet/tehuti/tehuti.c | 2 +- drivers/net/ethernet/ti/cpsw.c | 4 ++-- drivers/net/ethernet/ti/davinci_emac.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/ntb_netdev.c | 2 +- drivers/net/rionet.c | 2 +- drivers/net/usb/catc.c | 2 +- drivers/net/usb/hso.c | 2 +- drivers/net/usb/ipheth.c | 2 +- drivers/net/usb/kaweth.c | 2 +- drivers/net/usb/pegasus.c | 2 +- drivers/net/usb/r8152.c | 2 +- drivers/net/usb/rtl8150.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/net/vmxnet3/vmxnet3_ethtool.c | 2 +- drivers/net/vxlan.c | 2 +- drivers/net/wireless/hostap/hostap_main.c | 2 +- drivers/net/xen-netback/interface.c | 2 +- drivers/net/xen-netfront.c | 2 +- drivers/s390/net/qeth_l2_main.c | 7 +++---- drivers/s390/net/qeth_l3_main.c | 2 +- drivers/staging/et131x/et131x.c | 2 +- drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c | 2 +- drivers/staging/netlogic/xlr_net.c | 2 +- drivers/staging/octeon/ethernet.c | 2 +- drivers/usb/gadget/u_ether.c | 4 ++-- include/linux/netdevice.h | 3 --- net/batman-adv/soft-interface.c | 2 +- net/bridge/br_device.c | 2 +- net/dsa/slave.c | 2 +- net/openvswitch/vport-internal_dev.c | 2 +- 108 files changed, 118 insertions(+), 128 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index c4b3940845e6..078cadd6c797 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -105,5 +105,5 @@ static const struct ethtool_ops ipoib_ethtool_ops = { void ipoib_set_ethtool_ops(struct net_device *dev) { - SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); + dev->ethtool_ops = &ipoib_ethtool_ops; } diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index 35df0b9e6848..a968654b631d 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -534,7 +534,7 @@ static int el3_common_init(struct net_device *dev) /* The EL3-specific entries in the device structure. 
*/ dev->netdev_ops = &netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &ethtool_ops); + dev->ethtool_ops = &ethtool_ops; err = register_netdev(dev); if (err) { diff --git a/drivers/net/ethernet/3com/3c589_cs.c b/drivers/net/ethernet/3com/3c589_cs.c index 063557e037f2..f18647c23559 100644 --- a/drivers/net/ethernet/3com/3c589_cs.c +++ b/drivers/net/ethernet/3com/3c589_cs.c @@ -218,7 +218,7 @@ static int tc589_probe(struct pcmcia_device *link) dev->netdev_ops = &el3_netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->ethtool_ops = &netdev_ethtool_ops; return tc589_config(link); } diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index 465cc7108d8a..e13b04624ded 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -2435,7 +2435,7 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netif_napi_add(dev, &tp->napi, typhoon_poll, 16); dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &typhoon_ethtool_ops); + dev->ethtool_ops = &typhoon_ethtool_ops; /* We can handle scatter gather, up to 16 entries, and * we can do IP checksumming (only version 4, doh...) diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 171d73c1d3c2..40dbbf740331 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -784,7 +784,7 @@ static int starfire_init_one(struct pci_dev *pdev, dev->netdev_ops = &netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &ethtool_ops); + dev->ethtool_ops = &ethtool_ops; netif_napi_add(dev, &np->napi, netdev_poll, max_interrupt_work); diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index 1517e9df5ba1..9a6991be9749 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -476,7 +476,7 @@ static int acenic_probe_one(struct pci_dev *pdev, dev->watchdog_timeo = 5*HZ; dev->netdev_ops = &ace_netdev_ops; - SET_ETHTOOL_OPS(dev, &ace_ethtool_ops); + dev->ethtool_ops = &ace_ethtool_ops; /* we only display this string ONCE */ if (!boards_found) diff --git a/drivers/net/ethernet/altera/altera_tse_ethtool.c b/drivers/net/ethernet/altera/altera_tse_ethtool.c index 76133caffa78..d817e285b266 100644 --- a/drivers/net/ethernet/altera/altera_tse_ethtool.c +++ b/drivers/net/ethernet/altera/altera_tse_ethtool.c @@ -237,5 +237,5 @@ static const struct ethtool_ops tse_ethtool_ops = { void altera_tse_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &tse_ethtool_ops); + netdev->ethtool_ops = &tse_ethtool_ops; } diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 26efaaa5e73f..068dc7cad5fa 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -1900,7 +1900,7 @@ static int amd8111e_probe_one(struct pci_dev *pdev, /* Initialize driver entry points */ dev->netdev_ops = &amd8111e_netdev_ops; - SET_ETHTOOL_OPS(dev, &ops); + dev->ethtool_ops = &ops; dev->irq =pdev->irq; dev->watchdog_timeo = AMD8111E_TX_TIMEOUT; netif_napi_add(dev, &lp->napi, amd8111e_rx_poll, 32); diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c index a2bd91e3d302..a78e4c136959 100644 --- a/drivers/net/ethernet/amd/au1000_eth.c +++ b/drivers/net/ethernet/amd/au1000_eth.c @@ -1229,7 +1229,7 @@ static int au1000_probe(struct platform_device *pdev) dev->base_addr = 
base->start; dev->irq = irq; dev->netdev_ops = &au1000_netdev_ops; - SET_ETHTOOL_OPS(dev, &au1000_ethtool_ops); + dev->ethtool_ops = &au1000_ethtool_ops; dev->watchdog_timeo = ETH_TX_TIMEOUT; /* diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index 08569fe2b182..abf3b1581c82 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -457,7 +457,7 @@ static int nmclan_probe(struct pcmcia_device *link) lp->tx_free_frames=AM2150_MAX_TX_FRAMES; dev->netdev_ops = &mace_netdev_ops; - SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->ethtool_ops = &netdev_ethtool_ops; dev->watchdog_timeo = TX_TIMEOUT; return nmclan_config(link); diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c index 17bb9ce96260..49faa97a30c3 100644 --- a/drivers/net/ethernet/atheros/alx/main.c +++ b/drivers/net/ethernet/atheros/alx/main.c @@ -1302,7 +1302,7 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } netdev->netdev_ops = &alx_netdev_ops; - SET_ETHTOOL_OPS(netdev, &alx_ethtool_ops); + netdev->ethtool_ops = &alx_ethtool_ops; netdev->irq = pdev->irq; netdev->watchdog_timeo = ALX_WATCHDOG_TIME; diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_ethtool.c b/drivers/net/ethernet/atheros/atl1c/atl1c_ethtool.c index 859ea844ba0f..ecacaaeb2b92 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_ethtool.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_ethtool.c @@ -305,5 +305,5 @@ static const struct ethtool_ops atl1c_ethtool_ops = { void atl1c_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &atl1c_ethtool_ops); + netdev->ethtool_ops = &atl1c_ethtool_ops; } diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_ethtool.c b/drivers/net/ethernet/atheros/atl1e/atl1e_ethtool.c index 82b23861bf55..206e9b7be431 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_ethtool.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_ethtool.c @@ -388,5 +388,5 @@ static const struct ethtool_ops atl1e_ethtool_ops = { void atl1e_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &atl1e_ethtool_ops); + netdev->ethtool_ops = &atl1e_ethtool_ops; } diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c index 78befb522a52..2587fed7b02c 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.c +++ b/drivers/net/ethernet/atheros/atlx/atl2.c @@ -1396,7 +1396,7 @@ static int atl2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) atl2_setup_pcicmd(pdev); netdev->netdev_ops = &atl2_netdev_ops; - SET_ETHTOOL_OPS(netdev, &atl2_ethtool_ops); + netdev->ethtool_ops = &atl2_ethtool_ops; netdev->watchdog_timeo = 5 * HZ; strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1); diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index 05ba62589017..ca5a20a48b14 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -2380,7 +2380,7 @@ static int b44_init_one(struct ssb_device *sdev, netif_napi_add(dev, &bp->napi, b44_poll, 64); dev->watchdog_timeo = B44_TX_TIMEOUT; dev->irq = sdev->irq; - SET_ETHTOOL_OPS(dev, &b44_ethtool_ops); + dev->ethtool_ops = &b44_ethtool_ops; err = ssb_bus_powerup(sdev->bus, 0); if (err) { diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index 8db34d389675..3e8d1a88ed3d 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1897,7 +1897,7 @@ static int bcm_enet_probe(struct platform_device *pdev) dev->netdev_ops = &bcm_enet_ops; netif_napi_add(dev, &priv->napi, bcm_enet_poll, 16); - SET_ETHTOOL_OPS(dev, &bcm_enet_ethtool_ops); + dev->ethtool_ops = &bcm_enet_ethtool_ops; SET_NETDEV_DEV(dev, &pdev->dev); ret = register_netdev(dev); @@ -2783,7 +2783,7 @@ static int bcm_enetsw_probe(struct platform_device *pdev) /* register netdevice */ dev->netdev_ops = &bcm_enetsw_ops; netif_napi_add(dev, &priv->napi, bcm_enet_poll, 16); - SET_ETHTOOL_OPS(dev, &bcm_enetsw_ethtool_ops); + dev->ethtool_ops = &bcm_enetsw_ethtool_ops; SET_NETDEV_DEV(dev, &pdev->dev); spin_lock_init(&priv->enetsw_mdio_lock); diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 4dc8d1e9829b..56b74a495181 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1540,7 +1540,7 @@ static int bcm_sysport_probe(struct platform_device *pdev) SET_NETDEV_DEV(dev, &pdev->dev); dev_set_drvdata(&pdev->dev, dev); - SET_ETHTOOL_OPS(dev, &bcm_sysport_ethtool_ops); + dev->ethtool_ops = &bcm_sysport_ethtool_ops; dev->netdev_ops = &bcm_sysport_netdev_ops; netif_napi_add(dev, &priv->napi, bcm_sysport_poll, 64); diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 0297a79a38e1..05c6af6c418f 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1436,7 +1436,7 @@ static int bgmac_probe(struct bcma_device *core) return -ENOMEM; net_dev->netdev_ops = &bgmac_netdev_ops; net_dev->irq = core->irq; - SET_ETHTOOL_OPS(net_dev, &bgmac_ethtool_ops); + net_dev->ethtool_ops = &bgmac_ethtool_ops; bgmac = netdev_priv(net_dev); bgmac->net_dev = net_dev; bgmac->core = core; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index b6de05e3149b..03224090ecf9 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -3506,8 +3506,6 @@ static const struct ethtool_ops bnx2x_vf_ethtool_ops = { void bnx2x_set_ethtool_ops(struct bnx2x *bp, struct net_device *netdev) { - if (IS_PF(bp)) - SET_ETHTOOL_OPS(netdev, &bnx2x_ethtool_ops); - else /* vf */ - SET_ETHTOOL_OPS(netdev, &bnx2x_vf_ethtool_ops); + netdev->ethtool_ops = (IS_PF(bp)) ? 
+ &bnx2x_ethtool_ops : &bnx2x_vf_ethtool_ops; } diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 0966bd04375f..5ba1cfbd60da 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2481,7 +2481,7 @@ static int bcmgenet_probe(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, dev); ether_addr_copy(dev->dev_addr, macaddr); dev->watchdog_timeo = 2 * HZ; - SET_ETHTOOL_OPS(dev, &bcmgenet_ethtool_ops); + dev->ethtool_ops = &bcmgenet_ethtool_ops; dev->netdev_ops = &bcmgenet_netdev_ops; netif_napi_add(dev, &priv->napi, bcmgenet_poll, 64); diff --git a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c index f9e150825bb5..adca62b72837 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c +++ b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c @@ -1137,5 +1137,5 @@ static const struct ethtool_ops bnad_ethtool_ops = { void bnad_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &bnad_ethtool_ops); + netdev->ethtool_ops = &bnad_ethtool_ops; } diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c index 521dfea44b83..25d6b2a10e4e 100644 --- a/drivers/net/ethernet/calxeda/xgmac.c +++ b/drivers/net/ethernet/calxeda/xgmac.c @@ -1737,7 +1737,7 @@ static int xgmac_probe(struct platform_device *pdev) platform_set_drvdata(pdev, ndev); ether_setup(ndev); ndev->netdev_ops = &xgmac_netdev_ops; - SET_ETHTOOL_OPS(ndev, &xgmac_ethtool_ops); + ndev->ethtool_ops = &xgmac_ethtool_ops; spin_lock_init(&priv->stats_lock); INIT_WORK(&priv->tx_timeout_work, xgmac_tx_timeout_work); diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 0fe7ff750d77..c1b2c1dbf015 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -1100,7 +1100,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netif_napi_add(netdev, &adapter->napi, t1_poll, 64); - SET_ETHTOOL_OPS(netdev, &t1_ethtool_ops); + netdev->ethtool_ops = &t1_ethtool_ops; } if (t1_init_sw_modules(adapter, bi) < 0) { diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 07bbb711b7e5..3ed507947248 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -3291,7 +3291,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->features |= NETIF_F_HIGHDMA; netdev->netdev_ops = &cxgb_netdev_ops; - SET_ETHTOOL_OPS(netdev, &cxgb_ethtool_ops); + netdev->ethtool_ops = &cxgb_ethtool_ops; } pci_set_drvdata(pdev, adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 0f1e886d89e3..266a5bc6aedf 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -6083,7 +6083,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->priv_flags |= IFF_UNICAST_FLT; netdev->netdev_ops = &cxgb4_netdev_ops; - SET_ETHTOOL_OPS(netdev, &cxgb_ethtool_ops); + netdev->ethtool_ops = &cxgb_ethtool_ops; } pci_set_drvdata(pdev, adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index 52859288de7b..ff1cdd1788b5 100644 --- 
a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -2664,7 +2664,7 @@ static int cxgb4vf_pci_probe(struct pci_dev *pdev, netdev->priv_flags |= IFF_UNICAST_FLT; netdev->netdev_ops = &cxgb4vf_netdev_ops; - SET_ETHTOOL_OPS(netdev, &cxgb4vf_ethtool_ops); + netdev->ethtool_ops = &cxgb4vf_ethtool_ops; /* * Initialize the hardware/software state for the port. diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 47e3562f4866..58a8c67638e3 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -253,5 +253,5 @@ static const struct ethtool_ops enic_ethtool_ops = { void enic_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &enic_ethtool_ops); + netdev->ethtool_ops = &enic_ethtool_ops; } diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 1642de78aac8..861660841ce2 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -1703,7 +1703,7 @@ static int tulip_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) #ifdef CONFIG_TULIP_NAPI netif_napi_add(dev, &tp->napi, tulip_poll, 16); #endif - SET_ETHTOOL_OPS(dev, &ops); + dev->ethtool_ops = &ops; if (register_netdev(dev)) goto err_out_free_ring; diff --git a/drivers/net/ethernet/dlink/dl2k.c b/drivers/net/ethernet/dlink/dl2k.c index 4fb756d219f7..2324f2ddfd48 100644 --- a/drivers/net/ethernet/dlink/dl2k.c +++ b/drivers/net/ethernet/dlink/dl2k.c @@ -227,7 +227,7 @@ rio_probe1 (struct pci_dev *pdev, const struct pci_device_id *ent) } dev->netdev_ops = &netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &ethtool_ops); + dev->ethtool_ops = &ethtool_ops; #if 0 dev->features = NETIF_F_IP_CSUM; #endif diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c index d9e5ca0d48c1..433c1e185442 100644 --- a/drivers/net/ethernet/dlink/sundance.c +++ b/drivers/net/ethernet/dlink/sundance.c @@ -577,7 +577,7 @@ static int sundance_probe1(struct pci_dev *pdev, /* The chip-specific entries in the device structure. 
*/ dev->netdev_ops = &netdev_ops; - SET_ETHTOOL_OPS(dev, &ethtool_ops); + dev->ethtool_ops = &ethtool_ops; dev->watchdog_timeo = TX_TIMEOUT; pci_set_drvdata(pdev, dev); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 3f04356afa82..dcc5e5c69743 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4304,7 +4304,7 @@ static void be_netdev_init(struct net_device *netdev) netdev->netdev_ops = &be_netdev_ops; - SET_ETHTOOL_OPS(netdev, &be_ethtool_ops); + netdev->ethtool_ops = &be_ethtool_ops; } static void be_unmap_pci_bars(struct be_adapter *adapter) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 68069eabc4f8..c77fa4a69844 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1210,7 +1210,7 @@ static int ftgmac100_probe(struct platform_device *pdev) SET_NETDEV_DEV(netdev, &pdev->dev); - SET_ETHTOOL_OPS(netdev, &ftgmac100_ethtool_ops); + netdev->ethtool_ops = &ftgmac100_ethtool_ops; netdev->netdev_ops = &ftgmac100_netdev_ops; netdev->features = NETIF_F_IP_CSUM | NETIF_F_GRO; diff --git a/drivers/net/ethernet/faraday/ftmac100.c b/drivers/net/ethernet/faraday/ftmac100.c index 8be5b40c0a12..4ff1adc6bfca 100644 --- a/drivers/net/ethernet/faraday/ftmac100.c +++ b/drivers/net/ethernet/faraday/ftmac100.c @@ -1085,7 +1085,7 @@ static int ftmac100_probe(struct platform_device *pdev) } SET_NETDEV_DEV(netdev, &pdev->dev); - SET_ETHTOOL_OPS(netdev, &ftmac100_ethtool_ops); + netdev->ethtool_ops = &ftmac100_ethtool_ops; netdev->netdev_ops = &ftmac100_netdev_ops; platform_set_drvdata(pdev, netdev); diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index 413329eff2ff..cc83350d56ba 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -417,5 +417,5 @@ static const struct ethtool_ops uec_ethtool_ops = { void uec_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &uec_ethtool_ops); + netdev->ethtool_ops = &uec_ethtool_ops; } diff --git a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c index 7becab1aa3e4..cfe7a7431730 100644 --- a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c +++ b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c @@ -256,7 +256,7 @@ static int fmvj18x_probe(struct pcmcia_device *link) dev->netdev_ops = &fjn_netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->ethtool_ops = &netdev_ethtool_ops; return fmvj18x_config(link); } /* fmvj18x_attach */ diff --git a/drivers/net/ethernet/ibm/ehea/ehea_ethtool.c b/drivers/net/ethernet/ibm/ehea/ehea_ethtool.c index 95837b99a464..6055e3eaf49c 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_ethtool.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_ethtool.c @@ -278,5 +278,5 @@ static const struct ethtool_ops ehea_ethtool_ops = { void ehea_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &ehea_ethtool_ops); + netdev->ethtool_ops = &ehea_ethtool_ops; } diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index ae342fdb42c8..87bd953cc2ee 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2879,7 +2879,7 @@ static int emac_probe(struct platform_device *ofdev) dev->commac.ops = &emac_commac_sg_ops; } else ndev->netdev_ops = 
&emac_netdev_ops; - SET_ETHTOOL_OPS(ndev, &emac_ethtool_ops); + ndev->ethtool_ops = &emac_ethtool_ops; netif_carrier_off(ndev); diff --git a/drivers/net/ethernet/icplus/ipg.c b/drivers/net/ethernet/icplus/ipg.c index 25045ae07171..5727779a7df2 100644 --- a/drivers/net/ethernet/icplus/ipg.c +++ b/drivers/net/ethernet/icplus/ipg.c @@ -2245,7 +2245,7 @@ static int ipg_probe(struct pci_dev *pdev, const struct pci_device_id *id) */ dev->netdev_ops = &ipg_netdev_ops; SET_NETDEV_DEV(dev, &pdev->dev); - SET_ETHTOOL_OPS(dev, &ipg_ethtool_ops); + dev->ethtool_ops = &ipg_ethtool_ops; rc = pci_request_regions(pdev, DRV_NAME); if (rc) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index b56461ce674c..9d979d7debef 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2854,7 +2854,7 @@ static int e100_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->hw_features |= NETIF_F_RXALL; netdev->netdev_ops = &e100_netdev_ops; - SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops); + netdev->ethtool_ops = &e100_ethtool_ops; netdev->watchdog_timeo = E100_WATCHDOG_PERIOD; strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1); diff --git a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c index 73a8aeefb92a..341889a4ef7f 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c +++ b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c @@ -1905,5 +1905,5 @@ static const struct ethtool_ops e1000_ethtool_ops = { void e1000_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &e1000_ethtool_ops); + netdev->ethtool_ops = &e1000_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c index 4e5ad7ebe1f2..e9a48bb5caac 100644 --- a/drivers/net/ethernet/intel/e1000e/ethtool.c +++ b/drivers/net/ethernet/intel/e1000e/ethtool.c @@ -2318,5 +2318,5 @@ static const struct ethtool_ops e1000_ethtool_ops = { void e1000e_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &e1000_ethtool_ops); + netdev->ethtool_ops = &e1000_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 0cf47c958081..f62929419a09 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1700,5 +1700,5 @@ static const struct ethtool_ops i40e_ethtool_ops = { void i40e_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &i40e_ethtool_ops); + netdev->ethtool_ops = &i40e_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c index a46be016039e..77e786d2d0e0 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c @@ -705,5 +705,5 @@ static struct ethtool_ops i40evf_ethtool_ops = { **/ void i40evf_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &i40evf_ethtool_ops); + netdev->ethtool_ops = &i40evf_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 333a2b0bbada..a84297c85fb1 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -3035,5 +3035,5 @@ static const struct ethtool_ops igb_ethtool_ops = { void igb_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, 
&igb_ethtool_ops); + netdev->ethtool_ops = &igb_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/igbvf/ethtool.c b/drivers/net/ethernet/intel/igbvf/ethtool.c index 90eef07943f4..f58170bae18b 100644 --- a/drivers/net/ethernet/intel/igbvf/ethtool.c +++ b/drivers/net/ethernet/intel/igbvf/ethtool.c @@ -476,5 +476,5 @@ static const struct ethtool_ops igbvf_ethtool_ops = { void igbvf_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &igbvf_ethtool_ops); + netdev->ethtool_ops = &igbvf_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c index dbb7dd2f8e36..1da2d987d370 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c @@ -656,5 +656,5 @@ static const struct ethtool_ops ixgb_ethtool_ops = { void ixgb_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &ixgb_ethtool_ops); + netdev->ethtool_ops = &ixgb_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 6c55c14d082a..31d7268401e7 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -3099,5 +3099,5 @@ static const struct ethtool_ops ixgbe_ethtool_ops = { void ixgbe_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &ixgbe_ethtool_ops); + netdev->ethtool_ops = &ixgbe_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c index 1baecb60f065..a757f0734719 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c +++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c @@ -813,5 +813,5 @@ static const struct ethtool_ops ixgbevf_ethtool_ops = { void ixgbevf_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &ixgbevf_ethtool_ops); + netdev->ethtool_ops = &ixgbevf_ethtool_ops; } diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index b7b8d74c22d9..df1d1b97187f 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2889,7 +2889,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev) if (err) goto out; - SET_ETHTOOL_OPS(dev, &mv643xx_eth_ethtool_ops); + dev->ethtool_ops = &mv643xx_eth_ethtool_ops; init_pscr(mp, pd->speed, pd->duplex); diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 14786c8bf99e..72bc47f2d2e9 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2813,7 +2813,7 @@ static int mvneta_probe(struct platform_device *pdev) dev->watchdog_timeo = 5 * HZ; dev->netdev_ops = &mvneta_netdev_ops; - SET_ETHTOOL_OPS(dev, &mvneta_eth_tool_ops); + dev->ethtool_ops = &mvneta_eth_tool_ops; pp = netdev_priv(dev); diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index b358c2f6f4bd..8f5aa7c62b18 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1488,7 +1488,7 @@ static int pxa168_eth_probe(struct platform_device *pdev) dev->netdev_ops = &pxa168_eth_netdev_ops; dev->watchdog_timeo = 2 * HZ; dev->base_addr = 0; - SET_ETHTOOL_OPS(dev, &pxa168_ethtool_ops); + dev->ethtool_ops = &pxa168_ethtool_ops; INIT_WORK(&pep->tx_timeout_task, pxa168_eth_tx_timeout_task); diff --git a/drivers/net/ethernet/marvell/sky2.c 
b/drivers/net/ethernet/marvell/sky2.c index b81106451a0a..69693384b58c 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4760,7 +4760,7 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port, SET_NETDEV_DEV(dev, &hw->pdev->dev); dev->irq = hw->pdev->irq; - SET_ETHTOOL_OPS(dev, &sky2_ethtool_ops); + dev->ethtool_ops = &sky2_ethtool_ops; dev->watchdog_timeo = TX_WATCHDOG; dev->netdev_ops = &sky2_netdev_ops[port]; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index fba3c8e77626..79c6f467d17e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2539,7 +2539,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, netif_set_real_num_tx_queues(dev, priv->tx_ring_num); netif_set_real_num_rx_queues(dev, priv->rx_ring_num); - SET_ETHTOOL_OPS(dev, &mlx4_en_ethtool_ops); + dev->ethtool_ops = &mlx4_en_ethtool_ops; /* * Set driver features diff --git a/drivers/net/ethernet/micrel/ks8695net.c b/drivers/net/ethernet/micrel/ks8695net.c index 16435b3cfa9f..6c7c78baedca 100644 --- a/drivers/net/ethernet/micrel/ks8695net.c +++ b/drivers/net/ethernet/micrel/ks8695net.c @@ -1504,15 +1504,15 @@ ks8695_probe(struct platform_device *pdev) if (ksp->phyiface_regs && ksp->link_irq == -1) { ks8695_init_switch(ksp); ksp->dtype = KS8695_DTYPE_LAN; - SET_ETHTOOL_OPS(ndev, &ks8695_ethtool_ops); + ndev->ethtool_ops = &ks8695_ethtool_ops; } else if (ksp->phyiface_regs && ksp->link_irq != -1) { ks8695_init_wan_phy(ksp); ksp->dtype = KS8695_DTYPE_WAN; - SET_ETHTOOL_OPS(ndev, &ks8695_wan_ethtool_ops); + ndev->ethtool_ops = &ks8695_wan_ethtool_ops; } else { /* No initialisation since HPNA does not have a PHY */ ksp->dtype = KS8695_DTYPE_HPNA; - SET_ETHTOOL_OPS(ndev, &ks8695_ethtool_ops); + ndev->ethtool_ops = &ks8695_ethtool_ops; } /* And bring up the net_device with the net core */ diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index e0c92e0e5e1d..13767eb36a48 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1471,7 +1471,7 @@ static int ks8851_probe(struct spi_device *spi) skb_queue_head_init(&ks->txq); - SET_ETHTOOL_OPS(ndev, &ks8851_ethtool_ops); + ndev->ethtool_ops = &ks8851_ethtool_ops; SET_NETDEV_DEV(ndev, &spi->dev); spi_set_drvdata(spi, ks); diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c index 14ac0e2bc09f..4b9592c1fb40 100644 --- a/drivers/net/ethernet/micrel/ksz884x.c +++ b/drivers/net/ethernet/micrel/ksz884x.c @@ -7106,7 +7106,7 @@ static int pcidev_init(struct pci_dev *pdev, const struct pci_device_id *id) } dev->netdev_ops = &netdev_ops; - SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->ethtool_ops = &netdev_ethtool_ops; if (register_netdev(dev)) goto pcidev_init_reg_err; port_set_power_saving(port, true); diff --git a/drivers/net/ethernet/microchip/enc28j60.c b/drivers/net/ethernet/microchip/enc28j60.c index c7b40aa21f22..b1b5f66b8b69 100644 --- a/drivers/net/ethernet/microchip/enc28j60.c +++ b/drivers/net/ethernet/microchip/enc28j60.c @@ -1593,7 +1593,7 @@ static int enc28j60_probe(struct spi_device *spi) dev->irq = spi->irq; dev->netdev_ops = &enc28j60_netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &enc28j60_ethtool_ops); + dev->ethtool_ops = &enc28j60_ethtool_ops; enc28j60_lowpower(priv, true); diff --git 
a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 130f6b204efa..f3d5d79f1cd1 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -4112,7 +4112,7 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent) setup_timer(&mgp->watchdog_timer, myri10ge_watchdog_timer, (unsigned long)mgp); - SET_ETHTOOL_OPS(netdev, &myri10ge_ethtool_ops); + netdev->ethtool_ops = &myri10ge_ethtool_ops; INIT_WORK(&mgp->watchdog_work, myri10ge_watchdog); status = register_netdev(netdev); if (status != 0) { diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c index 64ec2a437f46..291fba8b9f07 100644 --- a/drivers/net/ethernet/natsemi/natsemi.c +++ b/drivers/net/ethernet/natsemi/natsemi.c @@ -927,7 +927,7 @@ static int natsemi_probe1(struct pci_dev *pdev, const struct pci_device_id *ent) dev->netdev_ops = &natsemi_netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(dev, &ethtool_ops); + dev->ethtool_ops = &ethtool_ops; if (mtu) dev->mtu = mtu; diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index dbccf1de49ec..19bb8244b9e3 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -2030,7 +2030,7 @@ static int ns83820_init_one(struct pci_dev *pci_dev, pci_dev->subsystem_vendor, pci_dev->subsystem_device); ndev->netdev_ops = &netdev_ops; - SET_ETHTOOL_OPS(ndev, &ops); + ndev->ethtool_ops = &ops; ndev->watchdog_timeo = 5 * HZ; pci_set_drvdata(pci_dev, ndev); diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index e900c1abdef7..e3cf38e6ce3c 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -7910,7 +7910,7 @@ s2io_init_nic(struct pci_dev *pdev, const struct pci_device_id *pre) /* Driver entry points */ dev->netdev_ops = &s2io_netdev_ops; - SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->ethtool_ops = &netdev_ethtool_ops; dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_RXCSUM | NETIF_F_LRO; diff --git a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c index f8f073880f84..ddcc81ad1ae1 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c @@ -1128,5 +1128,5 @@ static const struct ethtool_ops vxge_ethtool_ops = { void vxge_initialize_ethtool_ops(struct net_device *ndev) { - SET_ETHTOOL_OPS(ndev, &vxge_ethtool_ops); + ndev->ethtool_ops = &vxge_ethtool_ops; } diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index fddb464aeab3..e8235c5c5e69 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -5766,7 +5766,7 @@ static int nv_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) dev->netdev_ops = &nv_netdev_ops_optimized; netif_napi_add(dev, &np->napi, nv_napi_poll, RX_WORK_PER_LOOP); - SET_ETHTOOL_OPS(dev, &ops); + dev->ethtool_ops = &ops; dev->watchdog_timeo = NV_WATCHDOG_TIMEO; pci_set_drvdata(pci_dev, dev); diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c index 826f0ccdc23c..114d2fe52cc2 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_ethtool.c @@ -508,5 
+508,5 @@ static const struct ethtool_ops pch_gbe_ethtool_ops = { void pch_gbe_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &pch_gbe_ethtool_ops); + netdev->ethtool_ops = &pch_gbe_ethtool_ops; } diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c index b6bdeb3c1971..9a997e4c3e08 100644 --- a/drivers/net/ethernet/packetengines/hamachi.c +++ b/drivers/net/ethernet/packetengines/hamachi.c @@ -724,10 +724,8 @@ static int hamachi_init_one(struct pci_dev *pdev, /* The Hamachi-specific entries in the device structure. */ dev->netdev_ops = &hamachi_netdev_ops; - if (chip_tbl[hmp->chip_id].flags & CanHaveMII) - SET_ETHTOOL_OPS(dev, &ethtool_ops); - else - SET_ETHTOOL_OPS(dev, &ethtool_ops_no_mii); + dev->ethtool_ops = (chip_tbl[hmp->chip_id].flags & CanHaveMII) ? + &ethtool_ops : &ethtool_ops_no_mii; dev->watchdog_timeo = TX_TIMEOUT; if (mtu) dev->mtu = mtu; diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c index 9a6cb482dcd0..69a8dc095072 100644 --- a/drivers/net/ethernet/packetengines/yellowfin.c +++ b/drivers/net/ethernet/packetengines/yellowfin.c @@ -472,7 +472,7 @@ static int yellowfin_init_one(struct pci_dev *pdev, /* The Yellowfin-specific entries in the device structure. */ dev->netdev_ops = &netdev_ops; - SET_ETHTOOL_OPS(dev, &ethtool_ops); + dev->ethtool_ops = &ethtool_ops; dev->watchdog_timeo = TX_TIMEOUT; if (mtu) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index f09c35d669b3..5bf05818a12c 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -1373,7 +1373,7 @@ netxen_setup_netdev(struct netxen_adapter *adapter, netxen_nic_change_mtu(netdev, netdev->mtu); - SET_ETHTOOL_OPS(netdev, &netxen_nic_ethtool_ops); + netdev->ethtool_ops = &netxen_nic_ethtool_ops; netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | NETIF_F_RXCSUM; diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index 2eabd44f8914..b5d6bc1a8b00 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -3838,7 +3838,7 @@ static int ql3xxx_probe(struct pci_dev *pdev, /* Set driver entry points */ ndev->netdev_ops = &ql3xxx_netdev_ops; - SET_ETHTOOL_OPS(ndev, &ql3xxx_ethtool_ops); + ndev->ethtool_ops = &ql3xxx_ethtool_ops; ndev->watchdog_timeo = 5 * HZ; netif_napi_add(ndev, &qdev->napi, ql_poll, 64); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 8a2aeb85e320..f0a285359e66 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -2265,10 +2265,8 @@ qlcnic_setup_netdev(struct qlcnic_adapter *adapter, struct net_device *netdev, qlcnic_change_mtu(netdev, netdev->mtu); - if (qlcnic_sriov_vf_check(adapter)) - SET_ETHTOOL_OPS(netdev, &qlcnic_sriov_vf_ethtool_ops); - else - SET_ETHTOOL_OPS(netdev, &qlcnic_ethtool_ops); + netdev->ethtool_ops = (qlcnic_sriov_vf_check(adapter)) ? 
+ &qlcnic_sriov_vf_ethtool_ops : &qlcnic_ethtool_ops; netdev->features |= (NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_IPV6_CSUM | NETIF_F_GRO | @@ -2682,7 +2680,7 @@ err_out_disable_pdev: err_out_maintenance_mode: set_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state); netdev->netdev_ops = &qlcnic_netdev_failed_ops; - SET_ETHTOOL_OPS(netdev, &qlcnic_ethtool_failed_ops); + netdev->ethtool_ops = &qlcnic_ethtool_failed_ops; ahw->port_type = QLCNIC_XGBE; if (qlcnic_83xx_check(adapter)) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 6e36fe14b848..b40050e03a56 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -4770,7 +4770,7 @@ static int qlge_probe(struct pci_dev *pdev, ndev->irq = pdev->irq; ndev->netdev_ops = &qlge_netdev_ops; - SET_ETHTOOL_OPS(ndev, &qlge_ethtool_ops); + ndev->ethtool_ops = &qlge_ethtool_ops; ndev->watchdog_timeo = 10 * HZ; err = register_netdev(ndev); diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index aa1c079f231d..be425ad5e824 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -7125,7 +7125,7 @@ rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) for (i = 0; i < ETH_ALEN; i++) dev->dev_addr[i] = RTL_R8(MAC0 + i); - SET_ETHTOOL_OPS(dev, &rtl8169_ethtool_ops); + dev->ethtool_ops = &rtl8169_ethtool_ops; dev->watchdog_timeo = RTL8169_TX_TIMEOUT; netif_napi_add(dev, &tp->napi, rtl8169_poll, R8169_NAPI_WEIGHT); diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 6a9509ccd33b..967314cade95 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2843,7 +2843,7 @@ static int sh_eth_drv_probe(struct platform_device *pdev) ndev->netdev_ops = &sh_eth_netdev_ops_tsu; else ndev->netdev_ops = &sh_eth_netdev_ops; - SET_ETHTOOL_OPS(ndev, &sh_eth_ethtool_ops); + ndev->ethtool_ops = &sh_eth_ethtool_ops; ndev->watchdog_timeo = TX_TIMEOUT; /* debug message level */ diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c index 0415fa50eeb7..c0981ae45874 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c @@ -520,5 +520,5 @@ static const struct ethtool_ops sxgbe_ethtool_ops = { void sxgbe_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &sxgbe_ethtool_ops); + netdev->ethtool_ops = &sxgbe_ethtool_ops; } diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 63d595fd3cc5..1e274045970f 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2248,7 +2248,7 @@ static int efx_register_netdev(struct efx_nic *efx) } else { net_dev->netdev_ops = &efx_farch_netdev_ops; } - SET_ETHTOOL_OPS(net_dev, &efx_ethtool_ops); + net_dev->ethtool_ops = &efx_ethtool_ops; net_dev->gso_max_segs = EFX_TSO_MAX_SEGS; rtnl_lock(); diff --git a/drivers/net/ethernet/sis/sis190.c b/drivers/net/ethernet/sis/sis190.c index acbbe48a519c..a86339903b9b 100644 --- a/drivers/net/ethernet/sis/sis190.c +++ b/drivers/net/ethernet/sis/sis190.c @@ -1877,7 +1877,7 @@ static int sis190_init_one(struct pci_dev *pdev, dev->netdev_ops = &sis190_netdev_ops; - SET_ETHTOOL_OPS(dev, &sis190_ethtool_ops); + dev->ethtool_ops = &sis190_ethtool_ops; dev->watchdog_timeo = SIS190_TX_TIMEOUT; spin_lock_init(&tp->lock); diff 
--git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c index c7a4868571f9..6b33127ab352 100644 --- a/drivers/net/ethernet/smsc/smc91c92_cs.c +++ b/drivers/net/ethernet/smsc/smc91c92_cs.c @@ -318,7 +318,7 @@ static int smc91c92_probe(struct pcmcia_device *link) /* The SMC91c92-specific entries in the device structure. */ dev->netdev_ops = &smc_netdev_ops; - SET_ETHTOOL_OPS(dev, &ethtool_ops); + dev->ethtool_ops = &ethtool_ops; dev->watchdog_timeo = TX_TIMEOUT; smc->mii_if.dev = dev; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index c5f9cb85c8ef..c963394ded6c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -784,5 +784,5 @@ static const struct ethtool_ops stmmac_ethtool_ops = { void stmmac_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &stmmac_ethtool_ops); + netdev->ethtool_ops = &stmmac_ethtool_ops; } diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 2ead87759ab4..38da73a2a886 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -2413,7 +2413,7 @@ static void bdx_set_ethtool_ops(struct net_device *netdev) .get_ethtool_stats = bdx_get_ethtool_stats, }; - SET_ETHTOOL_OPS(netdev, &bdx_ethtool_ops); + netdev->ethtool_ops = &bdx_ethtool_ops; } /** diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 91499be03c6f..e3d871055d63 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1998,7 +1998,7 @@ static int cpsw_probe_dual_emac(struct platform_device *pdev, ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; ndev->netdev_ops = &cpsw_netdev_ops; - SET_ETHTOOL_OPS(ndev, &cpsw_ethtool_ops); + ndev->ethtool_ops = &cpsw_ethtool_ops; netif_napi_add(ndev, &priv_sl2->napi, cpsw_poll, CPSW_POLL_WEIGHT); /* register the network device */ @@ -2227,7 +2227,7 @@ static int cpsw_probe(struct platform_device *pdev) ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; ndev->netdev_ops = &cpsw_netdev_ops; - SET_ETHTOOL_OPS(ndev, &cpsw_ethtool_ops); + ndev->ethtool_ops = &cpsw_ethtool_ops; netif_napi_add(ndev, &priv->napi, cpsw_poll, CPSW_POLL_WEIGHT); /* register the network device */ diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 8f0e69ce07ca..e76eae541151 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1980,7 +1980,7 @@ static int davinci_emac_probe(struct platform_device *pdev) } ndev->netdev_ops = &emac_netdev_ops; - SET_ETHTOOL_OPS(ndev, &ethtool_ops); + ndev->ethtool_ops = &ethtool_ops; netif_napi_add(ndev, &priv->napi, emac_poll, EMAC_POLL_WEIGHT); /* register the network device */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 1de3ef5dd5d2..2e967a7bdb33 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -810,7 +810,7 @@ static int netvsc_probe(struct hv_device *dev, net->features = NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_TSO; - SET_ETHTOOL_OPS(net, &ethtool_ops); + net->ethtool_ops = &ethtool_ops; SET_NETDEV_DEV(net, &dev->device); /* Notify the netvsc driver of the new device */ diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c index 63aa9d9e34c5..27536aa89199 100644 --- a/drivers/net/ntb_netdev.c +++ b/drivers/net/ntb_netdev.c @@ -348,7 
+348,7 @@ static int ntb_netdev_probe(struct pci_dev *pdev) memcpy(ndev->dev_addr, ndev->perm_addr, ndev->addr_len); ndev->netdev_ops = &ntb_netdev_ops; - SET_ETHTOOL_OPS(ndev, &ntb_ethtool_ops); + ndev->ethtool_ops = &ntb_ethtool_ops; dev->qp = ntb_transport_create_queue(ndev, pdev, &ntb_netdev_handlers); if (!dev->qp) { diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index a8497183ff8b..dac7a0d9bb46 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -494,7 +494,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) ndev->mtu = RIO_MAX_MSG_SIZE - 14; ndev->features = NETIF_F_LLTX; SET_NETDEV_DEV(ndev, &mport->dev); - SET_ETHTOOL_OPS(ndev, &rionet_ethtool_ops); + ndev->ethtool_ops = &rionet_ethtool_ops; spin_lock_init(&rnet->lock); spin_lock_init(&rnet->tx_lock); diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c index 630caf48f63a..8cfc3bb0c6a6 100644 --- a/drivers/net/usb/catc.c +++ b/drivers/net/usb/catc.c @@ -793,7 +793,7 @@ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id netdev->netdev_ops = &catc_netdev_ops; netdev->watchdog_timeo = TX_TIMEOUT; - SET_ETHTOOL_OPS(netdev, &ops); + netdev->ethtool_ops = &ops; catc->usbdev = usbdev; catc->netdev = netdev; diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 660bd5ea9fc0..a3a05869309d 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2425,7 +2425,7 @@ static void hso_net_init(struct net_device *net) net->type = ARPHRD_NONE; net->mtu = DEFAULT_MTU - 14; net->tx_queue_len = 10; - SET_ETHTOOL_OPS(net, &ops); + net->ethtool_ops = &ops; /* and initialize the semaphore */ spin_lock_init(&hso_net->net_lock); diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 421934c83f1c..f72570708edb 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -524,7 +524,7 @@ static int ipheth_probe(struct usb_interface *intf, usb_set_intfdata(intf, dev); SET_NETDEV_DEV(netdev, &intf->dev); - SET_ETHTOOL_OPS(netdev, &ops); + netdev->ethtool_ops = &ops; retval = register_netdev(netdev); if (retval) { diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c index a359d3bb7c5b..dcb6d33141e0 100644 --- a/drivers/net/usb/kaweth.c +++ b/drivers/net/usb/kaweth.c @@ -1171,7 +1171,7 @@ err_fw: netdev->netdev_ops = &kaweth_netdev_ops; netdev->watchdog_timeo = KAWETH_TX_TIMEOUT; netdev->mtu = le16_to_cpu(kaweth->configuration.segment_size); - SET_ETHTOOL_OPS(netdev, &ops); + netdev->ethtool_ops = &ops; /* kaweth is zeroed as part of alloc_netdev */ INIT_DELAYED_WORK(&kaweth->lowmem_work, kaweth_resubmit_tl); diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index 03e8a15d7deb..f84080215915 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -1159,7 +1159,7 @@ static int pegasus_probe(struct usb_interface *intf, net->watchdog_timeo = PEGASUS_TX_TIMEOUT; net->netdev_ops = &pegasus_netdev_ops; - SET_ETHTOOL_OPS(net, &ops); + net->ethtool_ops = &ops; pegasus->mii.dev = net; pegasus->mii.mdio_read = mdio_read; pegasus->mii.mdio_write = mdio_write; diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 3fbfb0869030..9f91c7aba4b0 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3452,7 +3452,7 @@ static int rtl8152_probe(struct usb_interface *intf, NETIF_F_TSO | NETIF_F_FRAGLIST | NETIF_F_IPV6_CSUM | NETIF_F_TSO6; - SET_ETHTOOL_OPS(netdev, &ops); + netdev->ethtool_ops = &ops; netif_set_gso_max_size(netdev, RTL_LIMITED_TSO_SIZE); tp->mii.dev = 
netdev; diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index da2c4583bd2d..6e87e5710048 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -878,7 +878,7 @@ static int rtl8150_probe(struct usb_interface *intf, dev->netdev = netdev; netdev->netdev_ops = &rtl8150_netdev_ops; netdev->watchdog_timeo = RTL8150_TX_TIMEOUT; - SET_ETHTOOL_OPS(netdev, &ops); + netdev->ethtool_ops = &ops; dev->intr_interval = 100; /* 100ms */ if (!alloc_all_urbs(dev)) { diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b852bb368acc..7d9f84a91f37 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1646,7 +1646,7 @@ static int virtnet_probe(struct virtio_device *vdev) dev->netdev_ops = &virtnet_netdev; dev->features = NETIF_F_HIGHDMA; - SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops); + dev->ethtool_ops = &virtnet_ethtool_ops; SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 600ab56c0008..00e120296e92 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -635,5 +635,5 @@ static const struct ethtool_ops vmxnet3_ethtool_ops = { void vmxnet3_set_ethtool_ops(struct net_device *netdev) { - SET_ETHTOOL_OPS(netdev, &vmxnet3_ethtool_ops); + netdev->ethtool_ops = &vmxnet3_ethtool_ops; } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1dfee9a7fbf7..e68c8eb4ea8e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2716,7 +2716,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, return -EEXIST; } - SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); + dev->ethtool_ops = &vxlan_ethtool_ops; /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { diff --git a/drivers/net/wireless/hostap/hostap_main.c b/drivers/net/wireless/hostap/hostap_main.c index 67db34e56d7e..52919ad42726 100644 --- a/drivers/net/wireless/hostap/hostap_main.c +++ b/drivers/net/wireless/hostap/hostap_main.c @@ -882,7 +882,7 @@ void hostap_setup_dev(struct net_device *dev, local_info_t *local, dev->mtu = local->mtu; - SET_ETHTOOL_OPS(dev, &prism2_ethtool_ops); + dev->ethtool_ops = &prism2_ethtool_ops; } diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index ef05c5c49d41..a7557331699f 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -386,7 +386,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6; dev->features = dev->hw_features | NETIF_F_RXCSUM; - SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops); + dev->ethtool_ops = &xenvif_ethtool_ops; dev->tx_queue_len = XENVIF_QUEUE_LENGTH; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 158b5e639fc7..895355de8ac4 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1332,7 +1332,7 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) */ netdev->features |= netdev->hw_features; - SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops); + netdev->ethtool_ops = &xennet_ethtool_ops; SET_NETDEV_DEV(netdev, &dev->dev); netif_set_gso_max_size(netdev, XEN_NETIF_MAX_TX_SIZE - MAX_TCP_HEADER); diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index e232ceca38fe..5ef5b4f45758 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ 
-969,10 +969,9 @@ static int qeth_l2_setup_netdev(struct qeth_card *card) card->dev->watchdog_timeo = QETH_TX_TIMEOUT; card->dev->mtu = card->info.initial_mtu; card->dev->netdev_ops = &qeth_l2_netdev_ops; - if (card->info.type != QETH_CARD_TYPE_OSN) - SET_ETHTOOL_OPS(card->dev, &qeth_l2_ethtool_ops); - else - SET_ETHTOOL_OPS(card->dev, &qeth_l2_osn_ops); + card->dev->ethtool_ops = + (card->info.type != QETH_CARD_TYPE_OSN) ? + &qeth_l2_ethtool_ops : &qeth_l2_osn_ops; card->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; card->info.broadcast_capable = 1; qeth_l2_request_initial_mac(card); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index bc2499a24884..c58f82af3658 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3301,7 +3301,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) card->dev->ml_priv = card; card->dev->watchdog_timeo = QETH_TX_TIMEOUT; card->dev->mtu = card->info.initial_mtu; - SET_ETHTOOL_OPS(card->dev, &qeth_l3_ethtool_ops); + card->dev->ethtool_ops = &qeth_l3_ethtool_ops; card->dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; diff --git a/drivers/staging/et131x/et131x.c b/drivers/staging/et131x/et131x.c index d329cf314360..15e0f4da3ce0 100644 --- a/drivers/staging/et131x/et131x.c +++ b/drivers/staging/et131x/et131x.c @@ -4604,7 +4604,7 @@ static int et131x_pci_setup(struct pci_dev *pdev, netdev->netdev_ops = &et131x_netdev_ops; SET_NETDEV_DEV(netdev, &pdev->dev); - SET_ETHTOOL_OPS(netdev, &et131x_ethtool_ops); + netdev->ethtool_ops = &et131x_ethtool_ops; adapter = et131x_adapter_init(netdev, pdev); diff --git a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c index d6421b9b5981..a6158bef58e5 100644 --- a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c +++ b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c @@ -2249,7 +2249,7 @@ struct net_device *init_ft1000_card(struct pcmcia_device *link, ft1000InitProc(dev); ft1000_card_present = 1; - SET_ETHTOOL_OPS(dev, &ops); + dev->ethtool_ops = &ops; printk(KERN_INFO "ft1000: %s: addr 0x%04lx irq %d, MAC addr %pM\n", dev->name, dev->base_addr, dev->irq, dev->dev_addr); return dev; diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c index c83e3375104b..9d957615e32a 100644 --- a/drivers/staging/netlogic/xlr_net.c +++ b/drivers/staging/netlogic/xlr_net.c @@ -1066,7 +1066,7 @@ static int xlr_net_probe(struct platform_device *pdev) xlr_set_rx_mode(ndev); priv->num_rx_desc += MAX_NUM_DESC_SPILL; - SET_ETHTOOL_OPS(ndev, &xlr_ethtool_ops); + ndev->ethtool_ops = &xlr_ethtool_ops; SET_NETDEV_DEV(ndev, &pdev->dev); /* Common registers, do one time initialization */ diff --git a/drivers/staging/octeon/ethernet.c b/drivers/staging/octeon/ethernet.c index ff7214aac9dd..da9dd6bc5660 100644 --- a/drivers/staging/octeon/ethernet.c +++ b/drivers/staging/octeon/ethernet.c @@ -469,7 +469,7 @@ int cvm_oct_common_init(struct net_device *dev) /* We do our own locking, Linux doesn't need to */ dev->features |= NETIF_F_LLTX; - SET_ETHTOOL_OPS(dev, &cvm_oct_ethtool_ops); + dev->ethtool_ops = &cvm_oct_ethtool_ops; cvm_oct_phy_setup_device(dev); cvm_oct_set_mac_filter(dev); diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c index b7d4f82872b7..ce8e28146162 100644 --- a/drivers/usb/gadget/u_ether.c +++ b/drivers/usb/gadget/u_ether.c @@ -793,7 +793,7 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g, net->netdev_ops = 
&eth_netdev_ops; - SET_ETHTOOL_OPS(net, &ops); + net->ethtool_ops = &ops; dev->gadget = g; SET_NETDEV_DEV(net, &g->dev); @@ -850,7 +850,7 @@ struct net_device *gether_setup_name_default(const char *netname) net->netdev_ops = &eth_netdev_ops; - SET_ETHTOOL_OPS(net, &ops); + net->ethtool_ops = &ops; SET_NETDEV_DEVTYPE(net, &gadget_type); return net; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index adc4658e9873..2dea98cbbdba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -56,9 +56,6 @@ struct device; struct phy_device; /* 802.11 specific */ struct wireless_dev; - /* source back-compat hooks */ -#define SET_ETHTOOL_OPS(netdev,ops) \ - ( (netdev)->ethtool_ops = (ops) ) void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 744a59b85e15..e7ee65dc20bf 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -884,7 +884,7 @@ static void batadv_softif_init_early(struct net_device *dev) /* generate random address */ eth_hw_addr_random(dev); - SET_ETHTOOL_OPS(dev, &batadv_ethtool_ops); + dev->ethtool_ops = &batadv_ethtool_ops; memset(priv, 0, sizeof(*priv)); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 3e2da2cb72db..9212015abc8f 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -348,7 +348,7 @@ void br_dev_setup(struct net_device *dev) dev->netdev_ops = &br_netdev_ops; dev->destructor = br_dev_free; - SET_ETHTOOL_OPS(dev, &br_ethtool_ops); + dev->ethtool_ops = &br_ethtool_ops; SET_NETDEV_DEVTYPE(dev, &br_type); dev->tx_queue_len = 0; dev->priv_flags = IFF_EBRIDGE; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 02c0e1716f64..64c5af0a10dd 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -346,7 +346,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, return slave_dev; slave_dev->features = master->vlan_features; - SET_ETHTOOL_OPS(slave_dev, &dsa_slave_ethtool_ops); + slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 729c68763fe7..789af9280e77 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -130,7 +130,7 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev->destructor = internal_dev_destructor; - SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops); + netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->tx_queue_len = 0; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | -- cgit v1.2.3-71-gd317 From 50a0ffaf75e9d2d97200b523f2c600f40e9756b1 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Sun, 11 May 2014 10:47:15 +0200 Subject: net: cdc_ncm/cdc_mbim: rework probing of NCM/MBIM functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NCM class match in the cdc_mbim driver is confusing and causes unexpected behaviour. The USB core guarantees that a USB interface is in altsetting 0 when probing starts. This means that devices implementing an NCM 1.0 backwards compatible MBIM function (an "NCM/MBIM function") always hit the NCM entry in the cdc_mbim driver match table. Such functions will never match any of the MBIM entries.
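To make the ordering problem concrete, here is a minimal sketch of such a match table. The two entries are illustrative assumptions, not the driver's actual table, but they show why an NCM/MBIM function can only ever hit the first one: usb_match_id() tests the descriptors of the current altsetting, and probing always starts in altsetting 0, the NCM identity.

static const struct usb_device_id hypothetical_ids[] = {
	/* always matched first for an NCM/MBIM function, because probing
	 * starts with cur_altsetting == 0, i.e. the NCM 1.0 identity */
	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM,
			     USB_CDC_PROTO_NONE) },
	/* unreachable for such functions until the interface is switched
	 * to its MBIM altsetting */
	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM,
			     USB_CDC_PROTO_NONE) },
	{ }
};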
This mismatch causes unexpected behaviour in cases where the NCM and MBIM entries are different, which is currently the case for all except Ericsson devices. Improve the probing of NCM/MBIM functions by looking up the device again in the cdc_mbim match table after switching to the MBIM identity. The shared altsetting selection is updated to better accommodate the new probing logic, returning the preferred altsetting for the control interface instead of the data interface. The control interface altsetting update is moved to the cdc_mbim driver. It is never necessary to change the control interface altsetting for NCM. Cc: Greg Suarez Reported-by: Yu-an Shih Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_mbim.c | 43 +++++++++++++++++++++++++++++++++++++++++-- drivers/net/usb/cdc_ncm.c | 27 +++++++++++++-------------- include/linux/usb/cdc_ncm.h | 2 +- 3 files changed, 55 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 80d27719ba38..bc23273d0455 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -107,15 +107,54 @@ static const struct net_device_ops cdc_mbim_netdev_ops = { .ndo_vlan_rx_kill_vid = cdc_mbim_rx_kill_vid, }; +/* Change the control interface altsetting and update the .driver_info + * pointer if the matching entry after changing class codes points to + * a different struct + */ +static int cdc_mbim_set_ctrlalt(struct usbnet *dev, struct usb_interface *intf, u8 alt) +{ + struct usb_driver *driver = to_usb_driver(intf->dev.driver); + const struct usb_device_id *id; + struct driver_info *info; + int ret; + + ret = usb_set_interface(dev->udev, + intf->cur_altsetting->desc.bInterfaceNumber, + alt); + if (ret) + return ret; + + id = usb_match_id(intf, driver->id_table); + if (!id) + return -ENODEV; + + info = (struct driver_info *)id->driver_info; + if (info != dev->driver_info) { + dev_dbg(&intf->dev, "driver_info updated to '%s'\n", + info->description); + dev->driver_info = info; + } + return 0; +} + static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_ncm_ctx *ctx; struct usb_driver *subdriver = ERR_PTR(-ENODEV); int ret = -ENODEV; - u8 data_altsetting = cdc_ncm_select_altsetting(dev, intf); + u8 data_altsetting = 1; struct cdc_mbim_state *info = (void *)&dev->data; - /* Probably NCM, defer for cdc_ncm_bind */ + /* should we change control altsetting on a NCM/MBIM function?
*/ + if (cdc_ncm_select_altsetting(intf) == CDC_NCM_COMM_ALTSETTING_MBIM) { + data_altsetting = CDC_NCM_DATA_ALTSETTING_MBIM; + ret = cdc_mbim_set_ctrlalt(dev, intf, CDC_NCM_COMM_ALTSETTING_MBIM); + if (ret) + goto err; + ret = -ENODEV; + } + + /* we will hit this for NCM/MBIM functions if prefer_mbim is false */ if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) goto err; diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 9a2bd11943eb..d23bca57a23f 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -541,10 +541,10 @@ void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf) } EXPORT_SYMBOL_GPL(cdc_ncm_unbind); -/* Select the MBIM altsetting iff it is preferred and available, - * returning the number of the corresponding data interface altsetting +/* Return the number of the MBIM control interface altsetting iff it + * is preferred and available, */ -u8 cdc_ncm_select_altsetting(struct usbnet *dev, struct usb_interface *intf) +u8 cdc_ncm_select_altsetting(struct usb_interface *intf) { struct usb_host_interface *alt; @@ -563,15 +563,15 @@ u8 cdc_ncm_select_altsetting(struct usbnet *dev, struct usb_interface *intf) * the rules given in section 6 (USB Device Model) of this * specification." */ - if (prefer_mbim && intf->num_altsetting == 2) { + if (intf->num_altsetting < 2) + return intf->cur_altsetting->desc.bAlternateSetting; + + if (prefer_mbim) { alt = usb_altnum_to_altsetting(intf, CDC_NCM_COMM_ALTSETTING_MBIM); - if (alt && cdc_ncm_comm_intf_is_mbim(alt) && - !usb_set_interface(dev->udev, - intf->cur_altsetting->desc.bInterfaceNumber, - CDC_NCM_COMM_ALTSETTING_MBIM)) - return CDC_NCM_DATA_ALTSETTING_MBIM; + if (alt && cdc_ncm_comm_intf_is_mbim(alt)) + return CDC_NCM_COMM_ALTSETTING_MBIM; } - return CDC_NCM_DATA_ALTSETTING_NCM; + return CDC_NCM_COMM_ALTSETTING_NCM; } EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); @@ -580,12 +580,11 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) int ret; /* MBIM backwards compatible function? */ - cdc_ncm_select_altsetting(dev, intf); - if (cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) + if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; - /* NCM data altsetting is always 1 */ - ret = cdc_ncm_bind_common(dev, intf, 1); + /* The NCM data altsetting is fixed */ + ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM); /* * We should get an event when network connection is "connected" or diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 44b38b92236a..55b6feead93b 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -121,7 +121,7 @@ struct cdc_ncm_ctx { u16 connected; }; -u8 cdc_ncm_select_altsetting(struct usbnet *dev, struct usb_interface *intf); +u8 cdc_ncm_select_altsetting(struct usb_interface *intf); int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting); void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf); struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign); -- cgit v1.2.3-71-gd317 From 89278c9dc922272df921042aafa18311f3398c6c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Sun, 11 May 2014 20:22:10 -0700 Subject: tcp: simplify fast open cookie processing Consolidate various cookie checking and generation code to simplify the fast open processing. The main goal is to reduce code duplication in tcp_v4_conn_request() for IPv6 support. 
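All of the cookie state now travels in the single foc argument, with the length field encoding which case applies. A hypothetical helper, not part of the patch, spells out the convention the consolidated code relies on:

/* illustrative only: the foc->len convention assumed by the rework */
static inline const char *tfo_cookie_state(const struct tcp_fastopen_cookie *foc)
{
	if (foc->len == -1)
		return "no Fast Open option in the SYN";
	if (foc->len == 0)
		return "client is requesting a cookie";
	return "client sent a cookie to validate";	/* foc->len > 0 */
}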
Removes two experimental sysctl flags TFO_SERVER_ALWAYS and TFO_SERVER_COOKIE_NOT_CHKD used primarily for developmental debugging purposes. Signed-off-by: Yuchung Cheng Signed-off-by: Daniel Lee Signed-off-by: Jerry Chu Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ---- include/net/tcp.h | 9 +------ net/ipv4/tcp_fastopen.c | 71 +++++++++++++++++++------------------------------ net/ipv4/tcp_ipv4.c | 10 +++---- net/ipv4/tcp_output.c | 2 +- 5 files changed, 33 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e37c71ecd74..bc35e4709e8e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -366,11 +366,6 @@ static inline bool tcp_passive_fastopen(const struct sock *sk) tcp_sk(sk)->fastopen_rsk != NULL); } -static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc) -{ - return foc->len != -1; -} - extern void tcp_sock_destruct(struct sock *sk); static inline int fastopen_init_queue(struct sock *sk, int backlog) diff --git a/include/net/tcp.h b/include/net/tcp.h index 012236838583..17d7c6a3d037 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -220,8 +220,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ -/* Process SYN data but skip cookie validation */ -#define TFO_SERVER_COOKIE_NOT_CHKED 0x100 /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 @@ -230,10 +228,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TFO_SERVER_WO_SOCKOPT1 0x400 #define TFO_SERVER_WO_SOCKOPT2 0x800 -/* Always create TFO child sockets on a TFO listener even when - * cookie/data not present. (For testing purpose!) - */ -#define TFO_SERVER_ALWAYS 0x1000 extern struct inet_timewait_death_row tcp_death_row; @@ -1335,8 +1329,7 @@ int tcp_fastopen_create_child(struct sock *sk, struct request_sock *req); bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct tcp_fastopen_cookie *valid_foc); + struct tcp_fastopen_cookie *foc); void tcp_fastopen_init_key_once(bool publish); #define TCP_FASTOPEN_KEY_LENGTH 16 diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0606c91d9d0b..5a98277b9a82 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -228,59 +228,44 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +/* Returns true if we should perform Fast Open on the SYN. The cookie (foc) + * may be updated and return the client in the SYN-ACK later. E.g., Fast Open + * cookie request (foc->len == 0). + */ bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct tcp_fastopen_cookie *valid_foc) + struct tcp_fastopen_cookie *foc) { - bool skip_cookie = false; - - if (likely(!fastopen_cookie_present(foc))) { - /* See include/net/tcp.h for the meaning of these knobs */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || - ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) - skip_cookie = true; /* no cookie to validate */ - else - return false; - } - /* A FO option is present; bump the counter. 
*/ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; + bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; - if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || - !tcp_fastopen_queue_check(sk)) + if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + (syn_data || foc->len >= 0) && + tcp_fastopen_queue_check(sk))) { + foc->len = -1; return false; - - if (skip_cookie) { - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; } - if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { - if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || - memcmp(&foc->val[0], &valid_foc->val[0], - TCP_FASTOPEN_COOKIE_SIZE) != 0) - return false; - valid_foc->len = -1; - } - /* Acknowledge the data received from the peer. */ + if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + goto fastopen; + + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &valid_foc); + + if (foc->len == TCP_FASTOPEN_COOKIE_SIZE && + foc->len == valid_foc.len && + !memcmp(foc->val, valid_foc.val, foc->len)) { +fastopen: tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + foc->len = -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); return true; - } else if (foc->len == 0) { /* Client requesting a cookie */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENCOOKIEREQD); - } else { - /* Client sent a cookie with wrong size. Treat it - * the same as invalid and return a valid one. - */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); } + + NET_INC_STATS_BH(sock_net(sk), foc->len ? + LINUX_MIB_TCPFASTOPENPASSIVEFAIL : + LINUX_MIB_TCPFASTOPENCOOKIEREQD); + *foc = valid_foc; return false; } EXPORT_SYMBOL(tcp_fastopen_check); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 032fcaee164a..5ea0949dadfd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1273,7 +1273,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) bool want_cookie = false; struct flowi4 fl4; struct tcp_fastopen_cookie foc = { .len = -1 }; - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sk_buff *skb_synack; int do_fastopen; @@ -1381,7 +1380,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (dst == NULL) goto drop_and_free; } - do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); + do_fastopen = !want_cookie && + tcp_fastopen_check(sk, skb, req, &foc); /* We don't call tcp_v4_send_synack() directly because we need * to make sure a child socket can be created successfully before @@ -1394,8 +1394,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * latter to remove its dependency on the current implementation * of tcp_v4_send_synack()->tcp_select_initial_window(). */ - skb_synack = tcp_make_synack(sk, dst, req, - fastopen_cookie_present(&valid_foc) ? 
&valid_foc : NULL); + skb_synack = tcp_make_synack(sk, dst, req, &foc); if (skb_synack) { __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -1415,9 +1414,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->listener = NULL; /* Add the request_sock to the SYN table */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - if (fastopen_cookie_present(&foc) && foc.len != 0) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (tcp_fastopen_create_child(sk, skb, skb_synack, req)) goto drop_and_release; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 694711a140d4..b20fc02920f9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - if (foc != NULL) { + if (foc != NULL && foc->len >= 0) { u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { -- cgit v1.2.3-71-gd317 From cea092c9488cbb22c8b70336ab1413e0daf350f0 Mon Sep 17 00:00:00 2001 From: Brian W Hart Date: Wed, 14 May 2014 10:33:45 +0930 Subject: cpumask.h: silence warning with -Wsign-compare MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Silence the warning when building with -Wsign-compare when cpumask.h is included: include/linux/cpumask.h: In function ‘cpumask_parse’: include/linux/cpumask.h:603:26: warning: signed and unsigned type in conditional expression [-Wsign-compare] int len = nl ? nl - buf : strlen(buf); ^ V2: Rusty pointed out that unsigned should be used instead. Signed-off-by: Brian W Hart Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d08e4d2a9b92..3557ea7b2049 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -600,7 +600,7 @@ static inline int cpulist_scnprintf(char *buf, int len, static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { char *nl = strchr(buf, '\n'); - int len = nl ? nl - buf : strlen(buf); + unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } -- cgit v1.2.3-71-gd317 From 08e53fbdb85c0f6f45c0f7c1ea3defc1752a95ce Mon Sep 17 00:00:00 2001 From: Amos Kong Date: Wed, 14 May 2014 10:33:46 +0930 Subject: virtio-rng: support multiple virtio-rng devices The current hwrng core supports registering multiple hwrng devices, but only one device is actually in use at a time. QEMU also supports multiple virtio-rng backends. This patch changes the virtio-rng driver to support multiple virtio-rng devices.
]# cat /sys/class/misc/hw_random/rng_available virtio_rng.0 virtio_rng.1 ]# cat /sys/class/misc/hw_random/rng_current virtio_rng.0 ]# echo -n virtio_rng.1 > /sys/class/misc/hw_random/rng_current ]# dd if=/dev/hwrng of=/dev/null Signed-off-by: Amos Kong Signed-off-by: Rusty Russell --- drivers/char/hw_random/virtio-rng.c | 102 ++++++++++++++++++++++-------------- include/linux/hw_random.h | 2 +- 2 files changed, 64 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2ce0e225e58c..12e242bbb0f5 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -25,88 +25,108 @@ #include #include -static struct virtqueue *vq; -static unsigned int data_avail; -static DECLARE_COMPLETION(have_data); -static bool busy; + +struct virtrng_info { + struct virtio_device *vdev; + struct hwrng hwrng; + struct virtqueue *vq; + unsigned int data_avail; + struct completion have_data; + bool busy; +}; static void random_recv_done(struct virtqueue *vq) { + struct virtrng_info *vi = vq->vdev->priv; + /* We can get spurious callbacks, e.g. shared IRQs + virtio_pci. */ - if (!virtqueue_get_buf(vq, &data_avail)) + if (!virtqueue_get_buf(vi->vq, &vi->data_avail)) return; - complete(&have_data); + complete(&vi->have_data); } /* The host will fill any buffer we give it with sweet, sweet randomness. */ -static void register_buffer(u8 *buf, size_t size) +static void register_buffer(struct virtrng_info *vi, u8 *buf, size_t size) { struct scatterlist sg; sg_init_one(&sg, buf, size); /* There should always be room for one buffer. */ - virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL); + virtqueue_add_inbuf(vi->vq, &sg, 1, buf, GFP_KERNEL); - virtqueue_kick(vq); + virtqueue_kick(vi->vq); } static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) { int ret; + struct virtrng_info *vi = (struct virtrng_info *)rng->priv; - if (!busy) { - busy = true; - init_completion(&have_data); - register_buffer(buf, size); + if (!vi->busy) { + vi->busy = true; + init_completion(&vi->have_data); + register_buffer(vi, buf, size); } if (!wait) return 0; - ret = wait_for_completion_killable(&have_data); + ret = wait_for_completion_killable(&vi->have_data); if (ret < 0) return ret; - busy = false; + vi->busy = false; - return data_avail; + return vi->data_avail; } static void virtio_cleanup(struct hwrng *rng) { - if (busy) - wait_for_completion(&have_data); -} - + struct virtrng_info *vi = (struct virtrng_info *)rng->priv; -static struct hwrng virtio_hwrng = { - .name = "virtio", - .cleanup = virtio_cleanup, - .read = virtio_read, -}; + if (vi->busy) + wait_for_completion(&vi->have_data); +} static int probe_common(struct virtio_device *vdev) { - int err; + int err, i; + struct virtrng_info *vi = NULL; + + vi = kmalloc(sizeof(struct virtrng_info), GFP_KERNEL); + vi->hwrng.name = kmalloc(40, GFP_KERNEL); + init_completion(&vi->have_data); + + vi->hwrng.read = virtio_read; + vi->hwrng.cleanup = virtio_cleanup; + vi->hwrng.priv = (unsigned long)vi; + vdev->priv = vi; - if (vq) { - /* We only support one device for now */ - return -EBUSY; - } /* We expect a single virtqueue. 
*/ - vq = virtio_find_single_vq(vdev, random_recv_done, "input"); - if (IS_ERR(vq)) { - err = PTR_ERR(vq); - vq = NULL; + vi->vq = virtio_find_single_vq(vdev, random_recv_done, "input"); + if (IS_ERR(vi->vq)) { + err = PTR_ERR(vi->vq); + kfree(vi->hwrng.name); + vi->vq = NULL; + kfree(vi); + vi = NULL; return err; } - err = hwrng_register(&virtio_hwrng); + i = 0; + do { + sprintf(vi->hwrng.name, "virtio_rng.%d", i++); + err = hwrng_register(&vi->hwrng); + } while (err == -EEXIST); + if (err) { vdev->config->del_vqs(vdev); - vq = NULL; + kfree(vi->hwrng.name); + vi->vq = NULL; + kfree(vi); + vi = NULL; return err; } @@ -115,11 +135,15 @@ static int probe_common(struct virtio_device *vdev) static void remove_common(struct virtio_device *vdev) { + struct virtrng_info *vi = vdev->priv; vdev->config->reset(vdev); - busy = false; - hwrng_unregister(&virtio_hwrng); + vi->busy = false; + hwrng_unregister(&vi->hwrng); vdev->config->del_vqs(vdev); - vq = NULL; + kfree(vi->hwrng.name); + vi->vq = NULL; + kfree(vi); + vi = NULL; } static int virtrng_probe(struct virtio_device *vdev) diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index b4b0eef5fddf..02d9c87be54c 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -31,7 +31,7 @@ * @priv: Private data, for use by the RNG driver. */ struct hwrng { - const char *name; + char *name; int (*init)(struct hwrng *rng); void (*cleanup)(struct hwrng *rng); int (*data_present)(struct hwrng *rng, int wait); -- cgit v1.2.3-71-gd317 From b02ef20a9fba08948e643d3eec0efadf1da01a44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 May 2014 18:24:45 +0200 Subject: uprobes/x86: Fix the wrong ->si_addr when xol triggers a trap If the probed insn triggers a trap, ->si_addr = regs->ip is technically correct, but this is not what the signal handler wants; we need to pass the address of the probed insn, not the address of the xol slot. Add the new arch-agnostic helper, uprobe_get_trap_addr(), and change fill_trap_info() and math_error() to use it. !CONFIG_UPROBES case in uprobes.h uses a macro to avoid include hell and ensure that it can be compiled even if an architecture doesn't define instruction_pointer(). Test-case: #include <stdio.h> #include <signal.h> #include <unistd.h> extern void probe_div(void); void sigh(int sig, siginfo_t *info, void *c) { int passed = (info->si_addr == probe_div); printf(passed ? "PASS\n" : "FAIL\n"); _exit(!passed); } int main(void) { struct sigaction sa = { .sa_sigaction = sigh, .sa_flags = SA_SIGINFO, }; sigaction(SIGFPE, &sa, NULL); asm ( "xor %ecx,%ecx\n" ".globl probe_div; probe_div:\n" "idiv %ecx\n" ); return 0; } it fails if probe_div() is probed. Note: show_unhandled_signals users should probably use this helper too, but we need to clean them up first.
Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu --- arch/x86/kernel/traps.c | 7 ++++--- include/linux/uprobes.h | 4 ++++ kernel/events/uprobes.c | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 73b3ea32245a..3fdb20548c4b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@ #include #include #include +#include <linux/uprobes.h> #include #include #include @@ -148,11 +149,11 @@ static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, case X86_TRAP_DE: sicode = FPE_INTDIV; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_UD: sicode = ILL_ILLOPN; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_AC: sicode = BUS_ADRALN; @@ -531,7 +532,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_addr = (void __user *)regs->ip; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b97b864..88c3b7e8b384 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,7 @@ extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, u extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); @@ -130,6 +131,9 @@ extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *r #else /* !CONFIG_UPROBES */ struct uprobes_state { }; + +#define uprobe_get_trap_addr(regs) instruction_pointer(regs) + static inline int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a13251e8bfa4..3b02c72938a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,6 +1351,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (unlikely(utask && utask->active_uprobe)) + return utask->vaddr; + + return instruction_pointer(regs); +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. -- cgit v1.2.3-71-gd317 From 9d800df12d31734a6853915e9d2deb5d6747985f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: rename cgroup->dummy_css to ->self and move it to the top cgroup->dummy_css is used as the placeholder css when performing css oriented operations on the cgroup. We're gonna shift more cgroup management to this css. Let's rename it to ->self and move it to the top. This is a pure rename and field relocation.
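A sketch of a typical caller after the rename; this is an assumed usage example mirroring the pattern in the hunks below, not code from the patch:

/* walk the tasks attached directly to @cgrp via its self css */
static void hypothetical_walk(struct cgroup *cgrp)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cgrp->self, &it);
	while ((task = css_task_iter_next(&it))) {
		/* inspect or account each task here */
	}
	css_task_iter_end(&it);
}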
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 +++--- kernel/cgroup.c | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index aa7353deaaf3..164851e388e7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,6 +143,9 @@ enum { }; struct cgroup { + /* self css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state self; + unsigned long flags; /* "unsigned long" so bitops work */ /* @@ -224,9 +227,6 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* dummy css with NULL ->ss, points back to this cgroup */ - struct cgroup_subsys_state dummy_css; - /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f36fd9c15b3a..b57a949ae4bc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -220,7 +220,7 @@ static void cgroup_idr_remove(struct idr *idr, int id) /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -235,13 +235,13 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else - return &cgrp->dummy_css; + return &cgrp->self; } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effctive css, which is defined * as the matching css of the nearest ancestor including self which has @ss @@ -254,7 +254,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); if (!ss) - return &cgrp->dummy_css; + return &cgrp->self; if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; @@ -288,7 +288,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) if (cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else - return &cgrp->dummy_css; + return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); @@ -1551,7 +1551,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); - cgrp->dummy_css.cgroup = cgrp; + cgrp->self.cgroup = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3454,7 +3454,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. 
*/ do { - css_task_iter_start(&from->dummy_css, &it); + css_task_iter_start(&from->self, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -3719,7 +3719,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -3793,7 +3793,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -4274,7 +4274,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); cgrp->parent = parent; - cgrp->dummy_css.parent = &parent->dummy_css; + cgrp->self.parent = &parent->self; cgrp->root = root; if (notify_on_release(parent)) -- cgit v1.2.3-71-gd317 From 249f3468a282dcbad53484c821bebb447f14ee03 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: remove cgroup_destroy_css_killed() cgroup_destroy_css_killed() is the cgroup destruction stage which happens after all csses are offlined. After the recent updates, it no longer does anything other than putting the base reference. This patch removes the function and makes cgroup_destroy_locked() put the base ref at the end instead. This also makes cgroup->nr_css unnecessary, so it is removed as well. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 62 +++++--------------------------------------- 2 files changed, 6 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 164851e388e7..160fcc69149e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -158,9 +158,6 @@ struct cgroup { */ int id; - /* the number of attached css's */ - int nr_css; - /* * If this cgroup contains any tasks, it contributes one to * populated_cnt. All children with non-zero popuplated_cnt of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9aa2a51ca68..4a94b0be598d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -178,7 +178,6 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); -static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); static void kill_css(struct cgroup_subsys_state *css); @@ -4169,7 +4168,6 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; - css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; @@ -4189,7 +4187,6 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; - css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); @@ -4374,39 +4371,18 @@ out_destroy: /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget_online() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css().
*/ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup *cgrp = css->cgroup; mutex_lock(&cgroup_mutex); - - /* - * css_tryget_online() is guaranteed to fail now. Tell subsystems - * to initate destruction. - */ offline_css(css); - - /* - * If @cgrp is marked dead, it's waiting for refs of all css's to - * be disabled before proceeding to the second phase of cgroup - * destruction. If we are the last one, kick it off. - */ - if (!cgrp->nr_css && cgroup_is_dead(cgrp)) - cgroup_destroy_css_killed(cgrp); - mutex_unlock(&cgroup_mutex); - /* - * Put the css refs from kill_css(). Each css holds an extra - * reference to the cgroup's dentry and cgroup removal proceeds - * regardless of css refs. On the last put of each css, whenever - * that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ css_put(css); } @@ -4518,11 +4494,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ set_bit(CGRP_DEAD, &cgrp->flags); - /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. - */ + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); @@ -4532,15 +4504,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - /* - * If @cgrp has css's attached, the second stage of cgroup - * destruction is kicked off from css_killed_work_fn() after the - * refs of all attached css's are killed. If @cgrp doesn't have - * any css, we kick it off here. - */ - if (!cgrp->nr_css) - cgroup_destroy_css_killed(cgrp); - /* * Remove @cgrp directory along with the base files. @cgrp has an * extra ref on its kn. @@ -4550,25 +4513,12 @@ set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); check_for_release(cgrp->parent); + /* put the base reference */ + cgroup_put(cgrp); + return 0; }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @cgrp: the cgroup whose csses have just finished offlining - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction. This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) -{ - lockdep_assert_held(&cgroup_mutex); - - cgroup_put(cgrp); -} - static int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; -- cgit v1.2.3-71-gd317 From 9395a4500404e05173eda9a2d198b6fa500e90c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: enable refcnting for root csses Currently, css_get(), css_tryget() and css_tryget_online() are noops for root csses as an optimization; however, we're planning to use css refcnts to track cgroup lifetime too, and root cgroups also need to be reference counted. Since css has been converted to percpu_refcnt, the overhead of refcnting is minuscule and this optimization isn't too meaningful anymore. Furthermore, controllers which optimize the root cgroup often never even invoke these functions in their hot paths. This patch enables refcnting for root csses too. This makes the CSS_ROOT flag unused and removes it.
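With the special-casing gone, a controller takes and drops real percpu references regardless of where the css sits in the hierarchy. A hypothetical caller, with the memory controller picked arbitrarily for illustration:

static void hypothetical_css_user(void)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = task_css(current, memory_cgrp_id);	/* may well be a root css */
	if (css_tryget_online(css)) {
		/* after this patch the reference is real even for root csses */
		css_put(css);
	}
	rcu_read_unlock();
}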
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 10 ++-------- kernel/cgroup.c | 6 +++--- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 160fcc69149e..286e39e4e9bf 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -77,7 +77,6 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { - CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ }; @@ -89,9 +88,7 @@ enum { */ static inline void css_get(struct cgroup_subsys_state *css) { - /* We don't need to reference count the root state */ - if (!(css->flags & CSS_ROOT)) - percpu_ref_get(&css->refcnt); + percpu_ref_get(&css->refcnt); } /** @@ -106,8 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css) */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { - if (css->flags & CSS_ROOT) - return true; return percpu_ref_tryget_live(&css->refcnt); } @@ -119,8 +114,7 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css) */ static inline void css_put(struct cgroup_subsys_state *css) { - if (!(css->flags & CSS_ROOT)) - percpu_ref_put(&css->refcnt); + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e694f4153edb..cb5864e36f99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,8 +4158,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, if (cgrp->parent) { css->parent = cgroup_css(cgrp->parent, ss); css_get(css->parent); - } else { - css->flags |= CSS_ROOT; } BUG_ON(cgroup_css(cgrp, ss)); @@ -4582,9 +4580,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); if (early) { - /* idr_alloc() can't be called safely during early init */ + /* allocation can't be done safely during early init */ css->id = 1; } else { + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4671,6 +4670,7 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit v1.2.3-71-gd317 From 9d755d33f0db8c9b49438f71b38a56e375b34360 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: use cgroup->self.refcnt for cgroup refcnting Currently cgroup implements refcnting separately using atomic_t cgroup->refcnt. The destruction paths of cgroup and css are rather complex and bear a lot of similarities including the use of RCU and bouncing to a work item. This patch makes cgroup use the refcnt of its self css for refcnting instead of using its own. This makes cgroup refcnting use css's percpu refcnt and share the destruction mechanism. * css_release_work_fn() and css_free_work_fn() are updated to handle both csses and cgroups. This is a bit messy but should do until we can make cgroup->self a full css, which currently can't be done thanks to multiple hierarchies. * cgroup_destroy_locked() now performs percpu_ref_kill(&cgrp->self.refcnt) instead of cgroup_put(cgrp). * Negative refcnt sanity check in cgroup_get() is no longer necessary as percpu_ref already handles it.
* Similarly, as a cgroup which hasn't been killed will never be released regardless of its refcnt value and percpu_ref has sanity check on kill, cgroup_is_dead() sanity check in cgroup_put() is no longer necessary. * As whether a refcnt reached zero or not can only be decided after the reference count is killed, cgroup_root->cgrp's refcnting can no longer be used to decide whether to kill the root or not. Let's make cgroup_kill_sb() explicitly initiate destruction if the root doesn't have any children. This makes sense anyway as an unmounted cgroup hierarchy without any children should be destroyed. While this is a bit messy, this will allow pushing more bookkeeping towards cgroup->self and thus handling cgroups and csses in a more uniform way. In the very long term, it should be possible to introduce a base subsystem and convert the self css to a proper one making things a whole lot simpler and unified. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 -- kernel/cgroup.c | 146 +++++++++++++++++++++++++++---------------------- 2 files changed, 80 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 286e39e4e9bf..76dadd77a120 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -160,8 +160,6 @@ struct cgroup { */ int populated_cnt; - atomic_t refcnt; - /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. @@ -218,10 +216,6 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* For css percpu_ref killing and RCU-protected deletion */ - struct rcu_head rcu_head; - struct work_struct destroy_work; - /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb5864e36f99..c01e8e8dfad0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -176,10 +176,12 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); +static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -1008,62 +1010,15 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static void cgroup_free_fn(struct work_struct *work) -{ - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - - atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); - - if (cgrp->parent) { - /* - * We get a ref to the parent, and put the ref when this - * cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - cgroup_put(cgrp->parent); - kernfs_put(cgrp->kn); - kfree(cgrp); - } else { - /* - * This is root cgroup's refcnt reaching zero, which - * indicates that the root should be released.
- */ - cgroup_destroy_root(cgrp->root); - } -} - -static void cgroup_free_rcu(struct rcu_head *head) -{ - struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - - INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); -} - static void cgroup_get(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); - atomic_inc(&cgrp->refcnt); + css_get(&cgrp->self); } static void cgroup_put(struct cgroup *cgrp) { - if (!atomic_dec_and_test(&cgrp->refcnt)) - return; - if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) - return; - - /* delete this cgroup from parent->children */ - mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); - mutex_unlock(&cgroup_mutex); - - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); + css_put(&cgrp->self); } /** @@ -1548,7 +1503,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->cset_links); @@ -1597,6 +1551,10 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + if (ret) + goto out; + /* * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding @@ -1605,11 +1563,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out; + goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) - goto out; + goto cancel_ref; root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, @@ -1657,6 +1615,8 @@ destroy_root: root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); +cancel_ref: + percpu_ref_cancel_init(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1735,13 +1695,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } /* - * A root's lifetime is governed by its root cgroup. Zero - * ref indicate that the root is being destroyed. Wait for - * destruction to complete so that the subsystems are free. - * We can use wait_queue for the wait but this path is - * super cold. Let's just sleep for a bit and retry. + * A root's lifetime is governed by its root cgroup. + * tryget_live failure indicate that the root is being + * destroyed. Wait for destruction to complete so that the + * subsystems are free. We can use wait_queue for the wait + * but this path is super cold. Let's just sleep for a bit + * and retry. */ - if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); msleep(10); ret = restart_syscall(); @@ -1794,7 +1755,16 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->cgrp); + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. 
+ */ + if (cgroup_has_live_children(&root->cgrp)) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); + kernfs_kill_sb(sb); } @@ -4110,11 +4080,37 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - if (css->parent) - css_put(css->parent); + if (css->ss) { + /* css free path */ + if (css->parent) + css_put(css->parent); - css->ss->css_free(css); - cgroup_put(cgrp); + css->ss->css_free(css); + cgroup_put(cgrp); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. + */ + cgroup_destroy_root(cgrp->root); + } + } } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4131,8 +4127,20 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; - cgroup_idr_remove(&ss->css_idr, css->id); + if (ss) { + /* css release path */ + cgroup_idr_remove(&ss->css_idr, css->id); + } else { + /* cgroup release path */ + mutex_lock(&cgroup_mutex); + list_del_rcu(&cgrp->sibling); + mutex_unlock(&cgroup_mutex); + + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + } call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4285,6 +4293,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } + ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + if (ret) + goto out_free_cgrp; + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4292,7 +4304,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_free_cgrp; + goto out_cancel_ref; } init_cgroup_housekeeping(cgrp); @@ -4365,6 +4377,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_cancel_ref: + percpu_ref_cancel_init(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: @@ -4521,7 +4535,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) check_for_release(cgrp->parent); /* put the base reference */ - cgroup_put(cgrp); + percpu_ref_kill(&cgrp->self.refcnt); return 0; }; -- cgit v1.2.3-71-gd317 From 7413af1fb70e7efa6dbc7f27663e7a5126b3aa33 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 21:34:14 -0400 Subject: ftrace: Make get_ftrace_addr() and get_ftrace_addr_old() global Move and rename get_ftrace_addr() and get_ftrace_addr_old() to ftrace_get_addr_new() and ftrace_get_addr_curr() respectively. This moves these two helper functions out of the arch-specific code and into the generic code, and renames them to have better generic names. This will allow other archs to use them, as well as making it a bit easier to work on getting separate trampolines for different functions.
ftrace_get_addr_new() returns the trampoline address that the mcount call address will be converted to. ftrace_get_addr_curr() returns the trampoline address of what the mcount call address currently jumps to. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 36 +++++------------------------------- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4b3c195d4133..5ef43ce8492f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -349,38 +349,12 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } -/* - * If the record has the FTRACE_FL_REGS set, that means that it - * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS - * is not not set, then it wants to convert to the normal callback. - */ -static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_REGS) - return (unsigned long)FTRACE_REGS_ADDR; - else - return (unsigned long)FTRACE_ADDR; -} - -/* - * The FTRACE_FL_REGS_EN is set when the record already points to - * a function that saves all the regs. Basically the '_EN' version - * represents the current state of the function. - */ -static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_REGS_EN) - return (unsigned long)FTRACE_REGS_ADDR; - else - return (unsigned long)FTRACE_ADDR; -} - static int add_breakpoints(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; int ret; - ftrace_addr = get_ftrace_old_addr(rec); + ftrace_addr = ftrace_get_addr_curr(rec); ret = ftrace_test_record(rec, enable); @@ -438,14 +412,14 @@ static int remove_breakpoint(struct dyn_ftrace *rec) * If not, don't touch the breakpoint, we make just create * a disaster. 
*/ - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) goto update; /* Check both ftrace_addr and ftrace_old_addr */ - ftrace_addr = get_ftrace_old_addr(rec); + ftrace_addr = ftrace_get_addr_curr(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) @@ -489,7 +463,7 @@ static int add_update(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -536,7 +510,7 @@ static int finish_update(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0ff2c2453e7..2f8cbffecd3d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -400,6 +400,8 @@ int ftrace_update_record(struct dyn_ftrace *rec, int enable); int ftrace_test_record(struct dyn_ftrace *rec, int enable); void ftrace_run_stop_machine(int command); unsigned long ftrace_location(unsigned long ip); +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec); +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec); extern ftrace_func_t ftrace_trace_function; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98fa931b6864..e825fded435d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1755,6 +1755,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { -- cgit v1.2.3-71-gd317 From f1b2f2bd5821c6ab7feed2e133343dd54b212ed9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 7 May 2014 16:09:49 -0400 Subject: ftrace: Remove FTRACE_UPDATE_MODIFY_CALL_REGS flag As the decision to what needs to be done (converting a call to the ftrace_caller to ftrace_caller_regs or to convert from ftrace_caller_regs to ftrace_caller) can easily be determined from the rec->flags of FTRACE_FL_REGS and FTRACE_FL_REGS_EN, there's no need to have the ftrace_check_record() return either a UPDATE_MODIFY_CALL_REGS or a UPDATE_MODIFY_CALL. 
Just the latter is enough. This added flag causes more complexity than is required. Remove it. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 3 --- include/linux/ftrace.h | 2 -- kernel/trace/ftrace.c | 13 ++++--------- 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5ef43ce8492f..89de3eaf8772 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -366,7 +366,6 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) /* converting nop to call */ return add_brk_on_nop(rec); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_NOP: /* converting a call to a nop */ @@ -469,7 +468,6 @@ static int add_update(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_IGNORE: return 0; - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ @@ -516,7 +514,6 @@ static int finish_update(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_IGNORE: return 0; - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2f8cbffecd3d..3e6dfb31f8e6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -362,14 +362,12 @@ enum { * IGNORE - The function is already what we want it to be * MAKE_CALL - Start tracing the function * MODIFY_CALL - Stop saving regs for the function - * MODIFY_CALL_REGS - Start saving regs for the function * MAKE_NOP - Stop tracing the function */ enum { FTRACE_UPDATE_IGNORE, FTRACE_UPDATE_MAKE_CALL, FTRACE_UPDATE_MODIFY_CALL, - FTRACE_UPDATE_MODIFY_CALL_REGS, FTRACE_UPDATE_MAKE_NOP, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 52c2b53b7953..cc07b7fc4372 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1701,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /* * If this record is being updated from a nop, then * return UPDATE_MAKE_CALL. - * Otherwise, if the EN flag is set, then return - * UPDATE_MODIFY_CALL_REGS to tell the caller to convert - * from the non-save regs, to a save regs function. * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert - * from the save regs, to a non-save regs function. + * from the save regs, to a non-save regs function or + * vice versa. */ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; - else if (rec->flags & FTRACE_FL_REGS_EN) - return FTRACE_UPDATE_MODIFY_CALL_REGS; - else - return FTRACE_UPDATE_MODIFY_CALL; + + return FTRACE_UPDATE_MODIFY_CALL; } if (update) { @@ -1815,7 +1811,6 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } -- cgit v1.2.3-71-gd317 From f9f36917903b57c571b1ddcfc6bc794ca4dd8232 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 14 May 2014 14:13:41 +0800 Subject: libahci_platform: add host_flags parameter in ahci_platform_init_host() Add a dynamic host_flags argument to make ahci_platform_init_host more flexible, then remove the AHCI_HFLAGS(...) argument from some drivers' ata_port_info, and pass that in as the new argument.
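For illustration, a converted driver probe then looks roughly like this (a hypothetical foo_ahci driver; the flag values are only examples borrowed from the conversions below, and resource enabling/teardown is elided):

	static const struct ata_port_info foo_ahci_port_info = {
		/* no AHCI_HFLAGS() entry in the template anymore */
		.flags		= AHCI_FLAG_COMMON | ATA_FLAG_NCQ,
		.pio_mask	= ATA_PIO4,
		.udma_mask	= ATA_UDMA6,
		.port_ops	= &ahci_platform_ops,
	};

	static int foo_ahci_probe(struct platform_device *pdev)
	{
		struct ahci_host_priv *hpriv;
		unsigned long hflags;
		int rc;

		hpriv = ahci_platform_get_resources(pdev);
		if (IS_ERR(hpriv))
			return PTR_ERR(hpriv);

		/* host flags now travel as a separate init-time argument */
		hflags = AHCI_HFLAG_NO_PMP | AHCI_HFLAG_YES_NCQ;

		rc = ahci_platform_init_host(pdev, hpriv, &foo_ahci_port_info,
					     hflags, 0, 0);
		return rc;
	}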
Cc: Hans de Goede Reviewed-by: Hans de Goede Signed-off-by: Kefeng Wang Signed-off-by: Tejun Heo --- drivers/ata/ahci_da850.c | 3 ++- drivers/ata/ahci_imx.c | 3 ++- drivers/ata/ahci_mvebu.c | 3 ++- drivers/ata/ahci_platform.c | 2 +- drivers/ata/ahci_st.c | 2 +- drivers/ata/ahci_sunxi.c | 9 ++++++--- drivers/ata/ahci_xgene.c | 7 +++++-- drivers/ata/libahci_platform.c | 5 ++++- include/linux/ahci_platform.h | 1 + 9 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci_da850.c b/drivers/ata/ahci_da850.c index 2c83613ce2db..2b77d53bccf8 100644 --- a/drivers/ata/ahci_da850.c +++ b/drivers/ata/ahci_da850.c @@ -85,7 +85,8 @@ static int ahci_da850_probe(struct platform_device *pdev) da850_sata_init(dev, pwrdn_reg, hpriv->mmio); - rc = ahci_platform_init_host(pdev, hpriv, &ahci_da850_port_info, 0, 0); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_da850_port_info, + 0, 0, 0); if (rc) goto disable_resources; diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index 497c7abe1c7d..e7e44a73e4fe 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -267,7 +267,8 @@ static int imx_ahci_probe(struct platform_device *pdev) reg_val = clk_get_rate(imxpriv->ahb_clk) / 1000; writel(reg_val, hpriv->mmio + HOST_TIMER1MS); - ret = ahci_platform_init_host(pdev, hpriv, &ahci_imx_port_info, 0, 0); + ret = ahci_platform_init_host(pdev, hpriv, &ahci_imx_port_info, + 0, 0, 0); if (ret) imx_sata_disable(hpriv); diff --git a/drivers/ata/ahci_mvebu.c b/drivers/ata/ahci_mvebu.c index 1df8630c6b65..fd3dfd733b84 100644 --- a/drivers/ata/ahci_mvebu.c +++ b/drivers/ata/ahci_mvebu.c @@ -88,7 +88,8 @@ static int ahci_mvebu_probe(struct platform_device *pdev) ahci_mvebu_mbus_config(hpriv, dram); ahci_mvebu_regret_option(hpriv); - rc = ahci_platform_init_host(pdev, hpriv, &ahci_mvebu_port_info, 0, 0); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_mvebu_port_info, + 0, 0, 0); if (rc) goto disable_resources; diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index ef67e79944f9..a476a1fd3f8f 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -55,7 +55,7 @@ static int ahci_probe(struct platform_device *pdev) goto disable_resources; } - rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info, 0, 0); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info, 0, 0, 0); if (rc) goto pdata_exit; diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c index 633222226c19..2595598df9ce 100644 --- a/drivers/ata/ahci_st.c +++ b/drivers/ata/ahci_st.c @@ -166,7 +166,7 @@ static int st_ahci_probe(struct platform_device *pdev) if (err) return err; - err = ahci_platform_init_host(pdev, hpriv, &st_ahci_port_info, 0, 0); + err = ahci_platform_init_host(pdev, hpriv, &st_ahci_port_info, 0, 0, 0); if (err) { ahci_platform_disable_resources(hpriv); return err; diff --git a/drivers/ata/ahci_sunxi.c b/drivers/ata/ahci_sunxi.c index 42d3f64e74b3..02002f125bd4 100644 --- a/drivers/ata/ahci_sunxi.c +++ b/drivers/ata/ahci_sunxi.c @@ -157,8 +157,6 @@ static void ahci_sunxi_start_engine(struct ata_port *ap) } static const struct ata_port_info ahci_sunxi_port_info = { - AHCI_HFLAGS(AHCI_HFLAG_32BIT_ONLY | AHCI_HFLAG_NO_MSI | - AHCI_HFLAG_NO_PMP | AHCI_HFLAG_YES_NCQ), .flags = AHCI_FLAG_COMMON | ATA_FLAG_NCQ, .pio_mask = ATA_PIO4, .udma_mask = ATA_UDMA6, @@ -169,6 +167,7 @@ static int ahci_sunxi_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ahci_host_priv *hpriv; + unsigned long hflags; int rc;
hpriv = ahci_platform_get_resources(pdev); @@ -185,7 +184,11 @@ static int ahci_sunxi_probe(struct platform_device *pdev) if (rc) goto disable_resources; - rc = ahci_platform_init_host(pdev, hpriv, &ahci_sunxi_port_info, 0, 0); + hflags = AHCI_HFLAG_32BIT_ONLY | AHCI_HFLAG_NO_MSI | + AHCI_HFLAG_NO_PMP | AHCI_HFLAG_YES_NCQ; + + rc = ahci_platform_init_host(pdev, hpriv, &ahci_sunxi_port_info, + hflags, 0, 0); if (rc) goto disable_resources; diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c index 77c89bf171f1..042a9bb45c86 100644 --- a/drivers/ata/ahci_xgene.c +++ b/drivers/ata/ahci_xgene.c @@ -303,7 +303,6 @@ static struct ata_port_operations xgene_ahci_ops = { }; static const struct ata_port_info xgene_ahci_port_info = { - AHCI_HFLAGS(AHCI_HFLAG_NO_PMP | AHCI_HFLAG_YES_NCQ), .flags = AHCI_FLAG_COMMON | ATA_FLAG_NCQ, .pio_mask = ATA_PIO4, .udma_mask = ATA_UDMA6, @@ -382,6 +381,7 @@ static int xgene_ahci_probe(struct platform_device *pdev) struct ahci_host_priv *hpriv; struct xgene_ahci_context *ctx; struct resource *res; + unsigned long hflags; int rc; hpriv = ahci_platform_get_resources(pdev); @@ -450,7 +450,10 @@ static int xgene_ahci_probe(struct platform_device *pdev) goto disable_resources; } - rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info, 0, 0); + hflags = AHCI_HFLAG_NO_PMP | AHCI_HFLAG_YES_NCQ; + + rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info, + hflags, 0, 0); if (rc) goto disable_resources; diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 7cb3a85719c0..3a5b4ed25a4f 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -283,6 +283,7 @@ EXPORT_SYMBOL_GPL(ahci_platform_get_resources); * @pdev: platform device pointer for the host * @hpriv: ahci-host private data for the host * @pi_template: template for the ata_port_info to use + * @host_flags: ahci host flags used in ahci_host_priv * @force_port_map: param passed to ahci_save_initial_config * @mask_port_map: param passed to ahci_save_initial_config * @@ -296,6 +297,7 @@ EXPORT_SYMBOL_GPL(ahci_platform_get_resources); int ahci_platform_init_host(struct platform_device *pdev, struct ahci_host_priv *hpriv, const struct ata_port_info *pi_template, + unsigned long host_flags, unsigned int force_port_map, unsigned int mask_port_map) { @@ -312,7 +314,8 @@ int ahci_platform_init_host(struct platform_device *pdev, } /* prepare host */ - hpriv->flags |= (unsigned long)pi.private_data; + pi.private_data = (void *)host_flags; + hpriv->flags |= host_flags; ahci_save_initial_config(dev, hpriv, force_port_map, mask_port_map); diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 1f16d502600c..6dfd51a04d77 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -44,6 +44,7 @@ struct ahci_host_priv *ahci_platform_get_resources( int ahci_platform_init_host(struct platform_device *pdev, struct ahci_host_priv *hpriv, const struct ata_port_info *pi_template, + unsigned long host_flags, unsigned int force_port_map, unsigned int mask_port_map); -- cgit v1.2.3-71-gd317 From 1a56f2aa4752293e5a9c0c3a2331620aa1fdb808 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 14 May 2014 13:43:37 +0900 Subject: workqueue: Remove deprecated flush[_delayed]_work_sync() flush[_delayed]_work_sync() were deprecated by 4382973 ("workqueue: deprecate flush[_delayed]_work_sync()") and have been deprecated for a long time. In addition, these are not used anymore. So, let's remove these functions. 
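For any remaining out-of-tree users the conversion is a mechanical rename, since both wrappers were already plain aliases of the non-_sync variants (sketch; the work items are hypothetical):

	/* before */
	flush_work_sync(&priv->tx_work);
	flush_delayed_work_sync(&priv->stats_work);

	/* after: behavior is identical, the _sync variants were aliases */
	flush_work(&priv->tx_work);
	flush_delayed_work(&priv->stats_work);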
Signed-off-by: Jingoo Han Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1b22c42e9c2d..aa92d0295e28 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -587,18 +587,6 @@ static inline bool keventd_up(void) return system_wq != NULL; } -/* used to be different but now identical to flush_work(), deprecated */ -static inline bool __deprecated flush_work_sync(struct work_struct *work) -{ - return flush_work(work); -} - -/* used to be different but now identical to flush_delayed_work(), deprecated */ -static inline bool __deprecated flush_delayed_work_sync(struct delayed_work *dwork) -{ - return flush_delayed_work(dwork); -} - #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { -- cgit v1.2.3-71-gd317 From cf416171e7e1d966111f53bdae82f51af05e7bf8 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 14 May 2014 13:58:06 +0900 Subject: workqueue: Remove deprecated system_nrt[_freezable]_wq system_nrt[_freezable]_wq were deprecated by 3b07e9c ("workqueue: deprecate system_nrt[_freezable]_wq") and have been deprecated for a long time. In addition, these are not used anymore. So, let's remove these functions. Signed-off-by: Jingoo Han Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index aa92d0295e28..d93d28b2ec73 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -364,20 +364,6 @@ extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; -static inline struct workqueue_struct * __deprecated __system_nrt_wq(void) -{ - return system_wq; -} - -static inline struct workqueue_struct * __deprecated __system_nrt_freezable_wq(void) -{ - return system_freezable_wq; -} - -/* equivlalent to system_wq and system_freezable_wq, deprecated */ -#define system_nrt_wq __system_nrt_wq() -#define system_nrt_freezable_wq __system_nrt_freezable_wq() - extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); -- cgit v1.2.3-71-gd317 From 2eacc23c422b4553030168f315cb49522fa1b1f6 Mon Sep 17 00:00:00 2001 From: Yuval Atias Date: Wed, 14 May 2014 12:15:10 +0300 Subject: net/mlx4_core: Enforce irq affinity changes immediately During heavy traffic, napi is constantly polling the completion queue and no interrupt is fired. Because of that, changes to irq affinity are ignored until traffic is stopped and resumed. By registering with the irq notifier mechanism, and forcing an interrupt when affinity is changed, irq affinity changes will be immediately enforced. Signed-off-by: Yuval Atias Signed-off-by: Amir Vadai Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/cq.c | 3 ++ drivers/net/ethernet/mellanox/mlx4/en_rx.c | 11 +++++- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 6 +++ drivers/net/ethernet/mellanox/mlx4/eq.c | 62 ++++++++++++++++++++++++++++++ include/linux/mlx4/device.h | 3 ++ 5 files changed, 83 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index 0487121e4a0f..8542030b89cf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -293,6 +293,9 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, atomic_set(&cq->refcount, 1); init_completion(&cq->free); + cq->irq = priv->eq_table.eq[cq->vector].irq; + cq->irq_affinity_change = false; + return 0; err_radix: diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index a1512450816d..e8c0d2b832b7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -895,10 +895,17 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) mlx4_en_cq_unlock_napi(cq); /* If we used up all the quota - we're probably not done yet... */ - if (done == budget) + if (done == budget) { INC_PERF_COUNTER(priv->pstats.napi_quota); - else { + if (unlikely(cq->mcq.irq_affinity_change)) { + cq->mcq.irq_affinity_change = false; + napi_complete(napi); + mlx4_en_arm_cq(priv, cq); + return 0; + } + } else { /* Done for now */ + cq->mcq.irq_affinity_change = false; napi_complete(napi); mlx4_en_arm_cq(priv, cq); } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 89585c6311c3..cb964056d710 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -474,9 +474,15 @@ int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget) /* If we used up all the quota - we're probably not done yet... 
*/ if (done < budget) { /* Done for now */ + cq->mcq.irq_affinity_change = false; napi_complete(napi); mlx4_en_arm_cq(priv, cq); return done; + } else if (unlikely(cq->mcq.irq_affinity_change)) { + cq->mcq.irq_affinity_change = false; + napi_complete(napi); + mlx4_en_arm_cq(priv, cq); + return 0; } return budget; } diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 6c088bc1845b..d954ec1eac17 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -53,6 +53,11 @@ enum { MLX4_EQ_ENTRY_SIZE = 0x20 }; +struct mlx4_irq_notify { + void *arg; + struct irq_affinity_notify notify; +}; + #define MLX4_EQ_STATUS_OK ( 0 << 28) #define MLX4_EQ_STATUS_WRITE_FAIL (10 << 28) #define MLX4_EQ_OWNER_SW ( 0 << 24) @@ -1083,6 +1088,57 @@ static void mlx4_unmap_clr_int(struct mlx4_dev *dev) iounmap(priv->clr_base); } +static void mlx4_irq_notifier_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct mlx4_irq_notify *n = container_of(notify, + struct mlx4_irq_notify, + notify); + struct mlx4_priv *priv = (struct mlx4_priv *)n->arg; + struct radix_tree_iter iter; + void **slot; + + radix_tree_for_each_slot(slot, &priv->cq_table.tree, &iter, 0) { + struct mlx4_cq *cq = (struct mlx4_cq *)(*slot); + + if (cq->irq == notify->irq) + cq->irq_affinity_change = true; + } +} + +static void mlx4_release_irq_notifier(struct kref *ref) +{ + struct mlx4_irq_notify *n = container_of(ref, struct mlx4_irq_notify, + notify.kref); + kfree(n); +} + +static void mlx4_assign_irq_notifier(struct mlx4_priv *priv, + struct mlx4_dev *dev, int irq) +{ + struct mlx4_irq_notify *irq_notifier = NULL; + int err = 0; + + irq_notifier = kzalloc(sizeof(*irq_notifier), GFP_KERNEL); + if (!irq_notifier) { + mlx4_warn(dev, "Failed to allocate irq notifier. irq %d\n", + irq); + return; + } + + irq_notifier->notify.irq = irq; + irq_notifier->notify.notify = mlx4_irq_notifier_notify; + irq_notifier->notify.release = mlx4_release_irq_notifier; + irq_notifier->arg = priv; + err = irq_set_affinity_notifier(irq, &irq_notifier->notify); + if (err) { + kfree(irq_notifier); + irq_notifier = NULL; + mlx4_warn(dev, "Failed to set irq notifier. 
irq %d\n", irq); + } +} + + int mlx4_alloc_eq_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -1353,6 +1409,9 @@ int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, continue; /*we dont want to break here*/ } + mlx4_assign_irq_notifier(priv, dev, + priv->eq_table.eq[vec].irq); + eq_set_ci(&priv->eq_table.eq[vec], 1); } } @@ -1379,6 +1438,9 @@ void mlx4_release_eq(struct mlx4_dev *dev, int vec) Belonging to a legacy EQ*/ mutex_lock(&priv->msix_ctl.pool_lock); if (priv->msix_ctl.pool_bm & 1ULL << i) { + irq_set_affinity_notifier( + priv->eq_table.eq[vec].irq, + NULL); free_irq(priv->eq_table.eq[vec].irq, &priv->eq_table.eq[vec]); priv->msix_ctl.pool_bm &= ~(1ULL << i); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ba87bd21295a..c0468e6f0442 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -577,6 +577,9 @@ struct mlx4_cq { u32 cons_index; + u16 irq; + bool irq_affinity_change; + __be32 *set_ci_db; __be32 *arm_db; int arm_sn; -- cgit v1.2.3-71-gd317 From 542ad4f8886b376dac9a4334bdb38f9c22a4d8da Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 25 Apr 2014 15:08:29 -0700 Subject: Input: gpio_keys - convert struct descriptions to kernel-doc This patch converts descriptions of the structures defined in linux/gpio_keys.h to follow kernel-doc format. There is no functional change. Signed-off-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Dmitry Torokhov --- include/linux/gpio_keys.h | 48 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h index a7e977ff4abf..8b622468952c 100644 --- a/include/linux/gpio_keys.h +++ b/include/linux/gpio_keys.h @@ -3,29 +3,53 @@ struct device; +/** + * struct gpio_keys_button - configuration parameters + * @code: input event code (KEY_*, SW_*) + * @gpio: %-1 if this key does not support gpio + * @active_low: %true indicates that button is considered + * depressed when gpio is low + * @desc: label that will be attached to button's gpio + * @type: input event type (%EV_KEY, %EV_SW, %EV_ABS) + * @wakeup: configure the button as a wake-up source + * @debounce_interval: debounce ticks interval in msecs + * @can_disable: %true indicates that userspace is allowed to + * disable button via sysfs + * @value: axis value for %EV_ABS + * @irq: Irq number in case of interrupt keys + */ struct gpio_keys_button { - /* Configuration parameters */ - unsigned int code; /* input event code (KEY_*, SW_*) */ - int gpio; /* -1 if this key does not support gpio */ + unsigned int code; + int gpio; int active_low; const char *desc; - unsigned int type; /* input event type (EV_KEY, EV_SW, EV_ABS) */ - int wakeup; /* configure the button as a wake-up source */ - int debounce_interval; /* debounce ticks interval in msecs */ + unsigned int type; + int wakeup; + int debounce_interval; bool can_disable; - int value; /* axis value for EV_ABS */ - unsigned int irq; /* Irq number in case of interrupt keys */ + int value; + unsigned int irq; }; +/** + * struct gpio_keys_platform_data - platform data for gpio_keys driver + * @buttons: pointer to array of &gpio_keys_button structures + * describing buttons attached to the device + * @nbuttons: number of elements in @buttons array + * @poll_interval: polling interval in msecs - for polling driver only + * @rep: enable input subsystem auto repeat + * @enable: platform hook for enabling the 
device + * @disable: platform hook for disabling the device + * @name: input device name + */ struct gpio_keys_platform_data { struct gpio_keys_button *buttons; int nbuttons; - unsigned int poll_interval; /* polling interval in msecs - - for polling driver only */ - unsigned int rep:1; /* enable input subsystem auto repeat */ + unsigned int poll_interval; + unsigned int rep:1; int (*enable)(struct device *dev); void (*disable)(struct device *dev); - const char *name; /* input device name */ + const char *name; }; #endif -- cgit v1.2.3-71-gd317 From bf1de9761c21f56d5b0c6a0acd3b792d801c61e6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 28 Apr 2014 10:49:51 -0700 Subject: Input: implement managed polled input devices Managed resources are becoming more and more popular in drivers. Let's implement managed polled input devices, to complement managed regular input devices. Similarly to managed regular input devices only one new call devm_input_allocate_polled_device() is added and the rest of APIs is modified to work with both managed and non-managed devices. Reviewed-by: David Herrmann Tested-by: Alexander Shiyan Signed-off-by: Dmitry Torokhov --- drivers/input/input-polldev.c | 118 +++++++++++++++++++++++++++++++++++++++++- include/linux/input-polldev.h | 3 ++ 2 files changed, 119 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input-polldev.c b/drivers/input/input-polldev.c index 4b191908d5de..3664f81655ca 100644 --- a/drivers/input/input-polldev.c +++ b/drivers/input/input-polldev.c @@ -176,6 +176,91 @@ struct input_polled_dev *input_allocate_polled_device(void) } EXPORT_SYMBOL(input_allocate_polled_device); +struct input_polled_devres { + struct input_polled_dev *polldev; +}; + +static int devm_input_polldev_match(struct device *dev, void *res, void *data) +{ + struct input_polled_devres *devres = res; + + return devres->polldev == data; +} + +static void devm_input_polldev_release(struct device *dev, void *res) +{ + struct input_polled_devres *devres = res; + struct input_polled_dev *polldev = devres->polldev; + + dev_dbg(dev, "%s: dropping reference/freeing %s\n", + __func__, dev_name(&polldev->input->dev)); + + input_put_device(polldev->input); + kfree(polldev); +} + +static void devm_input_polldev_unregister(struct device *dev, void *res) +{ + struct input_polled_devres *devres = res; + struct input_polled_dev *polldev = devres->polldev; + + dev_dbg(dev, "%s: unregistering device %s\n", + __func__, dev_name(&polldev->input->dev)); + input_unregister_device(polldev->input); + + /* + * Note that we are still holding extra reference to the input + * device so it will stick around until devm_input_polldev_release() + * is called. + */ +} + +/** + * devm_input_allocate_polled_device - allocate managed polled device + * @dev: device owning the polled device being created + * + * Returns prepared &struct input_polled_dev or %NULL. + * + * Managed polled input devices do not need to be explicitly unregistered + * or freed as it will be done automatically when owner device unbinds + * from * its driver (or binding fails). Once such managed polled device + * is allocated, it is ready to be set up and registered in the same + * fashion as regular polled input devices (using + * input_register_polled_device() function). + * + * If you want to manually unregister and free such managed polled devices, + * it can be still done by calling input_unregister_polled_device() and + * input_free_polled_device(), although it is rarely needed. 
+ * + * NOTE: the owner device is set up as parent of input device and users + * should not override it. + */ +struct input_polled_dev *devm_input_allocate_polled_device(struct device *dev) +{ + struct input_polled_dev *polldev; + struct input_polled_devres *devres; + + devres = devres_alloc(devm_input_polldev_release, sizeof(*devres), + GFP_KERNEL); + if (!devres) + return NULL; + + polldev = input_allocate_polled_device(); + if (!polldev) { + devres_free(devres); + return NULL; + } + + polldev->input->dev.parent = dev; + polldev->devres_managed = true; + + devres->polldev = polldev; + devres_add(dev, devres); + + return polldev; +} +EXPORT_SYMBOL(devm_input_allocate_polled_device); + /** * input_free_polled_device - free memory allocated for polled device * @dev: device to free @@ -186,7 +271,12 @@ EXPORT_SYMBOL(input_allocate_polled_device); void input_free_polled_device(struct input_polled_dev *dev) { if (dev) { - input_free_device(dev->input); + if (dev->devres_managed) + WARN_ON(devres_destroy(dev->input->dev.parent, + devm_input_polldev_release, + devm_input_polldev_match, + dev)); + input_put_device(dev->input); kfree(dev); } } @@ -204,9 +294,19 @@ EXPORT_SYMBOL(input_free_polled_device); */ int input_register_polled_device(struct input_polled_dev *dev) { + struct input_polled_devres *devres = NULL; struct input_dev *input = dev->input; int error; + if (dev->devres_managed) { + devres = devres_alloc(devm_input_polldev_unregister, + sizeof(*devres), GFP_KERNEL); + if (!devres) + return -ENOMEM; + + devres->polldev = dev; + } + input_set_drvdata(input, dev); INIT_DELAYED_WORK(&dev->work, input_polled_device_work); @@ -221,8 +321,10 @@ int input_register_polled_device(struct input_polled_dev *dev) input->dev.groups = input_polldev_attribute_groups; error = input_register_device(input); - if (error) + if (error) { + devres_free(devres); return error; + } /* * Take extra reference to the underlying input device so @@ -233,6 +335,12 @@ int input_register_polled_device(struct input_polled_dev *dev) */ input_get_device(input); + if (dev->devres_managed) { + dev_dbg(input->dev.parent, "%s: registering %s with devres.\n", + __func__, dev_name(&input->dev)); + devres_add(input->dev.parent, devres); + } + return 0; } EXPORT_SYMBOL(input_register_polled_device); @@ -247,6 +355,12 @@ EXPORT_SYMBOL(input_register_polled_device); */ void input_unregister_polled_device(struct input_polled_dev *dev) { + if (dev->devres_managed) + WARN_ON(devres_destroy(dev->input->dev.parent, + devm_input_polldev_unregister, + devm_input_polldev_match, + dev)); + input_unregister_device(dev->input); } EXPORT_SYMBOL(input_unregister_polled_device); diff --git a/include/linux/input-polldev.h b/include/linux/input-polldev.h index ce0b72464eb8..2465182670db 100644 --- a/include/linux/input-polldev.h +++ b/include/linux/input-polldev.h @@ -48,9 +48,12 @@ struct input_polled_dev { /* private: */ struct delayed_work work; + + bool devres_managed; }; struct input_polled_dev *input_allocate_polled_device(void); +struct input_polled_dev *devm_input_allocate_polled_device(struct device *dev); void input_free_polled_device(struct input_polled_dev *dev); int input_register_polled_device(struct input_polled_dev *dev); void input_unregister_polled_device(struct input_polled_dev *dev); -- cgit v1.2.3-71-gd317 From 4449bf927b61bdb4389393c6fea6837214d1ace7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 13:10:24 -0400 Subject: tracing: Add __bitmask() macro to trace events to cpumasks and other 
bitmasks Being able to show a cpumask of events can be useful as some events may affect only some CPUs. There is no standard way to record the cpumask and converting it to a string is rather expensive during the trace as traces happen in hotpaths. It would be better to record the raw event mask and be able to parse it at print time. The following macros were added for use with the TRACE_EVENT() macro: __bitmask() __assign_bitmask() __get_bitmask() To test this, I added this to the sched_migrate_task event, which looked like this: TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu, const struct cpumask *cpus), TP_ARGS(p, dest_cpu, cpus), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) __bitmask( cpumask, num_possible_cpus() ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; __assign_bitmask(cpumask, cpumask_bits(cpus), num_possible_cpus()); ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d cpumask=%s", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu, __get_bitmask(cpumask)) ); With the output of: ksmtuned-3613 [003] d..2 485.220508: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=3 dest_cpu=2 cpumask=00000000,0000000f migration/1-13 [001] d..5 485.221202: sched_migrate_task: comm=ksmtuned pid=3614 prio=120 orig_cpu=1 dest_cpu=0 cpumask=00000000,0000000f awk-3615 [002] d.H5 485.221747: sched_migrate_task: comm=rcu_preempt pid=7 prio=120 orig_cpu=0 dest_cpu=1 cpumask=00000000,000000ff migration/2-18 [002] d..5 485.222062: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=2 dest_cpu=3 cpumask=00000000,0000000f Link: http://lkml.kernel.org/r/1399377998-14870-6-git-send-email-javi.merino@arm.com Link: http://lkml.kernel.org/r/20140506132238.22e136d1@gandalf.local.home Suggested-by: Javi Merino Tested-by: Javi Merino Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 +++ include/linux/trace_seq.h | 10 ++++++++ include/trace/ftrace.h | 57 +++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace_output.c | 41 +++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d16da3e53bc7..cff3106ffe2c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -38,6 +38,9 @@ const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, *symbol_array); #endif +const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size); + const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index a32d86ec8bf2..136116924d8d 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -46,6 +46,9 @@ extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, extern void *trace_seq_reserve(struct trace_seq *s, size_t len); extern int trace_seq_path(struct trace_seq *s, const struct path *path); +extern int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits); + #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
{ @@ -57,6 +60,13 @@ trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) return 0; } +static inline int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + return 0; +} + static inline int trace_print_seq(struct seq_file *m, struct trace_seq *s) { return 0; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 0a1a4f7caf09..9b7a989dcbcc 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -53,6 +53,9 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -128,6 +131,9 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ struct ftrace_data_offsets_##call { \ @@ -200,6 +206,15 @@ #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) +#undef __get_bitmask +#define __get_bitmask(field) \ + ({ \ + void *__bitmask = __get_dynamic_array(field); \ + unsigned int __bitmask_size; \ + __bitmask_size = (__entry->__data_loc_##field >> 16) & 0xffff; \ + ftrace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ + }) + #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ ({ \ @@ -322,6 +337,9 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ static int notrace __init \ @@ -372,6 +390,29 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ #define __string(item, src) __dynamic_array(char, item, \ strlen((src) ? (const char *)(src) : "(null)") + 1) +/* + * __bitmask_size_in_bytes_raw is the number of bytes needed to hold + * num_possible_cpus(). + */ +#define __bitmask_size_in_bytes_raw(nr_bits) \ + (((nr_bits) + 7) / 8) + +#define __bitmask_size_in_longs(nr_bits) \ + ((__bitmask_size_in_bytes_raw(nr_bits) + \ + ((BITS_PER_LONG / 8) - 1)) / (BITS_PER_LONG / 8)) + +/* + * __bitmask_size_in_bytes is the number of bytes needed to hold + * num_possible_cpus() padded out to the nearest long. This is what + * is saved in the buffer, just to be consistent. + */ +#define __bitmask_size_in_bytes(nr_bits) \ + (__bitmask_size_in_longs(nr_bits) * (BITS_PER_LONG / 8)) + +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, \ + __bitmask_size_in_longs(nr_bits)) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static inline notrace int ftrace_get_offsets_##call( \ @@ -513,12 +554,22 @@ static inline notrace int ftrace_get_offsets_##call( \ __entry->__data_loc_##item = __data_offsets.item; #undef __string -#define __string(item, src) __dynamic_array(char, item, -1) \ +#define __string(item, src) __dynamic_array(char, item, -1) #undef __assign_str #define __assign_str(dst, src) \ strcpy(__get_str(dst), (src) ? 
(const char *)(src) : "(null)"); +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + +#undef __assign_bitmask +#define __assign_bitmask(dst, src, nr_bits) \ + memcpy(__get_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) + #undef TP_fast_assign #define TP_fast_assign(args...) args @@ -586,6 +637,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_hex #undef __get_dynamic_array #undef __get_str +#undef __get_bitmask #undef TP_printk #define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args) @@ -651,6 +703,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + #undef __perf_addr #define __perf_addr(a) (__addr = (a)) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -125,6 +125,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. + */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + /** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor @@ -398,6 +426,19 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); #endif +const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = p->buffer + p->len; + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { -- cgit v1.2.3-71-gd317 From 622582786c9e041d0bd52bde201787adeab249f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 13 May 2014 19:50:46 -0700 Subject: net: filter: x86: internal BPF JIT Maps all internal BPF instructions into x86_64 instructions. This patch replaces original BPF x64 JIT with internal BPF x64 JIT. sysctl net.core.bpf_jit_enable is reused as on/off switch. Performance: 1. old BPF JIT and internal BPF JIT generate equivalent x86_64 code. 
No performance difference is observed for filters that were JIT-able before Example assembler code for BPF filter "tcpdump port 22" original BPF -> old JIT: original BPF -> internal BPF -> new JIT: 0: push %rbp 0: push %rbp 1: mov %rsp,%rbp 1: mov %rsp,%rbp 4: sub $0x60,%rsp 4: sub $0x228,%rsp 8: mov %rbx,-0x8(%rbp) b: mov %rbx,-0x228(%rbp) // prologue 12: mov %r13,-0x220(%rbp) 19: mov %r14,-0x218(%rbp) 20: mov %r15,-0x210(%rbp) 27: xor %eax,%eax // clear A c: xor %ebx,%ebx 29: xor %r13,%r13 // clear X e: mov 0x68(%rdi),%r9d 2c: mov 0x68(%rdi),%r9d 12: sub 0x6c(%rdi),%r9d 30: sub 0x6c(%rdi),%r9d 16: mov 0xd8(%rdi),%r8 34: mov 0xd8(%rdi),%r10 3b: mov %rdi,%rbx 1d: mov $0xc,%esi 3e: mov $0xc,%esi 22: callq 0xffffffffe1021e15 43: callq 0xffffffffe102bd75 27: cmp $0x86dd,%eax 48: cmp $0x86dd,%rax 2c: jne 0x0000000000000069 4f: jne 0x000000000000009a 2e: mov $0x14,%esi 51: mov $0x14,%esi 33: callq 0xffffffffe1021e31 56: callq 0xffffffffe102bd91 38: cmp $0x84,%eax 5b: cmp $0x84,%rax 3d: je 0x0000000000000049 62: je 0x0000000000000074 3f: cmp $0x6,%eax 64: cmp $0x6,%rax 42: je 0x0000000000000049 68: je 0x0000000000000074 44: cmp $0x11,%eax 6a: cmp $0x11,%rax 47: jne 0x00000000000000c6 6e: jne 0x0000000000000117 49: mov $0x36,%esi 74: mov $0x36,%esi 4e: callq 0xffffffffe1021e15 79: callq 0xffffffffe102bd75 53: cmp $0x16,%eax 7e: cmp $0x16,%rax 56: je 0x00000000000000bf 82: je 0x0000000000000110 58: mov $0x38,%esi 88: mov $0x38,%esi 5d: callq 0xffffffffe1021e15 8d: callq 0xffffffffe102bd75 62: cmp $0x16,%eax 92: cmp $0x16,%rax 65: je 0x00000000000000bf 96: je 0x0000000000000110 67: jmp 0x00000000000000c6 98: jmp 0x0000000000000117 69: cmp $0x800,%eax 9a: cmp $0x800,%rax 6e: jne 0x00000000000000c6 a1: jne 0x0000000000000117 70: mov $0x17,%esi a3: mov $0x17,%esi 75: callq 0xffffffffe1021e31 a8: callq 0xffffffffe102bd91 7a: cmp $0x84,%eax ad: cmp $0x84,%rax 7f: je 0x000000000000008b b4: je 0x00000000000000c2 81: cmp $0x6,%eax b6: cmp $0x6,%rax 84: je 0x000000000000008b ba: je 0x00000000000000c2 86: cmp $0x11,%eax bc: cmp $0x11,%rax 89: jne 0x00000000000000c6 c0: jne 0x0000000000000117 8b: mov $0x14,%esi c2: mov $0x14,%esi 90: callq 0xffffffffe1021e15 c7: callq 0xffffffffe102bd75 95: test $0x1fff,%ax cc: test $0x1fff,%rax 99: jne 0x00000000000000c6 d3: jne 0x0000000000000117 d5: mov %rax,%r14 9b: mov $0xe,%esi d8: mov $0xe,%esi a0: callq 0xffffffffe1021e44 dd: callq 0xffffffffe102bd91 // MSH e2: and $0xf,%eax e5: shl $0x2,%eax e8: mov %rax,%r13 eb: mov %r14,%rax ee: mov %r13,%rsi a5: lea 0xe(%rbx),%esi f1: add $0xe,%esi a8: callq 0xffffffffe1021e0d f4: callq 0xffffffffe102bd6d ad: cmp $0x16,%eax f9: cmp $0x16,%rax b0: je 0x00000000000000bf fd: je 0x0000000000000110 ff: mov %r13,%rsi b2: lea 0x10(%rbx),%esi 102: add $0x10,%esi b5: callq 0xffffffffe1021e0d 105: callq 0xffffffffe102bd6d ba: cmp $0x16,%eax 10a: cmp $0x16,%rax bd: jne 0x00000000000000c6 10e: jne 0x0000000000000117 bf: mov $0xffff,%eax 110: mov $0xffff,%eax c4: jmp 0x00000000000000c8 115: jmp 0x000000000000011c c6: xor %eax,%eax 117: mov $0x0,%eax c8: mov -0x8(%rbp),%rbx 11c: mov -0x228(%rbp),%rbx // epilogue cc: leaveq 123: mov -0x220(%rbp),%r13 cd: retq 12a: mov -0x218(%rbp),%r14 131: mov -0x210(%rbp),%r15 138: leaveq 139: retq On fully cached SKBs both JITed functions take 12 nsec to execute. BPF interpreter executes the program in 30 nsec. The difference in generated assembler is due to the following: Old BPF imlements LDX_MSH instruction via sk_load_byte_msh() helper function inside bpf_jit.S. 
New JIT removes the helper and does it explicitly, so ldx_msh cost is the same for both JITs, but generated code looks longer. New JIT has 4 registers to save, so prologue/epilogue are larger, but the cost is within noise on x64. Old JIT checks whether first insn clears A and if not emits 'xor %eax,%eax'. New JIT clears %rax unconditionally. 2. old BPF JIT doesn't support ANC_NLATTR, ANC_PAY_OFFSET, ANC_RANDOM extensions. New JIT supports all BPF extensions. Performance of such filters improves 2-4 times depending on a filter. The longer the filter the higher performance gain. Synthetic benchmarks with many ancillary loads see 20x speedup which seems to be the maximum gain from JIT Notes: . net.core.bpf_jit_enable=2 + tools/net/bpf_jit_disasm is still functional and can be used to see generated assembler . there are two jit_compile() functions and code flow for classic filters is: sk_attach_filter() - load classic BPF bpf_jit_compile() - try to JIT from classic BPF sk_convert_filter() - convert classic to internal bpf_int_jit_compile() - JIT from internal BPF seccomp and tracing filters will just call bpf_int_jit_compile() Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit.S | 77 +-- arch/x86/net/bpf_jit_comp.c | 1314 +++++++++++++++++++++++-------------------- include/linux/filter.h | 3 + net/core/filter.c | 9 +- 4 files changed, 748 insertions(+), 655 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 01495755701b..6440221ced0d 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -12,13 +12,16 @@ /* * Calling convention : - * rdi : skb pointer + * rbx : skb pointer (callee saved) * esi : offset of byte(s) to fetch in skb (can be scratched) - * r8 : copy of skb->data + * r10 : copy of skb->data * r9d : hlen = skb->len - skb->data_len */ -#define SKBDATA %r8 +#define SKBDATA %r10 #define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */ +#define MAX_BPF_STACK (512 /* from filter.h */ + \ + 32 /* space for rbx,r13,r14,r15 */ + \ + 8 /* space for skb_copy_bits */) sk_load_word: .globl sk_load_word @@ -68,53 +71,31 @@ sk_load_byte_positive_offset: movzbl (SKBDATA,%rsi),%eax ret -/** - * sk_load_byte_msh - BPF_S_LDX_B_MSH helper - * - * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf) - * Must preserve A accumulator (%eax) - * Inputs : %esi is the offset value - */ -sk_load_byte_msh: - .globl sk_load_byte_msh - test %esi,%esi - js bpf_slow_path_byte_msh_neg - -sk_load_byte_msh_positive_offset: - .globl sk_load_byte_msh_positive_offset - cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */ - jle bpf_slow_path_byte_msh - movzbl (SKBDATA,%rsi),%ebx - and $15,%bl - shl $2,%bl - ret - /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ - push %rdi; /* save skb */ \ + mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ /* rsi already has offset */ \ mov $LEN,%ecx; /* len */ \ - lea -12(%rbp),%rdx; \ + lea - MAX_BPF_STACK + 32(%rbp),%rdx; \ call skb_copy_bits; \ test %eax,%eax; \ pop SKBDATA; \ - pop %r9; \ - pop %rdi + pop %r9; bpf_slow_path_word: bpf_slow_path_common(4) js bpf_error - mov -12(%rbp),%eax + mov - MAX_BPF_STACK + 32(%rbp),%eax bswap %eax ret bpf_slow_path_half: bpf_slow_path_common(2) js bpf_error - mov -12(%rbp),%ax + mov - MAX_BPF_STACK + 32(%rbp),%ax rol $8,%ax movzwl %ax,%eax ret @@ -122,21 +103,11 @@ bpf_slow_path_half: bpf_slow_path_byte: bpf_slow_path_common(1) js bpf_error - movzbl -12(%rbp),%eax - 
ret - -bpf_slow_path_byte_msh: - xchg %eax,%ebx /* dont lose A , X is about to be scratched */ - bpf_slow_path_common(1) - js bpf_error - movzbl -12(%rbp),%eax - and $15,%al - shl $2,%al - xchg %eax,%ebx + movzbl - MAX_BPF_STACK + 32(%rbp),%eax ret #define sk_negative_common(SIZE) \ - push %rdi; /* save skb */ \ + mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ /* rsi already has offset */ \ @@ -145,10 +116,8 @@ bpf_slow_path_byte_msh: test %rax,%rax; \ pop SKBDATA; \ pop %r9; \ - pop %rdi; \ jz bpf_error - bpf_slow_path_word_neg: cmp SKF_MAX_NEG_OFF, %esi /* test range */ jl bpf_error /* offset lower -> error */ @@ -179,22 +148,12 @@ sk_load_byte_negative_offset: movzbl (%rax), %eax ret -bpf_slow_path_byte_msh_neg: - cmp SKF_MAX_NEG_OFF, %esi - jl bpf_error -sk_load_byte_msh_negative_offset: - .globl sk_load_byte_msh_negative_offset - xchg %eax,%ebx /* dont lose A , X is about to be scratched */ - sk_negative_common(1) - movzbl (%rax),%eax - and $15,%al - shl $2,%al - xchg %eax,%ebx - ret - bpf_error: # force a return 0 from jit handler - xor %eax,%eax - mov -8(%rbp),%rbx + xor %eax,%eax + mov - MAX_BPF_STACK(%rbp),%rbx + mov - MAX_BPF_STACK + 8(%rbp),%r13 + mov - MAX_BPF_STACK + 16(%rbp),%r14 + mov - MAX_BPF_STACK + 24(%rbp),%r15 leaveq ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index c5fa7c9cb665..92aef8fdac2f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1,6 +1,7 @@ /* bpf_jit_comp.c : BPF JIT compiler * * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) + * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,28 +15,16 @@ #include #include -/* - * Conventions : - * EAX : BPF A accumulator - * EBX : BPF X accumulator - * RDI : pointer to skb (first argument given to JIT function) - * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n) - * ECX,EDX,ESI : scratch registers - * r9d : skb->len - skb->data_len (headlen) - * r8 : skb->data - * -8(RBP) : saved RBX value - * -16(RBP)..-80(RBP) : BPF_MEMWORDS values - */ int bpf_jit_enable __read_mostly; /* * assembly code in arch/x86/net/bpf_jit.S */ -extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[]; +extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[]; -extern u8 sk_load_byte_positive_offset[], sk_load_byte_msh_positive_offset[]; +extern u8 sk_load_byte_positive_offset[]; extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[]; -extern u8 sk_load_byte_negative_offset[], sk_load_byte_msh_negative_offset[]; +extern u8 sk_load_byte_negative_offset[]; static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -56,30 +45,44 @@ static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) -#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0) - -#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */ -#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */ +#define EMIT1_off32(b1, off) \ + do {EMIT1(b1); EMIT(off, 4); } while (0) +#define EMIT2_off32(b1, b2, off) \ + do {EMIT2(b1, b2); EMIT(off, 4); } while (0) +#define EMIT3_off32(b1, b2, b3, off) \ + 
do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0) +#define EMIT4_off32(b1, b2, b3, b4, off) \ + do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) static inline bool is_imm8(int value) { return value <= 127 && value >= -128; } -static inline bool is_near(int offset) +static inline bool is_simm32(s64 value) { - return offset <= 127 && offset >= -128; + return value == (s64) (s32) value; } -#define EMIT_JMP(offset) \ -do { \ - if (offset) { \ - if (is_near(offset)) \ - EMIT2(0xeb, offset); /* jmp .+off8 */ \ - else \ - EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \ - } \ -} while (0) +/* mov A, X */ +#define EMIT_mov(A, X) \ + do {if (A != X) \ + EMIT3(add_2mod(0x48, A, X), 0x89, add_2reg(0xC0, A, X)); \ + } while (0) + +static int bpf_size_to_x86_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 4; /* imm32 */ + else + return 0; +} /* list of x86 cond jumps opcodes (. + s8) * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) @@ -90,27 +93,8 @@ do { \ #define X86_JNE 0x75 #define X86_JBE 0x76 #define X86_JA 0x77 - -#define EMIT_COND_JMP(op, offset) \ -do { \ - if (is_near(offset)) \ - EMIT2(op, offset); /* jxx .+off8 */ \ - else { \ - EMIT2(0x0f, op + 0x10); \ - EMIT(offset, 4); /* jxx .+off32 */ \ - } \ -} while (0) - -#define COND_SEL(CODE, TOP, FOP) \ - case CODE: \ - t_op = TOP; \ - f_op = FOP; \ - goto cond_branch - - -#define SEEN_DATAREF 1 /* might call external helpers */ -#define SEEN_XREG 2 /* ebx is used */ -#define SEEN_MEM 4 /* use mem[] for temporary storage */ +#define X86_JGE 0x7D +#define X86_JG 0x7F static inline void bpf_flush_icache(void *start, void *end) { @@ -125,26 +109,6 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -/* Helper to find the offset of pkt_type in sk_buff - * We want to make sure its still a 3bit field starting at a byte boundary. - */ -#define PKT_TYPE_MAX 7 -static int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - char *ct = (char *)&skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n"); - return -1; -} - struct bpf_binary_header { unsigned int pages; /* Note : for security reasons, bpf code will follow a randomly @@ -178,546 +142,715 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, return header; } +/* pick a register outside of BPF range for JIT internal work */ +#define AUX_REG (MAX_BPF_REG + 1) + +/* the following table maps BPF registers to x64 registers. 
+ * x64 register r12 is unused, since if used as base address register + * in load/store instructions, it always needs an extra byte of encoding + */ +static const int reg2hex[] = { + [BPF_REG_0] = 0, /* rax */ + [BPF_REG_1] = 7, /* rdi */ + [BPF_REG_2] = 6, /* rsi */ + [BPF_REG_3] = 2, /* rdx */ + [BPF_REG_4] = 1, /* rcx */ + [BPF_REG_5] = 0, /* r8 */ + [BPF_REG_6] = 3, /* rbx callee saved */ + [BPF_REG_7] = 5, /* r13 callee saved */ + [BPF_REG_8] = 6, /* r14 callee saved */ + [BPF_REG_9] = 7, /* r15 callee saved */ + [BPF_REG_FP] = 5, /* rbp readonly */ + [AUX_REG] = 3, /* r11 temp register */ +}; + +/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15 + * which need extra byte of encoding. + * rax,rcx,...,rbp have simpler encoding + */ +static inline bool is_ereg(u32 reg) +{ + if (reg == BPF_REG_5 || reg == AUX_REG || + (reg >= BPF_REG_7 && reg <= BPF_REG_9)) + return true; + else + return false; +} + +/* add modifiers if 'reg' maps to x64 registers r8..r15 */ +static inline u8 add_1mod(u8 byte, u32 reg) +{ + if (is_ereg(reg)) + byte |= 1; + return byte; +} + +static inline u8 add_2mod(u8 byte, u32 r1, u32 r2) +{ + if (is_ereg(r1)) + byte |= 1; + if (is_ereg(r2)) + byte |= 4; + return byte; +} + +/* encode dest register 'a_reg' into x64 opcode 'byte' */ +static inline u8 add_1reg(u8 byte, u32 a_reg) +{ + return byte + reg2hex[a_reg]; +} + +/* encode dest 'a_reg' and src 'x_reg' registers into x64 opcode 'byte' */ +static inline u8 add_2reg(u8 byte, u32 a_reg, u32 x_reg) +{ + return byte + reg2hex[a_reg] + (reg2hex[x_reg] << 3); +} + struct jit_context { unsigned int cleanup_addr; /* epilogue code offset */ - int pc_ret0; /* bpf index of first RET #0 instruction (if any) */ - u8 seen; + bool seen_ld_abs; }; static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { - const struct sock_filter *filter = bpf_prog->insns; - int flen = bpf_prog->len; + struct sock_filter_int *insn = bpf_prog->insnsi; + int insn_cnt = bpf_prog->len; u8 temp[64]; - u8 *prog; - int ilen, i, proglen; - int t_offset, f_offset; - u8 t_op, f_op, seen = 0; - u8 *func; - unsigned int cleanup_addr = ctx->cleanup_addr; - u8 seen_or_pass0 = ctx->seen; - - /* no prologue/epilogue for trivial filters (RET something) */ - proglen = 0; - prog = temp; + int i; + int proglen = 0; + u8 *prog = temp; + int stacksize = MAX_BPF_STACK + + 32 /* space for rbx, r13, r14, r15 */ + + 8 /* space for skb_copy_bits() buffer */; - if (seen_or_pass0) { - EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ - EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ - /* note : must save %rbx in case bpf_error is hit */ - if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF)) - EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ - if (seen_or_pass0 & SEEN_XREG) - CLEAR_X(); /* make sure we dont leek kernel memory */ - - /* - * If this filter needs to access skb data, - * loads r9 and r8 with : - * r9 = skb->len - skb->data_len - * r8 = skb->data - */ - if (seen_or_pass0 & SEEN_DATAREF) { - if (offsetof(struct sk_buff, len) <= 127) - /* mov off8(%rdi),%r9d */ - EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); - else { - /* mov off32(%rdi),%r9d */ - EMIT3(0x44, 0x8b, 0x8f); - EMIT(offsetof(struct sk_buff, len), 4); - } - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub off8(%rdi),%r9d */ - EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len)); - else { - EMIT3(0x44, 0x2b, 0x8f); - EMIT(offsetof(struct sk_buff, data_len), 4); - } + EMIT1(0x55); /* push rbp */ + 
EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov off8(%rdi),%r8 */ - EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data)); - else { - /* mov off32(%rdi),%r8 */ - EMIT3(0x4c, 0x8b, 0x87); - EMIT(offsetof(struct sk_buff, data), 4); - } + /* sub rsp, stacksize */ + EMIT3_off32(0x48, 0x81, 0xEC, stacksize); + + /* all classic BPF filters use R6(rbx) save it */ + + /* mov qword ptr [rbp-X],rbx */ + EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); + + /* sk_convert_filter() maps classic BPF register X to R7 and uses R8 + * as temporary, so all tcpdump filters need to spill/fill R7(r13) and + * R8(r14). R9(r15) spill could be made conditional, but there is only + * one 'bpf_error' return path out of helper functions inside bpf_jit.S + * The overhead of extra spill is negligible for any filter other + * than synthetic ones. Therefore not worth adding complexity. + */ + + /* mov qword ptr [rbp-X],r13 */ + EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8); + /* mov qword ptr [rbp-X],r14 */ + EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16); + /* mov qword ptr [rbp-X],r15 */ + EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24); + + /* clear A and X registers */ + EMIT2(0x31, 0xc0); /* xor eax, eax */ + EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + + if (ctx->seen_ld_abs) { + /* r9d : skb->len - skb->data_len (headlen) + * r10 : skb->data + */ + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov %r9d, off8(%rdi) */ + EMIT4(0x44, 0x8b, 0x4f, + offsetof(struct sk_buff, len)); + else + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, + offsetof(struct sk_buff, len)); + + if (is_imm8(offsetof(struct sk_buff, data_len))) + /* sub %r9d, off8(%rdi) */ + EMIT4(0x44, 0x2b, 0x4f, + offsetof(struct sk_buff, data_len)); + else + EMIT3_off32(0x44, 0x2b, 0x8f, + offsetof(struct sk_buff, data_len)); + + if (is_imm8(offsetof(struct sk_buff, data))) + /* mov %r10, off8(%rdi) */ + EMIT4(0x4c, 0x8b, 0x57, + offsetof(struct sk_buff, data)); + else + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, + offsetof(struct sk_buff, data)); + } + + for (i = 0; i < insn_cnt; i++, insn++) { + const s32 K = insn->imm; + u32 a_reg = insn->a_reg; + u32 x_reg = insn->x_reg; + u8 b1 = 0, b2 = 0, b3 = 0; + s64 jmp_offset; + u8 jmp_cond; + int ilen; + u8 *func; + + switch (insn->code) { + /* ALU */ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU64 | BPF_ADD | BPF_X: + case BPF_ALU64 | BPF_SUB | BPF_X: + case BPF_ALU64 | BPF_AND | BPF_X: + case BPF_ALU64 | BPF_OR | BPF_X: + case BPF_ALU64 | BPF_XOR | BPF_X: + switch (BPF_OP(insn->code)) { + case BPF_ADD: b2 = 0x01; break; + case BPF_SUB: b2 = 0x29; break; + case BPF_AND: b2 = 0x21; break; + case BPF_OR: b2 = 0x09; break; + case BPF_XOR: b2 = 0x31; break; } - } + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_2mod(0x48, a_reg, x_reg)); + else if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT1(add_2mod(0x40, a_reg, x_reg)); + EMIT2(b2, add_2reg(0xC0, a_reg, x_reg)); + break; - switch (filter[0].code) { - case BPF_S_RET_K: - case BPF_S_LD_W_LEN: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_CPU: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_ANC_QUEUE: - case BPF_S_ANC_PKTTYPE: - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: - /* first instruction sets A register (or is RET 
'constant') */ + /* mov A, X */ + case BPF_ALU64 | BPF_MOV | BPF_X: + EMIT_mov(a_reg, x_reg); break; - default: - /* make sure we dont leak kernel information to user */ - CLEAR_A(); /* A = 0 */ - } - for (i = 0; i < flen; i++) { - unsigned int K = filter[i].k; + /* mov32 A, X */ + case BPF_ALU | BPF_MOV | BPF_X: + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT1(add_2mod(0x40, a_reg, x_reg)); + EMIT2(0x89, add_2reg(0xC0, a_reg, x_reg)); + break; - switch (filter[i].code) { - case BPF_S_ALU_ADD_X: /* A += X; */ - seen |= SEEN_XREG; - EMIT2(0x01, 0xd8); /* add %ebx,%eax */ - break; - case BPF_S_ALU_ADD_K: /* A += K; */ - if (!K) - break; - if (is_imm8(K)) - EMIT3(0x83, 0xc0, K); /* add imm8,%eax */ - else - EMIT1_off32(0x05, K); /* add imm32,%eax */ - break; - case BPF_S_ALU_SUB_X: /* A -= X; */ - seen |= SEEN_XREG; - EMIT2(0x29, 0xd8); /* sub %ebx,%eax */ - break; - case BPF_S_ALU_SUB_K: /* A -= K */ - if (!K) - break; - if (is_imm8(K)) - EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */ - else - EMIT1_off32(0x2d, K); /* sub imm32,%eax */ - break; - case BPF_S_ALU_MUL_X: /* A *= X; */ - seen |= SEEN_XREG; - EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */ - break; - case BPF_S_ALU_MUL_K: /* A *= K */ - if (is_imm8(K)) - EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */ - else { - EMIT2(0x69, 0xc0); /* imul imm32,%eax */ - EMIT(K, 4); - } - break; - case BPF_S_ALU_DIV_X: /* A /= X; */ - seen |= SEEN_XREG; - EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ - if (ctx->pc_ret0 > 0) { - /* addrs[pc_ret0 - 1] is start address of target - * (addrs[i] - 4) is the address following this jmp - * ("xor %edx,%edx; div %ebx" being 4 bytes long) - */ - EMIT_COND_JMP(X86_JE, addrs[ctx->pc_ret0 - 1] - - (addrs[i] - 4)); - } else { - EMIT_COND_JMP(X86_JNE, 2 + 5); - CLEAR_A(); - EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */ - } - EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */ - break; - case BPF_S_ALU_MOD_X: /* A %= X; */ - seen |= SEEN_XREG; - EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ - if (ctx->pc_ret0 > 0) { - /* addrs[pc_ret0 - 1] is start address of target - * (addrs[i] - 6) is the address following this jmp - * ("xor %edx,%edx; div %ebx;mov %edx,%eax" being 6 bytes long) - */ - EMIT_COND_JMP(X86_JE, addrs[ctx->pc_ret0 - 1] - - (addrs[i] - 6)); - } else { - EMIT_COND_JMP(X86_JNE, 2 + 5); - CLEAR_A(); - EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 6)); /* jmp .+off32 */ - } - EMIT2(0x31, 0xd2); /* xor %edx,%edx */ - EMIT2(0xf7, 0xf3); /* div %ebx */ - EMIT2(0x89, 0xd0); /* mov %edx,%eax */ - break; - case BPF_S_ALU_MOD_K: /* A %= K; */ - if (K == 1) { - CLEAR_A(); - break; - } - EMIT2(0x31, 0xd2); /* xor %edx,%edx */ - EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ - EMIT2(0xf7, 0xf1); /* div %ecx */ - EMIT2(0x89, 0xd0); /* mov %edx,%eax */ - break; - case BPF_S_ALU_DIV_K: /* A /= K */ - if (K == 1) - break; - EMIT2(0x31, 0xd2); /* xor %edx,%edx */ - EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ - EMIT2(0xf7, 0xf1); /* div %ecx */ - break; - case BPF_S_ALU_AND_X: - seen |= SEEN_XREG; - EMIT2(0x21, 0xd8); /* and %ebx,%eax */ - break; - case BPF_S_ALU_AND_K: - if (K >= 0xFFFFFF00) { - EMIT2(0x24, K & 0xFF); /* and imm8,%al */ - } else if (K >= 0xFFFF0000) { - EMIT2(0x66, 0x25); /* and imm16,%ax */ - EMIT(K, 2); - } else { - EMIT1_off32(0x25, K); /* and imm32,%eax */ - } - break; - case BPF_S_ALU_OR_X: - seen |= SEEN_XREG; - EMIT2(0x09, 0xd8); /* or %ebx,%eax */ - break; - case BPF_S_ALU_OR_K: - if (is_imm8(K)) - EMIT3(0x83, 0xc8, K); /* or imm8,%eax */ - else - EMIT1_off32(0x0d, K); /* or 
imm32,%eax */ - break; - case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ - case BPF_S_ALU_XOR_X: - seen |= SEEN_XREG; - EMIT2(0x31, 0xd8); /* xor %ebx,%eax */ - break; - case BPF_S_ALU_XOR_K: /* A ^= K; */ - if (K == 0) - break; - if (is_imm8(K)) - EMIT3(0x83, 0xf0, K); /* xor imm8,%eax */ - else - EMIT1_off32(0x35, K); /* xor imm32,%eax */ - break; - case BPF_S_ALU_LSH_X: /* A <<= X; */ - seen |= SEEN_XREG; - EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ - break; - case BPF_S_ALU_LSH_K: - if (K == 0) - break; - else if (K == 1) - EMIT2(0xd1, 0xe0); /* shl %eax */ - else - EMIT3(0xc1, 0xe0, K); - break; - case BPF_S_ALU_RSH_X: /* A >>= X; */ - seen |= SEEN_XREG; - EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */ - break; - case BPF_S_ALU_RSH_K: /* A >>= K; */ - if (K == 0) - break; - else if (K == 1) - EMIT2(0xd1, 0xe8); /* shr %eax */ - else - EMIT3(0xc1, 0xe8, K); - break; - case BPF_S_ALU_NEG: - EMIT2(0xf7, 0xd8); /* neg %eax */ - break; - case BPF_S_RET_K: - if (!K) { - if (ctx->pc_ret0 == -1) - ctx->pc_ret0 = i; - CLEAR_A(); - } else { - EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ - } - /* fallinto */ - case BPF_S_RET_A: - if (seen_or_pass0) { - if (i != flen - 1) { - EMIT_JMP(cleanup_addr - addrs[i]); - break; - } - if (seen_or_pass0 & SEEN_XREG) - EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ - EMIT1(0xc9); /* leaveq */ - } - EMIT1(0xc3); /* ret */ - break; - case BPF_S_MISC_TAX: /* X = A */ - seen |= SEEN_XREG; - EMIT2(0x89, 0xc3); /* mov %eax,%ebx */ - break; - case BPF_S_MISC_TXA: /* A = X */ - seen |= SEEN_XREG; - EMIT2(0x89, 0xd8); /* mov %ebx,%eax */ - break; - case BPF_S_LD_IMM: /* A = K */ - if (!K) - CLEAR_A(); - else - EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ + /* neg A */ + case BPF_ALU | BPF_NEG: + case BPF_ALU64 | BPF_NEG: + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, a_reg)); + else if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + EMIT2(0xF7, add_1reg(0xD8, a_reg)); + break; + + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, a_reg)); + else if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + + switch (BPF_OP(insn->code)) { + case BPF_ADD: b3 = 0xC0; break; + case BPF_SUB: b3 = 0xE8; break; + case BPF_AND: b3 = 0xE0; break; + case BPF_OR: b3 = 0xC8; break; + case BPF_XOR: b3 = 0xF0; break; + } + + if (is_imm8(K)) + EMIT3(0x83, add_1reg(b3, a_reg), K); + else + EMIT2_off32(0x81, add_1reg(b3, a_reg), K); + break; + + case BPF_ALU64 | BPF_MOV | BPF_K: + /* optimization: if imm32 is positive, + * use 'mov eax, imm32' (which zero-extends imm32) + * to save 2 bytes + */ + if (K < 0) { + /* 'mov rax, imm32' sign extends imm32 */ + b1 = add_1mod(0x48, a_reg); + b2 = 0xC7; + b3 = 0xC0; + EMIT3_off32(b1, b2, add_1reg(b3, a_reg), K); break; - case BPF_S_LDX_IMM: /* X = K */ - seen |= SEEN_XREG; - if (!K) - CLEAR_X(); + } + + case BPF_ALU | BPF_MOV | BPF_K: + /* mov %eax, imm32 */ + if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + EMIT1_off32(add_1reg(0xB8, a_reg), K); + break; + + /* A %= X, A /= X, A %= K, A /= K */ + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_DIV | BPF_K: + 
case BPF_ALU64 | BPF_MOD | BPF_X: + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + EMIT1(0x50); /* push rax */ + EMIT1(0x52); /* push rdx */ + + if (BPF_SRC(insn->code) == BPF_X) + /* mov r11, X */ + EMIT_mov(AUX_REG, x_reg); + else + /* mov r11, K */ + EMIT3_off32(0x49, 0xC7, 0xC3, K); + + /* mov rax, A */ + EMIT_mov(BPF_REG_0, a_reg); + + /* xor edx, edx + * equivalent to 'xor rdx, rdx', but one byte less + */ + EMIT2(0x31, 0xd2); + + if (BPF_SRC(insn->code) == BPF_X) { + /* if (X == 0) return 0 */ + + /* cmp r11, 0 */ + EMIT4(0x49, 0x83, 0xFB, 0x00); + + /* jne .+9 (skip over pop, pop, xor and jmp) */ + EMIT2(X86_JNE, 1 + 1 + 2 + 5); + EMIT1(0x5A); /* pop rdx */ + EMIT1(0x58); /* pop rax */ + EMIT2(0x31, 0xc0); /* xor eax, eax */ + + /* jmp cleanup_addr + * addrs[i] - 11, because there are 11 bytes + * after this insn: div, mov, pop, pop, mov + */ + jmp_offset = ctx->cleanup_addr - (addrs[i] - 11); + EMIT1_off32(0xE9, jmp_offset); + } + + if (BPF_CLASS(insn->code) == BPF_ALU64) + /* div r11 */ + EMIT3(0x49, 0xF7, 0xF3); + else + /* div r11d */ + EMIT3(0x41, 0xF7, 0xF3); + + if (BPF_OP(insn->code) == BPF_MOD) + /* mov r11, rdx */ + EMIT3(0x49, 0x89, 0xD3); + else + /* mov r11, rax */ + EMIT3(0x49, 0x89, 0xC3); + + EMIT1(0x5A); /* pop rdx */ + EMIT1(0x58); /* pop rax */ + + /* mov A, r11 */ + EMIT_mov(a_reg, AUX_REG); + break; + + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU64 | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_X: + EMIT1(0x50); /* push rax */ + EMIT1(0x52); /* push rdx */ + + /* mov r11, A */ + EMIT_mov(AUX_REG, a_reg); + + if (BPF_SRC(insn->code) == BPF_X) + /* mov rax, X */ + EMIT_mov(BPF_REG_0, x_reg); + else + /* mov rax, K */ + EMIT3_off32(0x48, 0xC7, 0xC0, K); + + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, AUX_REG)); + else if (is_ereg(AUX_REG)) + EMIT1(add_1mod(0x40, AUX_REG)); + /* mul(q) r11 */ + EMIT2(0xF7, add_1reg(0xE0, AUX_REG)); + + /* mov r11, rax */ + EMIT_mov(AUX_REG, BPF_REG_0); + + EMIT1(0x5A); /* pop rdx */ + EMIT1(0x58); /* pop rax */ + + /* mov A, r11 */ + EMIT_mov(a_reg, AUX_REG); + break; + + /* shifts */ + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_ARSH | BPF_K: + case BPF_ALU64 | BPF_LSH | BPF_K: + case BPF_ALU64 | BPF_RSH | BPF_K: + case BPF_ALU64 | BPF_ARSH | BPF_K: + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, a_reg)); + else if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + + switch (BPF_OP(insn->code)) { + case BPF_LSH: b3 = 0xE0; break; + case BPF_RSH: b3 = 0xE8; break; + case BPF_ARSH: b3 = 0xF8; break; + } + EMIT3(0xC1, add_1reg(b3, a_reg), K); + break; + + case BPF_ALU | BPF_END | BPF_FROM_BE: + switch (K) { + case 16: + /* emit 'ror %ax, 8' to swap lower 2 bytes */ + EMIT1(0x66); + if (is_ereg(a_reg)) + EMIT1(0x41); + EMIT3(0xC1, add_1reg(0xC8, a_reg), 8); + break; + case 32: + /* emit 'bswap eax' to swap lower 4 bytes */ + if (is_ereg(a_reg)) + EMIT2(0x41, 0x0F); else - EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */ - break; - case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */ - seen |= SEEN_MEM; - EMIT3(0x8b, 0x45, 0xf0 - K*4); + EMIT1(0x0F); + EMIT1(add_1reg(0xC8, a_reg)); break; - case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */ - seen |= SEEN_XREG | SEEN_MEM; - EMIT3(0x8b, 0x5d, 0xf0 - K*4); - break; - case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */ - seen |= SEEN_MEM; - EMIT3(0x89, 0x45, 0xf0 - K*4); - break; - case BPF_S_STX: /* 
mem[K] = X : mov %ebx,off8(%rbp) */ - seen |= SEEN_XREG | SEEN_MEM; - EMIT3(0x89, 0x5d, 0xf0 - K*4); - break; - case BPF_S_LD_W_LEN: /* A = skb->len; */ - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len)); - else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, len), 4); - } - break; - case BPF_S_LDX_W_LEN: /* X = skb->len; */ - seen |= SEEN_XREG; - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov off8(%rdi),%ebx */ - EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len)); - else { - EMIT2(0x8b, 0x9f); - EMIT(offsetof(struct sk_buff, len), 4); - } - break; - case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - if (is_imm8(offsetof(struct sk_buff, protocol))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, protocol), 4); - } - EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */ - break; - case BPF_S_ANC_IFINDEX: - if (is_imm8(offsetof(struct sk_buff, dev))) { - /* movq off8(%rdi),%rax */ - EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev)); - } else { - EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */ - EMIT(offsetof(struct sk_buff, dev), 4); - } - EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */ - EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6)); - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */ - EMIT(offsetof(struct net_device, ifindex), 4); - break; - case BPF_S_ANC_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - if (is_imm8(offsetof(struct sk_buff, mark))) { - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark)); - } else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, mark), 4); - } - break; - case BPF_S_ANC_RXHASH: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - if (is_imm8(offsetof(struct sk_buff, hash))) { - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, hash)); - } else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, hash), 4); - } - break; - case BPF_S_ANC_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - if (is_imm8(offsetof(struct sk_buff, queue_mapping))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, queue_mapping), 4); - } - break; - case BPF_S_ANC_CPU: -#ifdef CONFIG_SMP - EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */ - EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */ -#else - CLEAR_A(); -#endif - break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - if (is_imm8(offsetof(struct sk_buff, vlan_tci))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, vlan_tci)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, vlan_tci), 4); - } - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); - if (filter[i].code == BPF_S_ANC_VLAN_TAG) { - EMIT3(0x80, 0xe4, 0xef); /* and $0xef,%ah */ - } else { - EMIT3(0xc1, 0xe8, 0x0c); /* shr $0xc,%eax */ - EMIT3(0x83, 0xe0, 0x01); /* and $0x1,%eax */ - } - break; - case BPF_S_ANC_PKTTYPE: - { - int 
off = pkt_type_offset(); - - if (off < 0) - return -EINVAL; - if (is_imm8(off)) { - /* movzbl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb6, 0x47, off); - } else { - /* movbl off32(%rdi),%eax */ - EMIT3(0x0f, 0xb6, 0x87); - EMIT(off, 4); - } - EMIT3(0x83, 0xe0, PKT_TYPE_MAX); /* and $0x7,%eax */ + case 64: + /* emit 'bswap rax' to swap 8 bytes */ + EMIT3(add_1mod(0x48, a_reg), 0x0F, + add_1reg(0xC8, a_reg)); break; } - case BPF_S_LD_W_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_word); -common_load: seen |= SEEN_DATAREF; - t_offset = func - (image + addrs[i]); - EMIT1_off32(0xbe, K); /* mov imm32,%esi */ - EMIT1_off32(0xe8, t_offset); /* call */ - break; - case BPF_S_LD_H_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_half); - goto common_load; - case BPF_S_LD_B_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_byte); - goto common_load; - case BPF_S_LDX_B_MSH: - func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh); - seen |= SEEN_DATAREF | SEEN_XREG; - t_offset = func - (image + addrs[i]); - EMIT1_off32(0xbe, K); /* mov imm32,%esi */ - EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */ - break; - case BPF_S_LD_W_IND: - func = sk_load_word; -common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; - t_offset = func - (image + addrs[i]); - if (K) { - if (is_imm8(K)) { - EMIT3(0x8d, 0x73, K); /* lea imm8(%rbx), %esi */ - } else { - EMIT2(0x8d, 0xb3); /* lea imm32(%rbx),%esi */ - EMIT(K, 4); - } - } else { - EMIT2(0x89,0xde); /* mov %ebx,%esi */ - } - EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */ - break; - case BPF_S_LD_H_IND: - func = sk_load_half; - goto common_load_ind; - case BPF_S_LD_B_IND: - func = sk_load_byte; - goto common_load_ind; - case BPF_S_JMP_JA: - t_offset = addrs[i + K] - addrs[i]; - EMIT_JMP(t_offset); - break; - COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE); - COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB); - COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE); - COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE); - COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE); - COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB); - COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE); - COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE); - -cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; - t_offset = addrs[i + filter[i].jt] - addrs[i]; - - /* same targets, can avoid doing the test :) */ - if (filter[i].jt == filter[i].jf) { - EMIT_JMP(t_offset); - break; - } + break; + + case BPF_ALU | BPF_END | BPF_FROM_LE: + break; + + /* ST: *(u8*)(a_reg + off) = imm */ + case BPF_ST | BPF_MEM | BPF_B: + if (is_ereg(a_reg)) + EMIT2(0x41, 0xC6); + else + EMIT1(0xC6); + goto st; + case BPF_ST | BPF_MEM | BPF_H: + if (is_ereg(a_reg)) + EMIT3(0x66, 0x41, 0xC7); + else + EMIT2(0x66, 0xC7); + goto st; + case BPF_ST | BPF_MEM | BPF_W: + if (is_ereg(a_reg)) + EMIT2(0x41, 0xC7); + else + EMIT1(0xC7); + goto st; + case BPF_ST | BPF_MEM | BPF_DW: + EMIT2(add_1mod(0x48, a_reg), 0xC7); + +st: if (is_imm8(insn->off)) + EMIT2(add_1reg(0x40, a_reg), insn->off); + else + EMIT1_off32(add_1reg(0x80, a_reg), insn->off); + + EMIT(K, bpf_size_to_x86_bytes(BPF_SIZE(insn->code))); + break; + + /* STX: *(u8*)(a_reg + off) = x_reg */ + case BPF_STX | BPF_MEM | BPF_B: + /* emit 'mov byte ptr [rax + off], al' */ + if (is_ereg(a_reg) || is_ereg(x_reg) || + /* have to add extra byte for x86 SIL, DIL regs */ + x_reg == BPF_REG_1 || x_reg == BPF_REG_2) + EMIT2(add_2mod(0x40, a_reg, x_reg), 0x88); + else + EMIT1(0x88); + goto stx; + case BPF_STX | BPF_MEM | BPF_H: + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT3(0x66, add_2mod(0x40, a_reg, x_reg), 0x89); + else + EMIT2(0x66, 0x89); + goto stx; + 
case BPF_STX | BPF_MEM | BPF_W: + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT2(add_2mod(0x40, a_reg, x_reg), 0x89); + else + EMIT1(0x89); + goto stx; + case BPF_STX | BPF_MEM | BPF_DW: + EMIT2(add_2mod(0x48, a_reg, x_reg), 0x89); +stx: if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off); + else + EMIT1_off32(add_2reg(0x80, a_reg, x_reg), + insn->off); + break; + + /* LDX: a_reg = *(u8*)(x_reg + off) */ + case BPF_LDX | BPF_MEM | BPF_B: + /* emit 'movzx rax, byte ptr [rax + off]' */ + EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB6); + goto ldx; + case BPF_LDX | BPF_MEM | BPF_H: + /* emit 'movzx rax, word ptr [rax + off]' */ + EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB7); + goto ldx; + case BPF_LDX | BPF_MEM | BPF_W: + /* emit 'mov eax, dword ptr [rax+0x14]' */ + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT2(add_2mod(0x40, x_reg, a_reg), 0x8B); + else + EMIT1(0x8B); + goto ldx; + case BPF_LDX | BPF_MEM | BPF_DW: + /* emit 'mov rax, qword ptr [rax+0x14]' */ + EMIT2(add_2mod(0x48, x_reg, a_reg), 0x8B); +ldx: /* if insn->off == 0 we can save one extra byte, but + * special case of x86 r13 which always needs an offset + * is not worth the hassle + */ + if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, x_reg, a_reg), insn->off); + else + EMIT1_off32(add_2reg(0x80, x_reg, a_reg), + insn->off); + break; + + /* STX XADD: lock *(u32*)(a_reg + off) += x_reg */ + case BPF_STX | BPF_XADD | BPF_W: + /* emit 'lock add dword ptr [rax + off], eax' */ + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x01); + else + EMIT2(0xF0, 0x01); + goto xadd; + case BPF_STX | BPF_XADD | BPF_DW: + EMIT3(0xF0, add_2mod(0x48, a_reg, x_reg), 0x01); +xadd: if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off); + else + EMIT1_off32(add_2reg(0x80, a_reg, x_reg), + insn->off); + break; + + /* call */ + case BPF_JMP | BPF_CALL: + func = (u8 *) __bpf_call_base + K; + jmp_offset = func - (image + addrs[i]); + if (ctx->seen_ld_abs) { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop %r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } + if (!K || !is_simm32(jmp_offset)) { + pr_err("unsupported bpf func %d addr %p image %p\n", + K, func, image); + return -EINVAL; + } + EMIT1_off32(0xE8, jmp_offset); + if (ctx->seen_ld_abs) { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } + break; + + /* cond jump */ + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: + /* cmp a_reg, x_reg */ + EMIT3(add_2mod(0x48, a_reg, x_reg), 0x39, + add_2reg(0xC0, a_reg, x_reg)); + goto emit_cond_jmp; + + case BPF_JMP | BPF_JSET | BPF_X: + /* test a_reg, x_reg */ + EMIT3(add_2mod(0x48, a_reg, x_reg), 0x85, + add_2reg(0xC0, a_reg, x_reg)); + goto emit_cond_jmp; + + case BPF_JMP | BPF_JSET | BPF_K: + /* test a_reg, imm32 */ + EMIT1(add_1mod(0x48, a_reg)); + EMIT2_off32(0xF7, add_1reg(0xC0, a_reg), K); + goto emit_cond_jmp; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + /* cmp a_reg, imm8/32 */ + EMIT1(add_1mod(0x48, a_reg)); + + if (is_imm8(K)) + EMIT3(0x83, add_1reg(0xF8, a_reg), K); + else + EMIT2_off32(0x81, add_1reg(0xF8, a_reg), K); + +emit_cond_jmp: /* convert 
BPF opcode to x86 */ + switch (BPF_OP(insn->code)) { + case BPF_JEQ: + jmp_cond = X86_JE; + break; + case BPF_JSET: + case BPF_JNE: + jmp_cond = X86_JNE; + break; + case BPF_JGT: + /* GT is unsigned '>', JA in x86 */ + jmp_cond = X86_JA; + break; + case BPF_JGE: + /* GE is unsigned '>=', JAE in x86 */ + jmp_cond = X86_JAE; + break; + case BPF_JSGT: + /* signed '>', GT in x86 */ + jmp_cond = X86_JG; + break; + case BPF_JSGE: + /* signed '>=', GE in x86 */ + jmp_cond = X86_JGE; + break; + default: /* to silence gcc warning */ + return -EFAULT; + } + jmp_offset = addrs[i + insn->off] - addrs[i]; + if (is_imm8(jmp_offset)) { + EMIT2(jmp_cond, jmp_offset); + } else if (is_simm32(jmp_offset)) { + EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); + } else { + pr_err("cond_jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } + + break; - switch (filter[i].code) { - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JEQ_X: - seen |= SEEN_XREG; - EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */ - break; - case BPF_S_JMP_JSET_X: - seen |= SEEN_XREG; - EMIT2(0x85, 0xd8); /* test %ebx,%eax */ - break; - case BPF_S_JMP_JEQ_K: - if (K == 0) { - EMIT2(0x85, 0xc0); /* test %eax,%eax */ - break; - } - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGE_K: - if (K <= 127) - EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */ + case BPF_JMP | BPF_JA: + jmp_offset = addrs[i + insn->off] - addrs[i]; + if (!jmp_offset) + /* optimize out nop jumps */ + break; +emit_jmp: + if (is_imm8(jmp_offset)) { + EMIT2(0xEB, jmp_offset); + } else if (is_simm32(jmp_offset)) { + EMIT1_off32(0xE9, jmp_offset); + } else { + pr_err("jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } + break; + + case BPF_LD | BPF_IND | BPF_W: + func = sk_load_word; + goto common_load; + case BPF_LD | BPF_ABS | BPF_W: + func = CHOOSE_LOAD_FUNC(K, sk_load_word); +common_load: ctx->seen_ld_abs = true; + jmp_offset = func - (image + addrs[i]); + if (!func || !is_simm32(jmp_offset)) { + pr_err("unsupported bpf func %d addr %p image %p\n", + K, func, image); + return -EINVAL; + } + if (BPF_MODE(insn->code) == BPF_ABS) { + /* mov %esi, imm32 */ + EMIT1_off32(0xBE, K); + } else { + /* mov %rsi, x_reg */ + EMIT_mov(BPF_REG_2, x_reg); + if (K) { + if (is_imm8(K)) + /* add %esi, imm8 */ + EMIT3(0x83, 0xC6, K); else - EMIT1_off32(0x3d, K); /* cmp imm32,%eax */ - break; - case BPF_S_JMP_JSET_K: - if (K <= 0xFF) - EMIT2(0xa8, K); /* test imm8,%al */ - else if (!(K & 0xFFFF00FF)) - EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */ - else if (K <= 0xFFFF) { - EMIT2(0x66, 0xa9); /* test imm16,%ax */ - EMIT(K, 2); - } else { - EMIT1_off32(0xa9, K); /* test imm32,%eax */ - } - break; + /* add %esi, imm32 */ + EMIT2_off32(0x81, 0xC6, K); } - if (filter[i].jt != 0) { - if (filter[i].jf && f_offset) - t_offset += is_near(f_offset) ? 2 : 5; - EMIT_COND_JMP(t_op, t_offset); - if (filter[i].jf) - EMIT_JMP(f_offset); - break; - } - EMIT_COND_JMP(f_op, f_offset); - break; + } + /* skb pointer is in R6 (%rbx), it will be copied into + * %rdi if skb_copy_bits() call is necessary. + * sk_load_* helpers also use %r10 and %r9d. 
+ * See bpf_jit.S + */ + EMIT1_off32(0xE8, jmp_offset); /* call */ + break; + + case BPF_LD | BPF_IND | BPF_H: + func = sk_load_half; + goto common_load; + case BPF_LD | BPF_ABS | BPF_H: + func = CHOOSE_LOAD_FUNC(K, sk_load_half); + goto common_load; + case BPF_LD | BPF_IND | BPF_B: + func = sk_load_byte; + goto common_load; + case BPF_LD | BPF_ABS | BPF_B: + func = CHOOSE_LOAD_FUNC(K, sk_load_byte); + goto common_load; + + case BPF_JMP | BPF_EXIT: + if (i != insn_cnt - 1) { + jmp_offset = ctx->cleanup_addr - addrs[i]; + goto emit_jmp; + } + /* update cleanup_addr */ + ctx->cleanup_addr = proglen; + /* mov rbx, qword ptr [rbp-X] */ + EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize); + /* mov r13, qword ptr [rbp-X] */ + EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8); + /* mov r14, qword ptr [rbp-X] */ + EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16); + /* mov r15, qword ptr [rbp-X] */ + EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24); + + EMIT1(0xC9); /* leave */ + EMIT1(0xC3); /* ret */ + break; + default: - /* hmm, too complex filter, give up with jit compiler */ + /* By design x64 JIT should support all BPF instructions + * This error will be seen if new instruction was added + * to interpreter, but not to JIT + * or if there is junk in sk_filter + */ + pr_err("bpf_jit: unknown opcode %02x\n", insn->code); return -EINVAL; } + ilen = prog - temp; if (image) { if (unlikely(proglen + ilen > oldproglen)) { - pr_err("bpb_jit_compile fatal error\n"); + pr_err("bpf_jit_compile fatal error\n"); return -EFAULT; } memcpy(image + proglen, temp, ilen); @@ -726,21 +859,14 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; addrs[i] = proglen; prog = temp; } - /* last bpf instruction is always a RET : - * use it to give the cleanup instruction(s) addr - */ - ctx->cleanup_addr = proglen - 1; /* ret */ - if (seen_or_pass0) - ctx->cleanup_addr -= 1; /* leaveq */ - if (seen_or_pass0 & SEEN_XREG) - ctx->cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ - - ctx->seen = seen; - return proglen; } void bpf_jit_compile(struct sk_filter *prog) +{ +} + +void bpf_int_jit_compile(struct sk_filter *prog) { struct bpf_binary_header *header = NULL; int proglen, oldproglen = 0; @@ -768,8 +894,6 @@ void bpf_jit_compile(struct sk_filter *prog) addrs[i] = proglen; } ctx.cleanup_addr = proglen; - ctx.seen = SEEN_XREG | SEEN_DATAREF | SEEN_MEM; - ctx.pc_ret0 = -1; for (pass = 0; pass < 10; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); diff --git a/include/linux/filter.h b/include/linux/filter.h index 4457b383961c..9d5ae0a2c954 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -207,6 +207,9 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to); void sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); +u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +void bpf_int_jit_compile(struct sk_filter *fp); + #ifdef CONFIG_BPF_JIT #include #include diff --git a/net/core/filter.c b/net/core/filter.c index c442a0d7d0f7..32c5b44c537e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1524,6 +1524,10 @@ out_err: return ERR_PTR(err); } +void __weak bpf_int_jit_compile(struct sk_filter *prog) +{ +} + static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, struct sock *sk) { @@ -1544,9 +1548,12 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, /* JIT compiler couldn't process this filter, so do the * internal BPF translation for the optimized interpreter. 
*/ - if (!fp->jited) + if (!fp->jited) { fp = __sk_migrate_filter(fp, sk); + /* Probe if internal BPF can be jit-ed */ + bpf_int_jit_compile(fp); + } return fp; } -- cgit v1.2.3-71-gd317 From 3b514d24e200fcdcde0a57c354a51d3677a86743 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:47 -0400 Subject: cgroup: skip refcnting on normal root csses and cgrp_dfl_root self css 9395a4500404 ("cgroup: enable refcnting for root csses") enabled reference counting for root csses (cgroup_subsys_states) so that cgroup's self csses can be used to manage the lifetime of the containing cgroups. Unfortunately, this change was incorrect. During early init, cgrp_dfl_root self css refcnt is used. percpu_ref can't be initialized during early init and its initialization is deferred till cgroup_init() time. This means that cpu was using a percpu_ref which wasn't properly initialized. Due to the way percpu variables are laid out on x86, this didn't blow up immediately but ended up incrementing and decrementing the percpu variable at offset zero, whatever it may be; however, on other archs, this caused a fault and early boot failure. As cgroup self csses for root cgroups of non-dfl hierarchies need working refcounting, we can't revert 9395a4500404. This patch adds CSS_NO_REF which explicitly inhibits reference counting on the css and sets it on all normal (non-self) csses and cgroup_dfl_root self css. v2: cgrp_dfl_root.self is the offending one. Set the flag on it. Signed-off-by: Tejun Heo Reported-by: Stephen Warren Tested-by: Stephen Warren Fixes: 9395a4500404 ("cgroup: enable refcnting for root csses") --- include/linux/cgroup.h | 11 ++++++++--- kernel/cgroup.c | 11 +++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 76dadd77a120..1737db0c63fe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -77,6 +77,7 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { + CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ }; @@ -88,7 +89,8 @@ enum { */ static inline void css_get(struct cgroup_subsys_state *css) { - percpu_ref_get(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + percpu_ref_get(&css->refcnt); } /** @@ -103,7 +105,9 @@ static inline void css_get(struct cgroup_subsys_state *css) */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { - return percpu_ref_tryget_live(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + return percpu_ref_tryget_live(&css->refcnt); + return true; } /** @@ -114,7 +118,8 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css) */ static inline void css_put(struct cgroup_subsys_state *css) { - percpu_ref_put(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c01e8e8dfad0..0343d7ee6d62 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4593,11 +4593,17 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting.
+ */ + css->flags |= CSS_NO_REF; + if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4636,6 +4642,8 @@ int __init cgroup_init_early(void) int i; init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4684,7 +4692,6 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit v1.2.3-71-gd317 From 5c9d535b893f30266ea29fe377cb9b002fcd76aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove css_parent() cgroup in general is moving towards using cgroup_subsys_state as the fundamental structural component and css_parent() was introduced to convert from using cgroup->parent to css->parent. It was quite some time ago and we're moving forward with making css more prominent. This patch drops the trivial wrapper css_parent() and let the users dereference css->parent. While at it, explicitly mark fields of css which are public and immutable. v2: New usage from device_cgroup.c converted. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Neil Horman Acked-by: "David S. Miller" Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Johannes Weiner --- block/blk-cgroup.h | 2 +- include/linux/cgroup.h | 29 +++++++++++------------------ kernel/cgroup.c | 8 ++++---- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/cpuacct.c | 2 +- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 14 +++++++------- net/core/netclassid_cgroup.c | 2 +- net/core/netprio_cgroup.c | 2 +- security/device_cgroup.c | 8 ++++---- 12 files changed, 34 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 371fe8e92ab5..d692b29c083a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -204,7 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { - return css_to_blkcg(css_parent(&blkcg->css)); + return css_to_blkcg(blkcg->css.parent); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1737db0c63fe..2549493d518d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -48,22 +48,28 @@ enum cgroup_subsys_id { }; #undef SUBSYS -/* Per-subsystem/per-cgroup state maintained by the system. */ +/* + * Per-subsystem/per-cgroup state maintained by the system. This is the + * fundamental structural building block that controllers deal with. + * + * Fields marked with "PI:" are public and immutable and may be accessed + * directly without synchronization. + */ struct cgroup_subsys_state { - /* the cgroup that this css is attached to */ + /* PI: the cgroup that this css is attached to */ struct cgroup *cgroup; - /* the cgroup subsystem that this css is attached to */ + /* PI: the cgroup subsystem that this css is attached to */ struct cgroup_subsys *ss; /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* the parent css */ + /* PI: the parent css */ struct cgroup_subsys_state *parent; /* - * Subsys-unique ID. 
0 is unused and root is always 1. The + * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). */ int id; @@ -669,19 +675,6 @@ struct cgroup_subsys { #include #undef SUBSYS -/** - * css_parent - find the parent css - * @css: the target cgroup_subsys_state - * - * Return the parent css of @css. This function is guaranteed to return - * non-NULL parent as long as @css isn't the root. - */ -static inline -struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) -{ - return css->parent; -} - /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0343d7ee6d62..929bbbc539e9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3176,10 +3176,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return next; - pos = css_parent(pos); + pos = pos->parent; } return NULL; @@ -3261,12 +3261,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - return css_parent(pos); + return pos->parent; } static bool cgroup_has_live_children(struct cgroup *cgrp) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6b4e60e33a9a..a79e40f9d700 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -59,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) static struct freezer *parent_freezer(struct freezer *freezer) { - return css_freezer(css_parent(&freezer->css)); + return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2f4b08b8db24..5b2a31082f4f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -124,7 +124,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) static inline struct cpuset *parent_cs(struct cpuset *cs) { - return css_cs(css_parent(&cs->css)); + return css_cs(cs->css.parent); } #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..ac61ad1a5f9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7586,7 +7586,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css_parent(css)); + struct task_group *parent = css_tg(css->parent); if (parent) sched_online_group(tg, parent); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index c143ee380e3a..9cf350c94ec4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - return css_ca(css_parent(&ca->css)); + return css_ca(ca->css.parent); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a380681ab3cf..493f758445e7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -52,7 +52,7 @@ static inline bool hugetlb_cgroup_is_root(struct 
hugetlb_cgroup *h_cg) static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); + return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b638a79209ee..a5e0417b4f9a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1540,7 +1540,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* root ? */ - if (!css_parent(&memcg->css)) + if (!memcg->css.parent) return vm_swappiness; return memcg->swappiness; @@ -4909,7 +4909,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); mutex_lock(&memcg_create_mutex); @@ -5207,8 +5207,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, if (!memcg->use_hierarchy) goto out; - while (css_parent(&memcg->css)) { - memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + while (memcg->css.parent) { + memcg = mem_cgroup_from_css(memcg->css.parent); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5443,7 +5443,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent); if (val > 100 || !parent) return -EINVAL; @@ -5790,7 +5790,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent); /* cannot set to root cgroup and only 0 and 1 are allowed */ if (!parent || !((val == 0) || (val == 1))) @@ -6407,7 +6407,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); + struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 22931e1b99b4..30d903b19c62 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -42,7 +42,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_cls_state *cs = css_cls_state(css); - struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); + struct cgroup_cls_state *parent = css_cls_state(css->parent); if (parent) cs->classid = parent->classid; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index b990cefd906b..2f385b9bccc0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *parent_css = css_parent(css); + struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7dbac4061b1c..ce14a31b1337 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -182,7 +182,7 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg) static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); - struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css)); + struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); @@ -455,7 +455,7 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { - struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); + struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; @@ -476,7 +476,7 @@ static int parent_has_perm(struct dev_cgroup *childcg, static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { - struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); + struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; @@ -614,7 +614,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; - struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css)); + struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3-71-gd317 From d51f39b05ce0008118c45945e681b20484990571 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove cgroup->parent cgroup->parent is redundant as cgroup->self.parent can also be used to determine the parent cgroup and we're moving towards using cgroup_subsys_states as the fundamental structural blocks. This patch introduces cgroup_parent() which follows cgroup->self.parent and removes cgroup->parent. 
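For illustration, a minimal sketch of the conversion: the helper below mirrors the hunk added in this patch, while the caller is a hypothetical example of the pattern that replaces direct cgrp->parent dereferences (cgroup_is_descendant() in the diff performs exactly this walk).

	/* derive the parent cgroup from the self css; NULL for the root */
	static struct cgroup *cgroup_parent(struct cgroup *cgrp)
	{
		struct cgroup_subsys_state *parent_css = cgrp->self.parent;

		if (parent_css)
			return container_of(parent_css, struct cgroup, self);
		return NULL;
	}

	/* hypothetical caller: walk towards the root via the self css */
	static bool is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
	{
		while (cgrp) {
			if (cgrp == ancestor)
				return true;
			cgrp = cgroup_parent(cgrp);
		}
		return false;
	}
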
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 52 +++++++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2549493d518d..fd538f4c2bb6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -178,7 +178,6 @@ struct cgroup { struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 929bbbc539e9..8c67a739aea4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,6 +218,15 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -260,9 +269,9 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; - while (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ss->id))) - cgrp = cgrp->parent; + while (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + cgrp = cgroup_parent(cgrp); return cgroup_css(cgrp, ss); } @@ -307,7 +316,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) while (cgrp) { if (cgrp == ancestor) return true; - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } return false; } @@ -454,7 +463,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (cgrp->populated_kn) kernfs_notify(cgrp->populated_kn); - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -2018,7 +2027,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && dst_cgrp->child_subsys_mask) return -EBUSY; @@ -2427,7 +2436,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); return 0; } @@ -2610,8 +2619,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, /* unavailable or not enabled on the parent? */ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; goto out_unlock; } @@ -2640,7 +2649,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. 
*/ - if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { ret = -EBUSY; goto out_unlock; } @@ -2898,9 +2907,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { @@ -4092,14 +4101,14 @@ static void css_free_work_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); - if (cgrp->parent) { + if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ - cgroup_put(cgrp->parent); + cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); kfree(cgrp); } else { @@ -4163,8 +4172,8 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; - if (cgrp->parent) { - css->parent = cgroup_css(cgrp->parent, ss); + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } @@ -4218,7 +4227,7 @@ static void offline_css(struct cgroup_subsys_state *css) */ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - struct cgroup *parent = cgrp->parent; + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; int err; @@ -4251,7 +4260,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_clear_dir; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { + cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) @@ -4309,7 +4318,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); - cgrp->parent = parent; cgrp->self.parent = &parent->self; cgrp->root = root; @@ -4336,7 +4344,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4531,8 +4539,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); - check_for_release(cgrp->parent); + set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); + check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); -- cgit v1.2.3-71-gd317 From d5c419b68e368fdd9f1857bf8d4bb4480edb9b80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: move cgroup->sibling and ->children into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. Let's move cgroup->sibling and ->children into cgroup_subsys_state. This is pure move without functional change and only cgroup->self's fields are actually used. 
Other csses will make use of the fields later. While at it, update init_and_link_css() so that it zeroes the whole css before initializing it and remove explicit zeroing of ->flags. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 11 ++++------- kernel/cgroup.c | 38 ++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index fd538f4c2bb6..cf8ba26b7c6e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -68,6 +68,10 @@ struct cgroup_subsys_state { /* PI: the parent css */ struct cgroup_subsys_state *parent; + /* siblings list anchored at the parent's ->children */ + struct list_head sibling; + struct list_head children; + /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). @@ -171,13 +175,6 @@ struct cgroup { */ int populated_cnt; - /* - * We link our 'sibling' struct into our parent's 'children'. - * Our children link their 'sibling' into our 'children'. - */ - struct list_head sibling; /* my parent's children */ - struct list_head children; /* my children */ - struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c67a739aea4..5385839e727b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -378,7 +378,7 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->children, sibling) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ @@ -870,7 +870,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); @@ -1432,7 +1432,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.children)) { + if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } @@ -1512,8 +1512,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1612,7 +1612,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) link_css_set(&tmp_links, cset, root_cgrp); up_write(&css_set_rwsem); - BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); @@ -3128,11 +3128,11 @@ css_next_child(struct cgroup_subsys_state *pos_css, * cgroup is removed or iteration and removal race. 
*/ if (!pos) { - next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); } else if (likely(!cgroup_is_dead(pos))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { - list_for_each_entry_rcu(next, &cgrp->children, sibling) + list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) if (next->serial_nr > pos->serial_nr) break; } @@ -3142,12 +3142,12 @@ css_next_child(struct cgroup_subsys_state *pos_css, * the next sibling; however, it might have @ss disabled. If so, * fast-forward to the next enabled one. */ - while (&next->sibling != &cgrp->children) { + while (&next->self.sibling != &cgrp->self.children) { struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); if (next_css) return next_css; - next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); } return NULL; } @@ -3283,7 +3283,7 @@ static bool cgroup_has_live_children(struct cgroup *cgrp) struct cgroup *child; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { + list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { if (!cgroup_is_dead(child)) { rcu_read_unlock(); return true; @@ -4144,7 +4144,7 @@ static void css_release_work_fn(struct work_struct *work) } else { /* cgroup release path */ mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); + list_del_rcu(&cgrp->self.sibling); mutex_unlock(&cgroup_mutex); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); @@ -4168,9 +4168,11 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { cgroup_get(cgrp); + memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; - css->flags = 0; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4344,7 +4346,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4507,9 +4509,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Make sure there's no live children. We can't test ->children - * emptiness as dead children linger on it while being destroyed; - * otherwise, "rmdir parent/child parent" may fail with -EBUSY. + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. */ if (cgroup_has_live_children(cgrp)) return -EBUSY; -- cgit v1.2.3-71-gd317 From 0cb51d71c1fa9234afe4213089844be76ec1765a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: move cgroup->serial_nr into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. All csses including the cgroup->self and actual ones now form trees through css->children and ->sibling which follow the same rules as what cgroup->children and ->sibling followed. This patch moves cgroup->serial_nr which is used to implement css iteration into css. 
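The resumable iteration that serial numbers buy can be sketched in plain C (a minimal user-space illustration, not the kernel code; the kernel's equivalent is the list_for_each_entry_rcu() fallback in css_next_child()):

	#include <stdio.h>

	struct node { unsigned long long serial_nr; struct node *next; };

	/* Siblings are appended in creation order, so the list is sorted by
	 * serial_nr; resuming after "last" means finding the first newer entry. */
	static struct node *resume_after(struct node *head, unsigned long long last)
	{
		struct node *pos;

		for (pos = head; pos; pos = pos->next)
			if (pos->serial_nr > last)
				return pos;
		return NULL;
	}

	int main(void)
	{
		struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
		struct node *pos = resume_after(&a, 2);	/* as if node 2 went away */

		printf("resume at serial %llu\n", pos ? pos->serial_nr : 0);
		return 0;
	}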
Note that all csses, regardless of their types, allocate their serial numbers from the same monotonically increasing counter. This doesn't affect the ordering needed by css iteration or cause any other material behavior changes. This will be used to update css iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 16 ++++++++-------- kernel/cgroup.c | 20 +++++++++++--------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cf8ba26b7c6e..ebe7ce49f4b7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -80,6 +80,14 @@ struct cgroup_subsys_state { unsigned int flags; + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all csses. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr and + * used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; @@ -178,14 +186,6 @@ struct cgroup { struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ - /* - * Monotonically increasing unique serial number which defines a - * uniform order among all cgroups. It's guaranteed that all - * ->children lists are in the ascending order of ->serial_nr. - * It's used to allow interrupting and resuming iterations. - */ - u64 serial_nr; - /* the bitmask of subsystems enabled on the child cgroups */ unsigned int child_subsys_mask; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dcb06e181ce4..d5af128ec1ec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,14 +157,13 @@ static int cgroup_root_count; static DEFINE_IDR(cgroup_hierarchy_idr); /* - * Assign a monotonically increasing serial number to cgroups. It - * guarantees cgroups with bigger numbers are newer than those with smaller - * numbers. Also, as cgroups are always appended to the parent's - * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list. Protected by - * cgroup_mutex. + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. Protected by cgroup_mutex. */ -static u64 cgroup_serial_nr_next = 1; +static u64 css_serial_nr_next = 1; /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. 
This avoids us having to do @@ -3133,7 +3132,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->serial_nr > pos->serial_nr) + if (next->self.serial_nr > pos->self.serial_nr) break; } @@ -4168,6 +4167,8 @@ static void css_release(struct percpu_ref *ref) static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { + lockdep_assert_held(&cgroup_mutex); + cgroup_get(cgrp); memset(css, 0, sizeof(*css)); @@ -4175,6 +4176,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4348,7 +4350,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ kernfs_get(kn); - cgrp->serial_nr = cgroup_serial_nr_next++; + cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); -- cgit v1.2.3-71-gd317 From de3f034182ecbf0efbcef7ab8b253c6c3049a592 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: introduce CSS_RELEASED and reduce css iteration fallback window css iterations allow the caller to drop the RCU read lock. As long as the caller keeps the current position accessible, it can simply re-grab the RCU read lock later and continue iteration. This is achieved by using CGRP_DEAD to detect whether the current position's next pointer is safe to dereference and, if not, re-iterate from the beginning to the next position using ->serial_nr. CGRP_DEAD is used as the marker to invalidate the next pointer and the only requirement is that the marker is set before the next sibling starts its RCU grace period. Because CGRP_DEAD is set at the end of cgroup_destroy_locked() but the cgroup is unlinked when the reference count reaches zero, we currently have a rather large window where this fallback re-iteration logic can be triggered. This patch introduces CSS_RELEASED which is set when a css is unlinked from its sibling list. This still keeps the re-iteration logic working while drastically reducing the window of its activation. While at it, rewrite the comment in css_next_child() to reflect the new flag and better explain the synchronization. This will also enable iterating csses directly instead of through cgroups. v2: CSS_RELEASED now assigned to 1 << 2 as 1 << 0 is used by CSS_NO_REF.
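For reference, the caller-side idiom this machinery serves looks roughly like the sketch below (an assumed usage pattern, not code from this patch; do_sleepable_work() is a hypothetical helper):

	static void visit_children_sleepable(struct cgroup_subsys_state *parent)
	{
		struct cgroup_subsys_state *pos;

		rcu_read_lock();
		css_for_each_child(pos, parent) {
			if (!css_tryget_online(pos))
				continue;
			rcu_read_unlock();

			do_sleepable_work(pos);		/* may block; @pos is pinned */

			rcu_read_lock();
			css_put(pos);	/* RCU keeps @pos accessible for the resume */
		}
		rcu_read_unlock();
	}

If @pos gets released while the lock is dropped, the next css_next_child() call sees CSS_RELEASED and falls back to the ->serial_nr scan.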
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ebe7ce49f4b7..5375582ea5f6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -97,6 +97,7 @@ struct cgroup_subsys_state { enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ + CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ }; /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5af128ec1ec..5544e685f2da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3108,27 +3108,28 @@ css_next_child(struct cgroup_subsys_state *pos_css, cgroup_assert_mutex_or_rcu_locked(); /* - * @pos could already have been removed. Once a cgroup is removed, - * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD assertion is serialized and happens - * before the cgroup is taken off the ->sibling list, if we see it - * unasserted, it's guaranteed that the next sibling hasn't - * finished its grace period even if it's already removed, and thus - * safe to dereference from this RCU critical section. If - * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed - * to be visible as %true here. + * @pos could already have been unlinked from the sibling list. + * Once a cgroup is removed, its ->sibling.next is no longer + * updated when its next sibling changes. CSS_RELEASED is set when + * @pos is taken off list, at which time its next pointer is valid, + * and, as releases are serialized, the one pointed to by the next + * pointer is guaranteed to not have started release yet. This + * implies that if we observe !CSS_RELEASED on @pos in this RCU + * critical section, the one pointed to by its next pointer is + * guaranteed to not have finished its RCU grace period even if we + * have dropped rcu_read_lock() inbetween iterations. * - * If @pos is dead, its next pointer can't be dereferenced; - * however, as each cgroup is given a monotonically increasing - * unique serial number and always appended to the sibling list, - * the next one can be found by walking the parent's children until - * we see a cgroup with higher serial number than @pos's. While - * this path can be slower, it's taken only when either the current - * cgroup is removed or iteration and removal race. + * If @pos has CSS_RELEASED set, its next pointer can't be + * dereferenced; however, as each css is given a monotonically + * increasing unique serial number and always appended to the + * sibling list, the next one can be found by walking the parent's + * children until the first css with higher serial number than + * @pos's. While this path can be slower, it happens iff iteration + * races against release and the race window is very small. 
*/ if (!pos) { next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!cgroup_is_dead(pos))) { + } else if (likely(!(pos->self.flags & CSS_RELEASED))) { next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) if (next->self.serial_nr > pos->self.serial_nr) break; } @@ -4139,6 +4140,7 @@ static void css_release_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); + css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { @@ -4525,10 +4527,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details. + * creation by disabling cgroup_lock_live_group(). */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit v1.2.3-71-gd317 From c2931b70a32c705b9bd5762f5044f9eac8a52bb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: iterate cgroup_subsys_states directly Currently, css_next_child() is implemented as finding the next child cgroup which has the css enabled, which used to be the only way to do it as only cgroups participated in sibling lists and thus could be iterated. This works as long as what's required during iteration is not missing online csses; however, it turns out that there are use cases where offlined but not yet released csses need to be iterated. This is difficult to implement through cgroup iteration on the unified hierarchy as there may be multiple dying csses for the same subsystem associated with a single cgroup. After the recent changes, the cgroup self and regular csses behave identically in how they're linked and unlinked from the sibling lists, including assertion of CSS_RELEASED, and css_next_child() can simply switch to iterating csses directly. This both simplifies the logic and ensures that all visible non-released csses are included in the iteration whether there are multiple dying csses for a subsystem or not. As all other iterators depend on css_next_child() for sibling iteration, this changes behaviors of all css iterators. Add and update explanations on the css states which are included in traversal to all iterators. As css iteration could always contain offlined csses, this shouldn't break any of the current users, and new usages which need iteration of all on- and offline csses can make use of the new semantics. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup.h | 44 ++++++++++++++++++++--------------- kernel/cgroup.c | 62 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 63 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5375582ea5f6..f2ff578fc03a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -764,14 +764,14 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * - * Walk @parent's children. Must be called under rcu_read_lock(). A child - * css which hasn't finished ->css_online() or already has finished - * ->css_offline() may show up during traversal and it's each subsystem's - * responsibility to verify that each @pos is alive. + * Walk @parent's children. Must be called under rcu_read_lock().
* - * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, a css which finished ->css_online() is - * guaranteed to be visible in the future iterations. + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until @@ -794,17 +794,16 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos); * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the - * first node to be visited. Must be called under rcu_read_lock(). A - * descendant css which hasn't finished ->css_online() or already has - * finished ->css_offline() may show up during traversal and it's each - * subsystem's responsibility to verify that each @pos is alive. + * first node to be visited. Must be called under rcu_read_lock(). * - * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant css which finished ->css_online() is - * guaranteed to be visible in the future iterations. + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. * - * In other words, the following guarantees that a descendant can't escape + * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) @@ -860,8 +859,17 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last - * node to be visited. Note that the walk visibility guarantee described - * in pre-order walk doesn't apply the same to post-order walks. + * node to be visited. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * Note that the walk visibility guarantee example described in pre-order + * walk doesn't apply the same to post-order walks. 
*/ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5544e685f2da..097a1fc1e1e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3089,21 +3089,25 @@ static int cgroup_task_count(const struct cgroup *cgrp) /** * css_next_child - find the next child of a given css - * @pos_css: the current position (%NULL to initiate traversal) - * @parent_css: css whose children to walk + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk * - * This function returns the next child of @parent_css and should be called + * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is - * that @parent_css and @pos_css are accessible. The next sibling is - * guaranteed to be returned regardless of their states. + * that @parent and @pos are accessible. The next sibling is guaranteed to + * be returned regardless of their states. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ -struct cgroup_subsys_state * -css_next_child(struct cgroup_subsys_state *pos_css, - struct cgroup_subsys_state *parent_css) +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent) { - struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; - struct cgroup *cgrp = parent_css->cgroup; - struct cgroup *next; + struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); @@ -3128,27 +3132,21 @@ css_next_child(struct cgroup_subsys_state *pos_css, * races against release and the race window is very small. */ if (!pos) { - next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!(pos->self.flags & CSS_RELEASED))) { - next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); + next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); + } else if (likely(!(pos->flags & CSS_RELEASED))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->self.serial_nr > pos->self.serial_nr) + list_for_each_entry_rcu(next, &parent->children, sibling) + if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is - * the next sibling; however, it might have @ss disabled. If so, - * fast-forward to the next enabled one. + * the next sibling. */ - while (&next->self.sibling != &cgrp->self.children) { - struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - - if (next_css) - return next_css; - next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); - } + if (&next->sibling != &parent->children) + return next; return NULL; } @@ -3165,6 +3163,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, * doesn't require the whole traversal to be contained in a single critical * section. 
This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3252,6 +3257,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, -- cgit v1.2.3-71-gd317 From 184faf32328c65c9d86b19577b8d8b90bdd2cd2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: use CSS_ONLINE instead of CGRP_DEAD Use CSS_ONLINE on the self css to indicate whether a cgroup has been killed instead of CGRP_DEAD. This will allow re-using css online test for cgroup liveliness test. This doesn't introduce any functional change. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 -- kernel/cgroup.c | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f2ff578fc03a..51a339c99eb6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,8 +143,6 @@ static inline void css_put(struct cgroup_subsys_state *css) /* bits in struct cgroup flags field */ enum { - /* Control Group is dead */ - CGRP_DEAD, /* * Control Group has previously had a child cgroup or a task, * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 097a1fc1e1e8..004004fd0ded 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -278,7 +278,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_DEAD, &cgrp->flags); + return !(cgrp->self.flags & CSS_ONLINE); } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) @@ -1518,6 +1518,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; + cgrp->self.flags |= CSS_ONLINE; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -4541,13 +4542,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). 
*/ - set_bit(CGRP_DEAD, &cgrp->flags); + cgrp->self.flags &= ~CSS_ONLINE; /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); - /* CGRP_DEAD is set, remove from ->release_list for the last time */ + /* CSS_ONLINE is clear, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); -- cgit v1.2.3-71-gd317 From f3d4650015301d1c880df4523f7e7ef320a38aab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:52 -0400 Subject: cgroup: convert cgroup_has_live_children() into css_has_online_children() Now that cgroup liveliness and css onliness are the same state, convert cgroup_has_live_children() into css_has_online_children() so that it can be used for actual csses too. The function now uses css_for_each_child() for iteration and is published. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 32 ++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 51a339c99eb6..b76999954beb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -873,6 +873,8 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) +bool css_has_online_children(struct cgroup_subsys_state *css); + /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 004004fd0ded..082bb842b11a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -175,7 +175,6 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -1769,7 +1768,7 @@ static void cgroup_kill_sb(struct super_block *sb) * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. */ - if (cgroup_has_live_children(&root->cgrp)) + if (css_has_online_children(&root->cgrp.self)) cgroup_put(&root->cgrp); else percpu_ref_kill(&root->cgrp.self.refcnt); @@ -3291,19 +3290,28 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return pos->parent; } -static bool cgroup_has_live_children(struct cgroup *cgrp) +/** + * css_has_online_children - does a css have online children + * @css: the target css + * + * Returns %true if @css has any online children; otherwise, %false. This + * function can be called from any context but the caller is responsible + * for synchronizing against on/offlining as necessary. 
+ */ +bool css_has_online_children(struct cgroup_subsys_state *css) { - struct cgroup *child; + struct cgroup_subsys_state *child; + bool ret = false; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { - if (!cgroup_is_dead(child)) { - rcu_read_unlock(); - return true; + css_for_each_child(child, css) { + if (child->flags & CSS_ONLINE) { + ret = true; + break; } } rcu_read_unlock(); - return false; + return ret; } /** @@ -4535,7 +4543,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ - if (cgroup_has_live_children(cgrp)) + if (css_has_online_children(&cgrp->self)) return -EBUSY; /* @@ -5014,8 +5022,8 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { + if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && + !css_has_online_children(&cgrp->self)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue -- cgit v1.2.3-71-gd317 From 6f4524d355a86769b65d5420a6ef47fb0bba9b72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:52 -0400 Subject: cgroup: implement css_tryget() Implement css_tryget() which tries to grab a cgroup_subsys_state's reference as long as it hasn't already reached zero. Combined with the recent css iterator changes to include offline && !released csses during traversal, this can be used to access csses regardless of their online state. v2: Take the new flag CSS_NO_REF into account. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b76999954beb..4afe544d3547 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -112,6 +112,24 @@ static inline void css_get(struct cgroup_subsys_state *css) percpu_ref_get(&css->refcnt); } +/** + * css_tryget - try to obtain a reference on the specified css + * @css: target css + * + * Obtain a reference on @css unless it already has reached zero and is + * being released. This function doesn't care whether @css is on or + * offline. The caller naturally needs to ensure that @css is accessible + * but doesn't have to be holding a reference on it - IOW, RCU protected + * access is good enough for this function. Returns %true if a reference + * count was successfully obtained; %false otherwise. + */ +static inline bool css_tryget(struct cgroup_subsys_state *css) +{ + if (!(css->flags & CSS_NO_REF)) + return percpu_ref_tryget(&css->refcnt); + return true; +} + /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css -- cgit v1.2.3-71-gd317 From 1836eea209546b870dd83f3f4ef234d6598a560d Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Sat, 10 May 2014 10:32:57 -0400 Subject: lib/crc7: Shift crc7() output left 1 bit This eliminates a 1-bit left shift in every single caller, and makes the inner loop of the CRC computation more efficient. Renamed crc7 to crc7_be (big-endian) since the interface changed. Also purged #include <linux/crc7.h> from files that don't use it at all. Signed-off-by: George Spelvin Reviewed-by: Pavel Machek Acked-by: Ulf Hansson Signed-off-by: John W.
Linville --- drivers/mmc/host/mmc_spi.c | 2 +- drivers/net/wireless/ti/wl1251/acx.c | 1 - drivers/net/wireless/ti/wl1251/cmd.c | 1 - drivers/net/wireless/ti/wl1251/spi.c | 3 +- drivers/net/wireless/ti/wlcore/spi.c | 3 +- include/linux/crc7.h | 8 ++-- lib/crc7.c | 84 ++++++++++++++++++++---------------- 7 files changed, 53 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 0a87e5691341..338e2202eaaa 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -472,7 +472,7 @@ mmc_spi_command_send(struct mmc_spi_host *host, *cp++ = (u8)(arg >> 16); *cp++ = (u8)(arg >> 8); *cp++ = (u8)arg; - *cp++ = (crc7(0, &data->status[1], 5) << 1) | 0x01; + *cp++ = crc7_be(0, &data->status[1], 5) | 0x01; /* Then, read up to 13 bytes (while writing all-ones): * - N(CR) (== 1..8) bytes of all-ones diff --git a/drivers/net/wireless/ti/wl1251/acx.c b/drivers/net/wireless/ti/wl1251/acx.c index 5a4ec56c83d0..5695628757ee 100644 --- a/drivers/net/wireless/ti/wl1251/acx.c +++ b/drivers/net/wireless/ti/wl1251/acx.c @@ -2,7 +2,6 @@ #include #include -#include #include "wl1251.h" #include "reg.h" diff --git a/drivers/net/wireless/ti/wl1251/cmd.c b/drivers/net/wireless/ti/wl1251/cmd.c index bf1fa18b9786..ede31f048ef9 100644 --- a/drivers/net/wireless/ti/wl1251/cmd.c +++ b/drivers/net/wireless/ti/wl1251/cmd.c @@ -2,7 +2,6 @@ #include #include -#include #include #include "wl1251.h" diff --git a/drivers/net/wireless/ti/wl1251/spi.c b/drivers/net/wireless/ti/wl1251/spi.c index b06d36d99362..e94b57cd5a22 100644 --- a/drivers/net/wireless/ti/wl1251/spi.c +++ b/drivers/net/wireless/ti/wl1251/spi.c @@ -122,8 +122,7 @@ static void wl1251_spi_wake(struct wl1251 *wl) crc[3] = cmd[6]; crc[4] = cmd[5]; - cmd[4] |= crc7(0, crc, WSPI_INIT_CMD_CRC_LEN) << 1; - cmd[4] |= WSPI_INIT_CMD_END; + cmd[4] = crc7_be(0, crc, WSPI_INIT_CMD_CRC_LEN) | WSPI_INIT_CMD_END; t.tx_buf = cmd; t.len = WSPI_INIT_CMD_LEN; diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c index 5f3a389dd74c..1d4ddabe6063 100644 --- a/drivers/net/wireless/ti/wlcore/spi.c +++ b/drivers/net/wireless/ti/wlcore/spi.c @@ -151,8 +151,7 @@ static void wl12xx_spi_init(struct device *child) crc[3] = cmd[6]; crc[4] = cmd[5]; - cmd[4] |= crc7(0, crc, WSPI_INIT_CMD_CRC_LEN) << 1; - cmd[4] |= WSPI_INIT_CMD_END; + cmd[4] = crc7_be(0, crc, WSPI_INIT_CMD_CRC_LEN) | WSPI_INIT_CMD_END; t.tx_buf = cmd; t.len = WSPI_INIT_CMD_LEN; diff --git a/include/linux/crc7.h b/include/linux/crc7.h index 1786e772d5c6..d590765106f3 100644 --- a/include/linux/crc7.h +++ b/include/linux/crc7.h @@ -2,13 +2,13 @@ #define _LINUX_CRC7_H #include -extern const u8 crc7_syndrome_table[256]; +extern const u8 crc7_be_syndrome_table[256]; -static inline u8 crc7_byte(u8 crc, u8 data) +static inline u8 crc7_be_byte(u8 crc, u8 data) { - return crc7_syndrome_table[(crc << 1) ^ data]; + return crc7_be_syndrome_table[crc ^ data]; } -extern u8 crc7(u8 crc, const u8 *buffer, size_t len); +extern u8 crc7_be(u8 crc, const u8 *buffer, size_t len); #endif diff --git a/lib/crc7.c b/lib/crc7.c index f1c3a144cec1..bf6255e23919 100644 --- a/lib/crc7.c +++ b/lib/crc7.c @@ -10,42 +10,47 @@ #include -/* Table for CRC-7 (polynomial x^7 + x^3 + 1) */ -const u8 crc7_syndrome_table[256] = { - 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, - 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, - 0x19, 0x10, 0x0b, 0x02, 0x3d, 0x34, 0x2f, 0x26, - 0x51, 0x58, 0x43, 0x4a, 0x75, 0x7c, 0x67, 0x6e, - 0x32, 0x3b, 
0x20, 0x29, 0x16, 0x1f, 0x04, 0x0d, - 0x7a, 0x73, 0x68, 0x61, 0x5e, 0x57, 0x4c, 0x45, - 0x2b, 0x22, 0x39, 0x30, 0x0f, 0x06, 0x1d, 0x14, - 0x63, 0x6a, 0x71, 0x78, 0x47, 0x4e, 0x55, 0x5c, - 0x64, 0x6d, 0x76, 0x7f, 0x40, 0x49, 0x52, 0x5b, - 0x2c, 0x25, 0x3e, 0x37, 0x08, 0x01, 0x1a, 0x13, - 0x7d, 0x74, 0x6f, 0x66, 0x59, 0x50, 0x4b, 0x42, - 0x35, 0x3c, 0x27, 0x2e, 0x11, 0x18, 0x03, 0x0a, - 0x56, 0x5f, 0x44, 0x4d, 0x72, 0x7b, 0x60, 0x69, - 0x1e, 0x17, 0x0c, 0x05, 0x3a, 0x33, 0x28, 0x21, - 0x4f, 0x46, 0x5d, 0x54, 0x6b, 0x62, 0x79, 0x70, - 0x07, 0x0e, 0x15, 0x1c, 0x23, 0x2a, 0x31, 0x38, - 0x41, 0x48, 0x53, 0x5a, 0x65, 0x6c, 0x77, 0x7e, - 0x09, 0x00, 0x1b, 0x12, 0x2d, 0x24, 0x3f, 0x36, - 0x58, 0x51, 0x4a, 0x43, 0x7c, 0x75, 0x6e, 0x67, - 0x10, 0x19, 0x02, 0x0b, 0x34, 0x3d, 0x26, 0x2f, - 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, - 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, - 0x6a, 0x63, 0x78, 0x71, 0x4e, 0x47, 0x5c, 0x55, - 0x22, 0x2b, 0x30, 0x39, 0x06, 0x0f, 0x14, 0x1d, - 0x25, 0x2c, 0x37, 0x3e, 0x01, 0x08, 0x13, 0x1a, - 0x6d, 0x64, 0x7f, 0x76, 0x49, 0x40, 0x5b, 0x52, - 0x3c, 0x35, 0x2e, 0x27, 0x18, 0x11, 0x0a, 0x03, - 0x74, 0x7d, 0x66, 0x6f, 0x50, 0x59, 0x42, 0x4b, - 0x17, 0x1e, 0x05, 0x0c, 0x33, 0x3a, 0x21, 0x28, - 0x5f, 0x56, 0x4d, 0x44, 0x7b, 0x72, 0x69, 0x60, - 0x0e, 0x07, 0x1c, 0x15, 0x2a, 0x23, 0x38, 0x31, - 0x46, 0x4f, 0x54, 0x5d, 0x62, 0x6b, 0x70, 0x79 +/* + * Table for CRC-7 (polynomial x^7 + x^3 + 1). + * This is a big-endian CRC (msbit is highest power of x), + * aligned so the msbit of the byte is the x^6 coefficient + * and the lsbit is not used. + */ +const u8 crc7_be_syndrome_table[256] = { + 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, + 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee, + 0x32, 0x20, 0x16, 0x04, 0x7a, 0x68, 0x5e, 0x4c, + 0xa2, 0xb0, 0x86, 0x94, 0xea, 0xf8, 0xce, 0xdc, + 0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x08, 0x1a, + 0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a, + 0x56, 0x44, 0x72, 0x60, 0x1e, 0x0c, 0x3a, 0x28, + 0xc6, 0xd4, 0xe2, 0xf0, 0x8e, 0x9c, 0xaa, 0xb8, + 0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6, + 0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x02, 0x34, 0x26, + 0xfa, 0xe8, 0xde, 0xcc, 0xb2, 0xa0, 0x96, 0x84, + 0x6a, 0x78, 0x4e, 0x5c, 0x22, 0x30, 0x06, 0x14, + 0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2, + 0x3c, 0x2e, 0x18, 0x0a, 0x74, 0x66, 0x50, 0x42, + 0x9e, 0x8c, 0xba, 0xa8, 0xd6, 0xc4, 0xf2, 0xe0, + 0x0e, 0x1c, 0x2a, 0x38, 0x46, 0x54, 0x62, 0x70, + 0x82, 0x90, 0xa6, 0xb4, 0xca, 0xd8, 0xee, 0xfc, + 0x12, 0x00, 0x36, 0x24, 0x5a, 0x48, 0x7e, 0x6c, + 0xb0, 0xa2, 0x94, 0x86, 0xf8, 0xea, 0xdc, 0xce, + 0x20, 0x32, 0x04, 0x16, 0x68, 0x7a, 0x4c, 0x5e, + 0xe6, 0xf4, 0xc2, 0xd0, 0xae, 0xbc, 0x8a, 0x98, + 0x76, 0x64, 0x52, 0x40, 0x3e, 0x2c, 0x1a, 0x08, + 0xd4, 0xc6, 0xf0, 0xe2, 0x9c, 0x8e, 0xb8, 0xaa, + 0x44, 0x56, 0x60, 0x72, 0x0c, 0x1e, 0x28, 0x3a, + 0x4a, 0x58, 0x6e, 0x7c, 0x02, 0x10, 0x26, 0x34, + 0xda, 0xc8, 0xfe, 0xec, 0x92, 0x80, 0xb6, 0xa4, + 0x78, 0x6a, 0x5c, 0x4e, 0x30, 0x22, 0x14, 0x06, + 0xe8, 0xfa, 0xcc, 0xde, 0xa0, 0xb2, 0x84, 0x96, + 0x2e, 0x3c, 0x0a, 0x18, 0x66, 0x74, 0x42, 0x50, + 0xbe, 0xac, 0x9a, 0x88, 0xf6, 0xe4, 0xd2, 0xc0, + 0x1c, 0x0e, 0x38, 0x2a, 0x54, 0x46, 0x70, 0x62, + 0x8c, 0x9e, 0xa8, 0xba, 0xc4, 0xd6, 0xe0, 0xf2 }; -EXPORT_SYMBOL(crc7_syndrome_table); +EXPORT_SYMBOL(crc7_be_syndrome_table); /** * crc7 - update the CRC7 for the data buffer @@ -55,14 +60,17 @@ EXPORT_SYMBOL(crc7_syndrome_table); * Context: any * * Returns the updated CRC7 value. 
+ * The CRC7 is left-aligned in the byte (the lsbit is always 0), as that + * makes the computation easier, and all callers want it in that form. + * */ -u8 crc7(u8 crc, const u8 *buffer, size_t len) +u8 crc7_be(u8 crc, const u8 *buffer, size_t len) { while (len--) - crc = crc7_byte(crc, *buffer++); + crc = crc7_be_byte(crc, *buffer++); return crc; } -EXPORT_SYMBOL(crc7); +EXPORT_SYMBOL(crc7_be); MODULE_DESCRIPTION("CRC7 calculations"); MODULE_LICENSE("GPL"); -- cgit v1.2.3-71-gd317 From a75951217472c522c324adb0a4de3ba69d656ef5 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 16 May 2014 16:14:04 +0200 Subject: net: phy: extend fixed driver with fixed_phy_register() The existing fixed_phy_add() function has several drawbacks that prevent it from being used as is for OF-based declaration of fixed PHYs: * The address of the PHY on the fake bus needs to be passed, while a dynamic allocation is desired. * Since the phy_device instantiation is postponed until the next mdiobus scan, there is no way to associate the fixed PHY with its OF node, which later prevents of_phy_connect() from finding this fixed PHY from a given OF node. To solve this, this commit introduces fixed_phy_register(), which will allocate an available PHY address, add the PHY using fixed_phy_add() and instantiate the phy_device structure associated with the provided OF node. Signed-off-by: Thomas Petazzoni Acked-by: Florian Fainelli Acked-by: Grant Likely Tested-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/fixed.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy_fixed.h | 11 +++++++++ 2 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/fixed.c b/drivers/net/phy/fixed.c index e41546da105e..d60d875cb445 100644 --- a/drivers/net/phy/fixed.c +++ b/drivers/net/phy/fixed.c @@ -21,6 +21,7 @@ #include #include #include +#include #define MII_REGS_NUM 29 @@ -203,6 +204,66 @@ err_regs: } EXPORT_SYMBOL_GPL(fixed_phy_add); +void fixed_phy_del(int phy_addr) +{ + struct fixed_mdio_bus *fmb = &platform_fmb; + struct fixed_phy *fp, *tmp; + + list_for_each_entry_safe(fp, tmp, &fmb->phys, node) { + if (fp->addr == phy_addr) { + list_del(&fp->node); + kfree(fp); + return; + } + } +} +EXPORT_SYMBOL_GPL(fixed_phy_del); + +static int phy_fixed_addr; +static DEFINE_SPINLOCK(phy_fixed_addr_lock); + +int fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np) +{ + struct fixed_mdio_bus *fmb = &platform_fmb; + struct phy_device *phy; + int phy_addr; + int ret; + + /* Get the next available PHY address, up to PHY_MAX_ADDR */ + spin_lock(&phy_fixed_addr_lock); + if (phy_fixed_addr == PHY_MAX_ADDR) { + spin_unlock(&phy_fixed_addr_lock); + return -ENOSPC; + } + phy_addr = phy_fixed_addr++; + spin_unlock(&phy_fixed_addr_lock); + + ret = fixed_phy_add(PHY_POLL, phy_addr, status); + if (ret < 0) + return ret; + + phy = get_phy_device(fmb->mii_bus, phy_addr, false); + if (!phy || IS_ERR(phy)) { + fixed_phy_del(phy_addr); + return -EINVAL; + } + + of_node_get(np); + phy->dev.of_node = np; + + ret = phy_device_register(phy); + if (ret) { + phy_device_free(phy); + of_node_put(np); + fixed_phy_del(phy_addr); + return ret; + } + + return 0; +} + static int __init fixed_mdio_bus_init(void) { struct fixed_mdio_bus *fmb = &platform_fmb; diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 509d8f5f984e..4f2478b47136 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -9,15 +9,26 @@ struct
fixed_phy_status { int asym_pause; }; +struct device_node; + #ifdef CONFIG_FIXED_PHY extern int fixed_phy_add(unsigned int irq, int phy_id, struct fixed_phy_status *status); +extern int fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np); #else static inline int fixed_phy_add(unsigned int irq, int phy_id, struct fixed_phy_status *status) { return -ENODEV; } +static inline int fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np) +{ + return -ENODEV; +} #endif /* CONFIG_FIXED_PHY */ /* -- cgit v1.2.3-71-gd317 From 3be2a49e5c08d268f8af0dd4fe89a24ea8cdc339 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 16 May 2014 16:14:05 +0200 Subject: of: provide a binding for fixed link PHYs Some Ethernet MACs have a "fixed link", and are not connected to a normal MDIO-managed PHY device. For those situations, a Device Tree binding allows to describe a "fixed link" using a special PHY node. This patch adds: * A documentation for the fixed PHY Device Tree binding. * An of_phy_is_fixed_link() function that an Ethernet driver can call on its PHY phandle to find out whether it's a fixed link PHY or not. It should typically be used to know if of_phy_register_fixed_link() should be called. * An of_phy_register_fixed_link() function that instantiates the fixed PHY into the PHY subsystem, so that when the driver calls of_phy_connect(), the PHY device associated to the OF node will be found. These two additional functions also support the old fixed-link Device Tree binding used on PowerPC platforms, so that ultimately, the network device drivers for those platforms could be converted to use of_phy_is_fixed_link() and of_phy_register_fixed_link() instead of of_phy_connect_fixed_link(), while keeping compatibility with their respective Device Tree bindings. Signed-off-by: Thomas Petazzoni Reviewed-by: Florian Fainelli Tested-by: Florian Fainelli Signed-off-by: David S. Miller --- .../devicetree/bindings/net/fixed-link.txt | 30 ++++++++++ drivers/of/of_mdio.c | 67 ++++++++++++++++++++++ include/linux/of_mdio.h | 15 +++++ 3 files changed, 112 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/fixed-link.txt (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/fixed-link.txt b/Documentation/devicetree/bindings/net/fixed-link.txt new file mode 100644 index 000000000000..e956de1be935 --- /dev/null +++ b/Documentation/devicetree/bindings/net/fixed-link.txt @@ -0,0 +1,30 @@ +Fixed link Device Tree binding +------------------------------ + +Some Ethernet MACs have a "fixed link", and are not connected to a +normal MDIO-managed PHY device. For those situations, a Device Tree +binding allows to describe a "fixed link". + +Such a fixed link situation is described by creating a 'fixed-link' +sub-node of the Ethernet MAC device node, with the following +properties: + +* 'speed' (integer, mandatory), to indicate the link speed. Accepted + values are 10, 100 and 1000 +* 'full-duplex' (boolean, optional), to indicate that full duplex is + used. When absent, half duplex is assumed. +* 'pause' (boolean, optional), to indicate that pause should be + enabled. +* 'asym-pause' (boolean, optional), to indicate that asym_pause should + be enabled. + +Example: + +ethernet@0 { + ... + fixed-link { + speed = <1000>; + full-duplex; + }; + ... 
+}; diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 9a95831bd065..1def0bb5cb37 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -301,3 +302,69 @@ struct phy_device *of_phy_attach(struct net_device *dev, return phy_attach_direct(dev, phy, flags, iface) ? NULL : phy; } EXPORT_SYMBOL(of_phy_attach); + +#if defined(CONFIG_FIXED_PHY) +/* + * of_phy_is_fixed_link() and of_phy_register_fixed_link() must + * support two DT bindings: + * - the old DT binding, where 'fixed-link' was a property with 5 + * cells encoding various information about the fixed PHY + * - the new DT binding, where 'fixed-link' is a sub-node of the + * Ethernet device. + */ +bool of_phy_is_fixed_link(struct device_node *np) +{ + struct device_node *dn; + int len; + + /* New binding */ + dn = of_get_child_by_name(np, "fixed-link"); + if (dn) { + of_node_put(dn); + return true; + } + + /* Old binding */ + if (of_get_property(np, "fixed-link", &len) && + len == (5 * sizeof(__be32))) + return true; + + return false; +} +EXPORT_SYMBOL(of_phy_is_fixed_link); + +int of_phy_register_fixed_link(struct device_node *np) +{ + struct fixed_phy_status status = {}; + struct device_node *fixed_link_node; + const __be32 *fixed_link_prop; + int len; + + /* New binding */ + fixed_link_node = of_get_child_by_name(np, "fixed-link"); + if (fixed_link_node) { + status.link = 1; + status.duplex = of_property_read_bool(fixed_link_node, "full-duplex"); + if (of_property_read_u32(fixed_link_node, "speed", &status.speed)) + return -EINVAL; + status.pause = of_property_read_bool(fixed_link_node, "pause"); + status.asym_pause = of_property_read_bool(fixed_link_node, "asym-pause"); + of_node_put(fixed_link_node); + return fixed_phy_register(PHY_POLL, &status, np); + } + + /* Old binding */ + fixed_link_prop = of_get_property(np, "fixed-link", &len); + if (fixed_link_prop && len == (5 * sizeof(__be32))) { + status.link = 1; + status.duplex = be32_to_cpu(fixed_link_prop[1]); + status.speed = be32_to_cpu(fixed_link_prop[2]); + status.pause = be32_to_cpu(fixed_link_prop[3]); + status.asym_pause = be32_to_cpu(fixed_link_prop[4]); + return fixed_phy_register(PHY_POLL, &status, np); + } + + return -ENODEV; +} +EXPORT_SYMBOL(of_phy_register_fixed_link); +#endif diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 881a7c3571f4..0aa367e316cb 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -72,4 +72,19 @@ static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np) } #endif /* CONFIG_OF */ +#if defined(CONFIG_OF) && defined(CONFIG_FIXED_PHY) +extern int of_phy_register_fixed_link(struct device_node *np); +extern bool of_phy_is_fixed_link(struct device_node *np); +#else +static inline int of_phy_register_fixed_link(struct device_node *np) +{ + return -ENOSYS; +} +static inline bool of_phy_is_fixed_link(struct device_node *np) +{ + return false; +} +#endif + + #endif /* __LINUX_OF_MDIO_H */ -- cgit v1.2.3-71-gd317 From 3e9c156e2c210ab67b12b1b692983a6b97c19d3f Mon Sep 17 00:00:00 2001 From: Phoebe Buckheister Date: Fri, 16 May 2014 17:46:44 +0200 Subject: ieee802154: add netlink interfaces for llsec This patch adds user-visible interfaces for the llsec infrastructure. For the added methods, the only major difference between all add/remove implementations lies in how the specific object is parsed, and for dump requests, how objects are written into netlink messages.
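The common shape of the add/remove handlers can be sketched as follows (illustrative names, not the actual nl-mac.c code; ieee802154_nl_get_dev() and ieee802154_mlme_ops() are the existing helpers used throughout this file):

	static int llsec_change(struct sk_buff *skb, struct genl_info *info,
				int (*fn)(struct net_device *, struct genl_info *))
	{
		struct net_device *dev = ieee802154_nl_get_dev(info);
		int rc;

		if (!dev)
			return -ENODEV;
		if (!ieee802154_mlme_ops(dev)->llsec)
			rc = -EOPNOTSUPP;
		else
			rc = fn(dev, info);	/* per-object parsing + mlme op */
		dev_put(dev);
		return rc;
	}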
To save on boilerplate code, table dumps are routed through a helper function that handles netlink dump state, leaving the actual dumping code to care only about iterating over the table to be dumped and filling netlink messages. For add/remove methods, the boilerplate required to work is not quite as large, but still enough to also move into a local helper. Signed-off-by: Phoebe Buckheister Signed-off-by: David S. Miller --- include/linux/nl802154.h | 31 ++ net/ieee802154/ieee802154.h | 19 ++ net/ieee802154/netlink.c | 20 ++ net/ieee802154/nl-mac.c | 807 ++++++++++++++++++++++++++++++++++++++++++++ net/ieee802154/nl_policy.c | 16 + 5 files changed, 893 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h index c8d7f3965fff..20163b9a0eae 100644 --- a/include/linux/nl802154.h +++ b/include/linux/nl802154.h @@ -80,6 +80,22 @@ enum { IEEE802154_ATTR_FRAME_RETRIES, + IEEE802154_ATTR_LLSEC_ENABLED, + IEEE802154_ATTR_LLSEC_SECLEVEL, + IEEE802154_ATTR_LLSEC_KEY_MODE, + IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT, + IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED, + IEEE802154_ATTR_LLSEC_KEY_ID, + IEEE802154_ATTR_LLSEC_FRAME_COUNTER, + IEEE802154_ATTR_LLSEC_KEY_BYTES, + IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES, + IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS, + IEEE802154_ATTR_LLSEC_FRAME_TYPE, + IEEE802154_ATTR_LLSEC_CMD_FRAME_ID, + IEEE802154_ATTR_LLSEC_SECLEVELS, + IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, + IEEE802154_ATTR_LLSEC_DEV_KEY_MODE, + __IEEE802154_ATTR_MAX, }; @@ -134,6 +150,21 @@ enum { IEEE802154_SET_MACPARAMS, + IEEE802154_LLSEC_GETPARAMS, + IEEE802154_LLSEC_SETPARAMS, + IEEE802154_LLSEC_LIST_KEY, + IEEE802154_LLSEC_ADD_KEY, + IEEE802154_LLSEC_DEL_KEY, + IEEE802154_LLSEC_LIST_DEV, + IEEE802154_LLSEC_ADD_DEV, + IEEE802154_LLSEC_DEL_DEV, + IEEE802154_LLSEC_LIST_DEVKEY, + IEEE802154_LLSEC_ADD_DEVKEY, + IEEE802154_LLSEC_DEL_DEVKEY, + IEEE802154_LLSEC_LIST_SECLEVEL, + IEEE802154_LLSEC_ADD_SECLEVEL, + IEEE802154_LLSEC_DEL_SECLEVEL, + __IEEE802154_CMD_MAX, }; diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h index 6693a5cf01ce..8b83a231299e 100644 --- a/net/ieee802154/ieee802154.h +++ b/net/ieee802154/ieee802154.h @@ -68,4 +68,23 @@ int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info); int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb); int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_keys(struct sk_buff *skb, + struct netlink_callback *cb); +int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_devs(struct sk_buff *skb, + struct netlink_callback *cb); +int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_devkeys(struct sk_buff *skb, + struct netlink_callback *cb); +int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info); +int ieee802154_llsec_dump_seclevels(struct sk_buff *skb, + 
struct netlink_callback *cb); + #endif diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 04b20589d97a..26efcf4fd2ff 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -124,6 +124,26 @@ static const struct genl_ops ieee8021154_ops[] = { IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface, ieee802154_dump_iface), IEEE802154_OP(IEEE802154_SET_MACPARAMS, ieee802154_set_macparams), + IEEE802154_OP(IEEE802154_LLSEC_GETPARAMS, ieee802154_llsec_getparams), + IEEE802154_OP(IEEE802154_LLSEC_SETPARAMS, ieee802154_llsec_setparams), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_KEY, NULL, + ieee802154_llsec_dump_keys), + IEEE802154_OP(IEEE802154_LLSEC_ADD_KEY, ieee802154_llsec_add_key), + IEEE802154_OP(IEEE802154_LLSEC_DEL_KEY, ieee802154_llsec_del_key), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEV, NULL, + ieee802154_llsec_dump_devs), + IEEE802154_OP(IEEE802154_LLSEC_ADD_DEV, ieee802154_llsec_add_dev), + IEEE802154_OP(IEEE802154_LLSEC_DEL_DEV, ieee802154_llsec_del_dev), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEVKEY, NULL, + ieee802154_llsec_dump_devkeys), + IEEE802154_OP(IEEE802154_LLSEC_ADD_DEVKEY, ieee802154_llsec_add_devkey), + IEEE802154_OP(IEEE802154_LLSEC_DEL_DEVKEY, ieee802154_llsec_del_devkey), + IEEE802154_DUMP(IEEE802154_LLSEC_LIST_SECLEVEL, NULL, + ieee802154_llsec_dump_seclevels), + IEEE802154_OP(IEEE802154_LLSEC_ADD_SECLEVEL, + ieee802154_llsec_add_seclevel), + IEEE802154_OP(IEEE802154_LLSEC_DEL_SECLEVEL, + ieee802154_llsec_del_seclevel), }; static const struct genl_multicast_group ieee802154_mcgrps[] = { diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 5d285498c0f6..5617b4c6d6d5 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -715,3 +715,810 @@ out: dev_put(dev); return rc; } + + + +static int +ieee802154_llsec_parse_key_id(struct genl_info *info, + struct ieee802154_llsec_key_id *desc) +{ + memset(desc, 0, sizeof(*desc)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) + return -EINVAL; + + desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]); + + if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { + if (!info->attrs[IEEE802154_ATTR_PAN_ID] || + !(info->attrs[IEEE802154_ATTR_SHORT_ADDR] || + info->attrs[IEEE802154_ATTR_HW_ADDR])) + return -EINVAL; + + desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); + + if (info->attrs[IEEE802154_ATTR_SHORT_ADDR]) { + desc->device_addr.mode = IEEE802154_ADDR_SHORT; + desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); + } else { + desc->device_addr.mode = IEEE802154_ADDR_LONG; + desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + } + } + + if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]) + return -EINVAL; + + if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]) + return -EINVAL; + + if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]) + return -EINVAL; + + if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT) + desc->id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]); + + switch (desc->mode) { + case IEEE802154_SCF_KEY_SHORT_INDEX: + { + u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]); + desc->short_source = cpu_to_le32(source); + break; + } + case IEEE802154_SCF_KEY_HW_INDEX: + desc->extended_source =
nla_get_hwaddr(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]); + break; + } + + return 0; +} + +static int +ieee802154_llsec_fill_key_id(struct sk_buff *msg, + const struct ieee802154_llsec_key_id *desc) +{ + if (nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_MODE, desc->mode)) + return -EMSGSIZE; + + if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { + if (nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, + desc->device_addr.pan_id)) + return -EMSGSIZE; + + if (desc->device_addr.mode == IEEE802154_ADDR_SHORT && + nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, + desc->device_addr.short_addr)) + return -EMSGSIZE; + + if (desc->device_addr.mode == IEEE802154_ADDR_LONG && + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, + desc->device_addr.extended_addr)) + return -EMSGSIZE; + } + + if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_ID, desc->id)) + return -EMSGSIZE; + + if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT, + le32_to_cpu(desc->short_source))) + return -EMSGSIZE; + + if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && + nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED, + desc->extended_source)) + return -EMSGSIZE; + + return 0; +} + +int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct net_device *dev = NULL; + int rc = -ENOBUFS; + struct ieee802154_mlme_ops *ops; + void *hdr; + struct ieee802154_llsec_params params; + + pr_debug("%s\n", __func__); + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + ops = ieee802154_mlme_ops(dev); + if (!ops->llsec) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto out_dev; + + hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0, + IEEE802154_LLSEC_GETPARAMS); + if (!hdr) + goto out_free; + + rc = ops->llsec->get_params(dev, ¶ms); + if (rc < 0) + goto out_free; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_ENABLED, params.enabled) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) || + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, + be32_to_cpu(params.frame_counter)) || + ieee802154_llsec_fill_key_id(msg, ¶ms.out_key)) + goto out_free; + + dev_put(dev); + + return ieee802154_nl_reply(msg, info); +out_free: + nlmsg_free(msg); +out_dev: + dev_put(dev); + return rc; +} + +int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev = NULL; + int rc = -EINVAL; + struct ieee802154_mlme_ops *ops; + struct ieee802154_llsec_params params; + int changed = 0; + + pr_debug("%s\n", __func__); + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (!info->attrs[IEEE802154_ATTR_LLSEC_ENABLED] && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE] && + !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) + goto out; + + ops = ieee802154_mlme_ops(dev); + if (!ops->llsec) { + rc = -EOPNOTSUPP; + goto out; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL] && + nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) > 7) + goto out; + + if (info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]) { + params.enabled = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]); + changed |= IEEE802154_LLSEC_PARAM_ENABLED; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) { + if (ieee802154_llsec_parse_key_id(info, ¶ms.out_key)) + 
goto out; + + changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) { + params.out_level = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]); + changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; + } + + if (info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]) { + u32 fc = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); + + params.frame_counter = cpu_to_be32(fc); + changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; + } + + rc = ops->llsec->set_params(dev, ¶ms, changed); + + dev_put(dev); + + return rc; +out: + dev_put(dev); + return rc; +} + + + +struct llsec_dump_data { + struct sk_buff *skb; + int s_idx, s_idx2; + int portid; + int nlmsg_seq; + struct net_device *dev; + struct ieee802154_mlme_ops *ops; + struct ieee802154_llsec_table *table; +}; + +static int +ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, + int (*step)(struct llsec_dump_data*)) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct llsec_dump_data data; + int idx = 0; + int first_dev = cb->args[0]; + int rc; + + for_each_netdev(net, dev) { + if (idx < first_dev || dev->type != ARPHRD_IEEE802154) + goto skip; + + data.ops = ieee802154_mlme_ops(dev); + if (!data.ops->llsec) + goto skip; + + data.skb = skb; + data.s_idx = cb->args[1]; + data.s_idx2 = cb->args[2]; + data.dev = dev; + data.portid = NETLINK_CB(cb->skb).portid; + data.nlmsg_seq = cb->nlh->nlmsg_seq; + + data.ops->llsec->lock_table(dev); + data.ops->llsec->get_table(data.dev, &data.table); + rc = step(&data); + data.ops->llsec->unlock_table(dev); + + if (rc < 0) + break; + +skip: + idx++; + } + cb->args[0] = idx; + + return skb->len; +} + +static int +ieee802154_nl_llsec_change(struct sk_buff *skb, struct genl_info *info, + int (*fn)(struct net_device*, struct genl_info*)) +{ + struct net_device *dev = NULL; + int rc = -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (!ieee802154_mlme_ops(dev)->llsec) + rc = -EOPNOTSUPP; + else + rc = fn(dev, info); + + dev_put(dev); + return rc; +} + + + +static int +ieee802154_llsec_parse_key(struct genl_info *info, + struct ieee802154_llsec_key *key) +{ + u8 frames; + u32 commands[256 / 32]; + + memset(key, 0, sizeof(*key)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] || + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES]) + return -EINVAL; + + frames = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES]); + if ((frames & BIT(IEEE802154_FC_TYPE_MAC_CMD)) && + !info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) + return -EINVAL; + + if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) { + nla_memcpy(commands, + info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS], + 256 / 8); + + if (commands[0] || commands[1] || commands[2] || commands[3] || + commands[4] || commands[5] || commands[6] || + commands[7] >= BIT(IEEE802154_CMD_GTS_REQ + 1)) + return -EINVAL; + + key->cmd_frame_ids = commands[7]; + } + + key->frame_types = frames; + + nla_memcpy(key->key, info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES], + IEEE802154_LLSEC_KEY_SIZE); + + return 0; +} + +static int llsec_add_key(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_key key; + struct ieee802154_llsec_key_id id; + + if (ieee802154_llsec_parse_key(info, &key) || + ieee802154_llsec_parse_key_id(info, &id)) + return -EINVAL; + + return ops->llsec->add_key(dev, &id, &key); +} + +int 
ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_key); +} + +static int llsec_remove_key(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_key_id id; + + if (ieee802154_llsec_parse_key_id(info, &id)) + return -EINVAL; + + return ops->llsec->del_key(dev, &id); +} + +int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_remove_key); +} + +static int +ieee802154_nl_fill_key(struct sk_buff *msg, u32 portid, u32 seq, + const struct ieee802154_llsec_key_entry *key, + const struct net_device *dev) +{ + void *hdr; + u32 commands[256 / 32]; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_KEY); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + ieee802154_llsec_fill_key_id(msg, &key->id) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES, + key->key->frame_types)) + goto nla_put_failure; + + if (key->key->frame_types & BIT(IEEE802154_FC_TYPE_MAC_CMD)) { + memset(commands, 0, sizeof(commands)); + commands[7] = key->key->cmd_frame_ids; + if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS, + sizeof(commands), commands)) + goto nla_put_failure; + } + + if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_BYTES, + IEEE802154_LLSEC_KEY_SIZE, key->key->key)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_keys(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_key_entry *pos; + int rc = 0, idx = 0; + + list_for_each_entry(pos, &data->table->keys, list) { + if (idx++ < data->s_idx) + continue; + + if (ieee802154_nl_fill_key(data->skb, data->portid, + data->nlmsg_seq, pos, data->dev)) { + rc = -EMSGSIZE; + break; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_keys(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_keys); +} + + + +static int +llsec_parse_dev(struct genl_info *info, + struct ieee802154_llsec_device *dev) +{ + memset(dev, 0, sizeof(*dev)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || + !info->attrs[IEEE802154_ATTR_HW_ADDR] || + !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] || + !info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] || + (!!info->attrs[IEEE802154_ATTR_PAN_ID] != + !!info->attrs[IEEE802154_ATTR_SHORT_ADDR])) + return -EINVAL; + + if (info->attrs[IEEE802154_ATTR_PAN_ID]) { + dev->pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); + dev->short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); + } else { + dev->short_addr = cpu_to_le16(IEEE802154_ADDR_UNDEF); + } + + dev->hwaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + dev->frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); + dev->seclevel_exempt = !!nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); + dev->key_mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE]); + + if (dev->key_mode >= __IEEE802154_LLSEC_DEVKEY_MAX) + return -EINVAL; + + return 0; +} + +static int llsec_add_dev(struct net_device 
*dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_device desc; + + if (llsec_parse_dev(info, &desc)) + return -EINVAL; + + return ops->llsec->add_dev(dev, &desc); +} + +int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_dev); +} + +static int llsec_del_dev(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + __le64 devaddr; + + if (!info->attrs[IEEE802154_ATTR_HW_ADDR]) + return -EINVAL; + + devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + + return ops->llsec->del_dev(dev, devaddr); +} + +int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_del_dev); +} + +static int +ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq, + const struct ieee802154_llsec_device *desc, + const struct net_device *dev) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_DEV); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) || + nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, + desc->short_addr) || + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr) || + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, + desc->frame_counter) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, + desc->seclevel_exempt) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_KEY_MODE, desc->key_mode)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_devs(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_device *pos; + int rc = 0, idx = 0; + + list_for_each_entry(pos, &data->table->devices, list) { + if (idx++ < data->s_idx) + continue; + + if (ieee802154_nl_fill_dev(data->skb, data->portid, + data->nlmsg_seq, pos, data->dev)) { + rc = -EMSGSIZE; + break; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_devs(struct sk_buff *skb, struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devs); +} + + + +static int llsec_add_devkey(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_device_key key; + __le64 devaddr; + + if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || + !info->attrs[IEEE802154_ATTR_HW_ADDR] || + ieee802154_llsec_parse_key_id(info, &key.key_id)) + return -EINVAL; + + devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + key.frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); + + return ops->llsec->add_devkey(dev, devaddr, &key); +} + +int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_devkey); +} + +static int llsec_del_devkey(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct 
ieee802154_llsec_device_key key; + __le64 devaddr; + + if (!info->attrs[IEEE802154_ATTR_HW_ADDR] || + ieee802154_llsec_parse_key_id(info, &key.key_id)) + return -EINVAL; + + devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); + + return ops->llsec->del_devkey(dev, devaddr, &key); +} + +int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_del_devkey); +} + +static int +ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq, + __le64 devaddr, + const struct ieee802154_llsec_device_key *devkey, + const struct net_device *dev) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_DEVKEY); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr) || + nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, + devkey->frame_counter) || + ieee802154_llsec_fill_key_id(msg, &devkey->key_id)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_devkeys(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_device *dpos; + struct ieee802154_llsec_device_key *kpos; + int rc = 0, idx = 0, idx2; + + list_for_each_entry(dpos, &data->table->devices, list) { + if (idx++ < data->s_idx) + continue; + + idx2 = 0; + + list_for_each_entry(kpos, &dpos->keys, list) { + if (idx2++ < data->s_idx2) + continue; + + if (ieee802154_nl_fill_devkey(data->skb, data->portid, + data->nlmsg_seq, + dpos->hwaddr, kpos, + data->dev)) { + return rc = -EMSGSIZE; + } + + data->s_idx2++; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_devkeys(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devkeys); +} + + + +static int +llsec_parse_seclevel(struct genl_info *info, + struct ieee802154_llsec_seclevel *sl) +{ + memset(sl, 0, sizeof(*sl)); + + if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE] || + !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS] || + !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]) + return -EINVAL; + + sl->frame_type = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE]); + if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD) { + if (!info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]) + return -EINVAL; + + sl->cmd_frame_id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]); + } + + sl->sec_levels = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS]); + sl->device_override = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); + + return 0; +} + +static int llsec_add_seclevel(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_seclevel sl; + + if (llsec_parse_seclevel(info, &sl)) + return -EINVAL; + + return ops->llsec->add_seclevel(dev, &sl); +} + +int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info) +{ + if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != + (NLM_F_CREATE | NLM_F_EXCL)) + return -EINVAL; + + return ieee802154_nl_llsec_change(skb, info, llsec_add_seclevel); +} + +static int llsec_del_seclevel(struct net_device *dev, struct genl_info *info) +{ + struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); + struct ieee802154_llsec_seclevel sl; + + if 
(llsec_parse_seclevel(info, &sl)) + return -EINVAL; + + return ops->llsec->del_seclevel(dev, &sl); +} + +int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info) +{ + return ieee802154_nl_llsec_change(skb, info, llsec_del_seclevel); +} + +static int +ieee802154_nl_fill_seclevel(struct sk_buff *msg, u32 portid, u32 seq, + const struct ieee802154_llsec_seclevel *sl, + const struct net_device *dev) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, + IEEE802154_LLSEC_LIST_SECLEVEL); + if (!hdr) + goto out; + + if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || + nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_FRAME_TYPE, sl->frame_type) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVELS, sl->sec_levels) || + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, + sl->device_override)) + goto nla_put_failure; + + if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD && + nla_put_u8(msg, IEEE802154_ATTR_LLSEC_CMD_FRAME_ID, + sl->cmd_frame_id)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +static int llsec_iter_seclevels(struct llsec_dump_data *data) +{ + struct ieee802154_llsec_seclevel *pos; + int rc = 0, idx = 0; + + list_for_each_entry(pos, &data->table->security_levels, list) { + if (idx++ < data->s_idx) + continue; + + if (ieee802154_nl_fill_seclevel(data->skb, data->portid, + data->nlmsg_seq, pos, + data->dev)) { + rc = -EMSGSIZE; + break; + } + + data->s_idx++; + } + + return rc; +} + +int ieee802154_llsec_dump_seclevels(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ieee802154_llsec_dump_table(skb, cb, llsec_iter_seclevels); +} diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index fd7be5e45cef..3a703ab88348 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -62,5 +62,21 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_CSMA_MAX_BE] = { .type = NLA_U8, }, [IEEE802154_ATTR_FRAME_RETRIES] = { .type = NLA_S8, }, + + [IEEE802154_ATTR_LLSEC_ENABLED] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_SECLEVEL] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_KEY_MODE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT] = { .type = NLA_U32, }, + [IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_LLSEC_KEY_ID] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_FRAME_COUNTER] = { .type = NLA_U32 }, + [IEEE802154_ATTR_LLSEC_KEY_BYTES] = { .len = 16, }, + [IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS] = { .len = 258 / 8 }, + [IEEE802154_ATTR_LLSEC_FRAME_TYPE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_CMD_FRAME_ID] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_SECLEVELS] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, }, }; -- cgit v1.2.3-71-gd317 From 6c4e548ff36672eeb78f8288a2920d66fa4a6a66 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 16 May 2014 21:48:22 +0200 Subject: net: cdc_ncm: use ethtool to tune coalescing settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Datagram coalescing is an integral part of the NCM and MBIM protocols, intended to reduce the interrupt load primarily on the device end of the USB link. 
As with all coalescing solutions, there is a trade-off between buffering and interrupts. The current defaults are based on the assumption that device-side buffers should be the limiting factor. However, many modern high-speed LTE modems suffer from buffer-bloat, making this assumption fail. This results in sub-optimal performance due to excessive coalescing. And in cases where such modems are connected to cheap embedded hosts, there are often severe buffer allocation issues, giving very noticeable performance degradation. A start on improving this is to go from build-time hard-coded limits to per-device user-configurable limits. The ethtool coalescing API was selected as user interface because, although the tuned values are buffer sizes, these settings directly control datagram coalescing. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 71 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/usb/cdc_ncm.h | 6 +++- 2 files changed, 74 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 2ec3790a4db8..141dbec912be 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -65,6 +65,67 @@ static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx); static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer); static struct usb_driver cdc_ncm_driver; +static int cdc_ncm_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec) +{ + struct usbnet *dev = netdev_priv(netdev); + struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; + + /* assuming maximum sized dgrams and ignoring NDPs */ + ec->rx_max_coalesced_frames = ctx->rx_max / ctx->max_datagram_size; + ec->tx_max_coalesced_frames = ctx->tx_max / ctx->max_datagram_size; + + /* the timer will fire CDC_NCM_TIMER_PENDING_CNT times in a row */ + ec->tx_coalesce_usecs = (ctx->timer_interval * CDC_NCM_TIMER_PENDING_CNT) / NSEC_PER_USEC; + return 0; +} + +static void cdc_ncm_update_rxtx_max(struct usbnet *dev, u32 new_rx, u32 new_tx); + +static int cdc_ncm_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec) +{ + struct usbnet *dev = netdev_priv(netdev); + struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; + u32 new_rx_max = ctx->rx_max; + u32 new_tx_max = ctx->tx_max; + + /* assuming maximum sized dgrams and a single NDP */ + if (ec->rx_max_coalesced_frames) + new_rx_max = ec->rx_max_coalesced_frames * ctx->max_datagram_size; + if (ec->tx_max_coalesced_frames) + new_tx_max = ec->tx_max_coalesced_frames * ctx->max_datagram_size; + + if (ec->tx_coalesce_usecs && + (ec->tx_coalesce_usecs < CDC_NCM_TIMER_INTERVAL_MIN * CDC_NCM_TIMER_PENDING_CNT || + ec->tx_coalesce_usecs > CDC_NCM_TIMER_INTERVAL_MAX * CDC_NCM_TIMER_PENDING_CNT)) + return -EINVAL; + + spin_lock_bh(&ctx->mtx); + ctx->timer_interval = ec->tx_coalesce_usecs * NSEC_PER_USEC / CDC_NCM_TIMER_PENDING_CNT; + if (!ctx->timer_interval) + ctx->tx_timer_pending = 0; + spin_unlock_bh(&ctx->mtx); + + /* inform device of new values */ + if (new_rx_max != ctx->rx_max || new_tx_max != ctx->tx_max) + cdc_ncm_update_rxtx_max(dev, new_rx_max, new_tx_max); + return 0; +} + +static const struct ethtool_ops cdc_ncm_ethtool_ops = { + .get_settings = usbnet_get_settings, + .set_settings = usbnet_set_settings, + .get_link = usbnet_get_link, + .nway_reset = usbnet_nway_reset, + .get_drvinfo = usbnet_get_drvinfo, + .get_msglevel = usbnet_get_msglevel, + .set_msglevel = usbnet_set_msglevel, + .get_ts_info =
ethtool_op_get_ts_info, + .get_coalesce = cdc_ncm_get_coalesce, + .set_coalesce = cdc_ncm_set_coalesce, +}; + /* handle rx_max and tx_max changes */ static void cdc_ncm_update_rxtx_max(struct usbnet *dev, u32 new_rx, u32 new_tx) { @@ -257,6 +318,9 @@ static int cdc_ncm_init(struct usbnet *dev) (ctx->tx_max_datagrams > CDC_NCM_DPT_DATAGRAMS_MAX)) ctx->tx_max_datagrams = CDC_NCM_DPT_DATAGRAMS_MAX; + /* initial coalescing timer interval */ + ctx->timer_interval = CDC_NCM_TIMER_INTERVAL_USEC * NSEC_PER_USEC; + return 0; } @@ -596,6 +660,9 @@ advance: /* finish setting up the device specific data */ cdc_ncm_setup(dev); + /* override ethtool_ops */ + dev->net->ethtool_ops = &cdc_ncm_ethtool_ops; + return 0; error2: @@ -863,7 +930,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) ctx->tx_curr_skb = skb_out; goto exit_no_skb; - } else if ((n < ctx->tx_max_datagrams) && (ready2send == 0)) { + } else if ((n < ctx->tx_max_datagrams) && (ready2send == 0) && (ctx->timer_interval > 0)) { /* wait for more frames */ /* push variables */ ctx->tx_curr_skb = skb_out; @@ -915,7 +982,7 @@ static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx) /* start timer, if not already started */ if (!(hrtimer_active(&ctx->tx_timer) || atomic_read(&ctx->stop))) hrtimer_start(&ctx->tx_timer, - ktime_set(0, CDC_NCM_TIMER_INTERVAL), + ktime_set(0, ctx->timer_interval), HRTIMER_MODE_REL); } diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 55b6feead93b..5c1066b4dc41 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -72,7 +72,9 @@ /* Restart the timer, if amount of datagrams is less than given value */ #define CDC_NCM_RESTART_TIMER_DATAGRAM_CNT 3 #define CDC_NCM_TIMER_PENDING_CNT 2 -#define CDC_NCM_TIMER_INTERVAL (400UL * NSEC_PER_USEC) +#define CDC_NCM_TIMER_INTERVAL_USEC 400UL +#define CDC_NCM_TIMER_INTERVAL_MIN 5UL +#define CDC_NCM_TIMER_INTERVAL_MAX (15UL * USEC_PER_SEC) /* The following macro defines the minimum header space */ #define CDC_NCM_MIN_HDR_SIZE \ (sizeof(struct usb_cdc_ncm_nth16) + sizeof(struct usb_cdc_ncm_ndp16) + \ (CDC_NCM_DPT_DATAGRAMS_MAX + 1) * sizeof(struct usb_cdc_ncm_dpe16)) @@ -107,6 +109,8 @@ struct cdc_ncm_ctx { spinlock_t mtx; atomic_t stop; + u64 timer_interval; + u32 tx_timer_pending; u32 tx_curr_frame_num; u32 rx_max; -- cgit v1.2.3-71-gd317 From 70559b8970e52aa9962dc823fd4498af06809544 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 16 May 2014 21:48:23 +0200 Subject: net: cdc_ncm: use true max dgram count for header estimates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many newer NCM and MBIM devices will request a maximum tx datagram count which is much smaller than our hard-coded absolute max. We can reduce the overhead without sacrificing any of the simplicity for these devices, by simply using the true negotiated count when calculating the maximum NTH and NDP header sizes. Signed-off-by: Bjørn Mork Signed-off-by: David S.
Miller --- drivers/net/usb/cdc_ncm.c | 9 ++++++--- include/linux/usb/cdc_ncm.h | 10 +--------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 141dbec912be..b9b562b9128a 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -174,7 +174,7 @@ static void cdc_ncm_update_rxtx_max(struct usbnet *dev, u32 new_rx, u32 new_tx) } /* clamp new_tx to sane values */ - min = CDC_NCM_MIN_HDR_SIZE + ctx->max_datagram_size; + min = ctx->max_datagram_size + ctx->max_ndp_size + sizeof(struct usb_cdc_ncm_nth16); max = min_t(u32, CDC_NCM_NTB_MAX_SIZE_TX, le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize)); /* some devices set dwNtbOutMaxSize too low for the above default */ @@ -318,6 +318,9 @@ static int cdc_ncm_init(struct usbnet *dev) (ctx->tx_max_datagrams > CDC_NCM_DPT_DATAGRAMS_MAX)) ctx->tx_max_datagrams = CDC_NCM_DPT_DATAGRAMS_MAX; + /* set up maximum NDP size */ + ctx->max_ndp_size = sizeof(struct usb_cdc_ncm_ndp16) + (ctx->tx_max_datagrams + 1) * sizeof(struct usb_cdc_ncm_dpe16); + /* initial coalescing timer interval */ ctx->timer_interval = CDC_NCM_TIMER_INTERVAL_USEC * NSEC_PER_USEC; @@ -800,7 +803,7 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_ cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_max); /* verify that there is room for the NDP and the datagram (reserve) */ - if ((ctx->tx_max - skb->len - reserve) < CDC_NCM_NDP_SIZE) + if ((ctx->tx_max - skb->len - reserve) < ctx->max_ndp_size) return NULL; /* link to it */ @@ -810,7 +813,7 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_ nth16->wNdpIndex = cpu_to_le16(skb->len); /* push a new empty NDP */ - ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, CDC_NCM_NDP_SIZE), 0, CDC_NCM_NDP_SIZE); + ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, ctx->max_ndp_size), 0, ctx->max_ndp_size); ndp16->dwSignature = sign; ndp16->wLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_ndp16) + sizeof(struct usb_cdc_ncm_dpe16)); return ndp16; diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 5c1066b4dc41..60a44b8a464e 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -76,15 +76,6 @@ #define CDC_NCM_TIMER_INTERVAL_MIN 5UL #define CDC_NCM_TIMER_INTERVAL_MAX (15UL * USEC_PER_SEC) -/* The following macro defines the minimum header space */ -#define CDC_NCM_MIN_HDR_SIZE \ - (sizeof(struct usb_cdc_ncm_nth16) + sizeof(struct usb_cdc_ncm_ndp16) + \ - (CDC_NCM_DPT_DATAGRAMS_MAX + 1) * sizeof(struct usb_cdc_ncm_dpe16)) - -#define CDC_NCM_NDP_SIZE \ - (sizeof(struct usb_cdc_ncm_ndp16) + \ - (CDC_NCM_DPT_DATAGRAMS_MAX + 1) * sizeof(struct usb_cdc_ncm_dpe16)) - #define cdc_ncm_comm_intf_is_mbim(x) ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \ (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE) #define cdc_ncm_data_intf_is_mbim(x) ((x)->desc.bInterfaceProtocol == USB_CDC_MBIM_PROTO_NTB) @@ -110,6 +101,7 @@ struct cdc_ncm_ctx { atomic_t stop; u64 timer_interval; + u32 max_ndp_size; u32 tx_timer_pending; u32 tx_curr_frame_num; -- cgit v1.2.3-71-gd317 From 43e4c6dfc0fd781e68f20caf563a06f5c6ece995 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 16 May 2014 21:48:24 +0200 Subject: net: cdc_ncm: set reasonable padding limits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We pad frames larger than X to maximum size for devices which don't need a ZLP after maximum sized 
frames. This allows the device to optimize its transfers for one fixed buffer size. X was arbitrarily set at 512 bytes regardless of the real buffer maximum, causing extreme overheads due to excessive padding of larger tx buffers. Limit the padding to at most 3 full USB packets, still allowing an overhead-to-payload ratio of 3/1. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 8 ++++++-- include/linux/usb/cdc_ncm.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index b9b562b9128a..9592d4669435 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -213,6 +213,10 @@ static void cdc_ncm_update_rxtx_max(struct usbnet *dev, u32 new_rx, u32 new_tx) /* max qlen depend on hard_mtu and rx_urb_size */ usbnet_update_max_qlen(dev); + + /* never pad more than 3 full USB packets per transfer */ + ctx->min_tx_pkt = clamp_t(u16, ctx->tx_max - 3 * usb_maxpacket(dev->udev, dev->out, 1), + CDC_NCM_MIN_TX_PKT, ctx->tx_max); } /* helpers for NCM and MBIM differences */ @@ -947,7 +951,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) /* variables will be reset at next call */ } - /* If collected data size is less or equal CDC_NCM_MIN_TX_PKT + /* If collected data size is less or equal ctx->min_tx_pkt * bytes, we send buffers as it is. If we get more data, it * would be more efficient for USB HS mobile device with DMA * engine to receive a full size NTB, than canceling DMA @@ -957,7 +961,7 @@ * a ZLP after full sized NTBs. */ if (!(dev->driver_info->flags & FLAG_SEND_ZLP) && - skb_out->len > CDC_NCM_MIN_TX_PKT) + skb_out->len > ctx->min_tx_pkt) memset(skb_put(skb_out, ctx->tx_max - skb_out->len), 0, ctx->tx_max - skb_out->len); else if (skb_out->len < ctx->tx_max && (skb_out->len % dev->maxpacket) == 0) diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 60a44b8a464e..79de6724d398 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -115,6 +115,7 @@ struct cdc_ncm_ctx { u16 tx_seq; u16 rx_seq; u16 connected; + u16 min_tx_pkt; }; u8 cdc_ncm_select_altsetting(struct usb_interface *intf); -- cgit v1.2.3-71-gd317 From beeecd42c3b41d17d0bf1d839db99274c287f514 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 16 May 2014 21:48:25 +0200 Subject: net: cdc_ncm/cdc_mbim: adding NCM protocol statistics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To get an idea of the effects of the protocol coalescing, it's useful to have some counters showing the different aspects. Due to the asymmetrical usbnet interface, the netdev rx_bytes counter has been counting real received payload, while the tx_bytes counter has included the NCM/MBIM framing overhead. This overhead can be many times the payload because of the aggressive padding strategy of this driver, and will vary a lot depending on device and traffic. With very few exceptions, users are only interested in the payload size. Having a somewhat accurate payload byte counter is particularly important for mobile broadband devices, which many NCM devices and of course all MBIM devices are. Users and userspace applications will use this counter to monitor account quotas.
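As a sketch of how such an application could read the counters this patch exports, here is a minimal userspace program using the standard SIOCETHTOOL ioctl; the interface name wwan0 is an assumption, and error handling is omitted for brevity:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <sys/socket.h>
        #include <net/if.h>
        #include <linux/ethtool.h>
        #include <linux/sockios.h>

        int main(void)
        {
                struct ethtool_drvinfo drvinfo = { .cmd = ETHTOOL_GDRVINFO };
                struct ethtool_gstrings *strings;
                struct ethtool_stats *stats;
                struct ifreq ifr;
                unsigned int i, n;
                int fd;

                fd = socket(AF_INET, SOCK_DGRAM, 0); /* any socket works for SIOCETHTOOL */
                memset(&ifr, 0, sizeof(ifr));
                strncpy(ifr.ifr_name, "wwan0", IFNAMSIZ - 1); /* assumed interface name */

                /* ask the driver how many stats it exports */
                ifr.ifr_data = (void *)&drvinfo;
                ioctl(fd, SIOCETHTOOL, &ifr);
                n = drvinfo.n_stats;

                /* fetch the stat names (the strings registered by the driver) */
                strings = calloc(1, sizeof(*strings) + n * ETH_GSTRING_LEN);
                strings->cmd = ETHTOOL_GSTRINGS;
                strings->string_set = ETH_SS_STATS;
                strings->len = n;
                ifr.ifr_data = (void *)strings;
                ioctl(fd, SIOCETHTOOL, &ifr);

                /* fetch the current counter values */
                stats = calloc(1, sizeof(*stats) + n * sizeof(__u64));
                stats->cmd = ETHTOOL_GSTATS;
                stats->n_stats = n;
                ifr.ifr_data = (void *)stats;
                ioctl(fd, SIOCETHTOOL, &ifr);

                for (i = 0; i < n; i++)
                        printf("%.*s: %llu\n", ETH_GSTRING_LEN,
                               (char *)&strings->data[i * ETH_GSTRING_LEN],
                               (unsigned long long)stats->data[i]);
                close(fd);
                return 0;
        }

The same counters are also visible through the ethtool utility's -S option.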
Having protocol-specific counters for the overhead, we are now able to correct the tx_bytes netdev counter so that it shows the real payload. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_mbim.c | 6 +++ drivers/net/usb/cdc_ncm.c | 91 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb/cdc_ncm.h | 11 ++++++ 3 files changed, 108 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index bc23273d0455..5ee7a1dbc023 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -420,6 +420,7 @@ static int cdc_mbim_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in) struct usb_cdc_ncm_dpe16 *dpe16; int ndpoffset; int loopcount = 50; /* arbitrary max preventing infinite loop */ + u32 payload = 0; u8 *c; u16 tci; @@ -482,6 +483,7 @@ next_ndp: if (!skb) goto error; usbnet_skb_return(dev, skb); + payload += len; /* count payload bytes in this NTB */ } } err_ndp: @@ -490,6 +492,10 @@ err_ndp: if (ndpoffset && loopcount--) goto next_ndp; + /* update stats */ + ctx->rx_overhead += skb_in->len - payload; + ctx->rx_ntbs++; + return 1; error: return 0; diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 9592d4669435..f4b439847d04 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -65,6 +65,68 @@ static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx); static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer); static struct usb_driver cdc_ncm_driver; +struct cdc_ncm_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define CDC_NCM_STAT(str, m) { \ + .stat_string = str, \ + .sizeof_stat = sizeof(((struct cdc_ncm_ctx *)0)->m), \ + .stat_offset = offsetof(struct cdc_ncm_ctx, m) } +#define CDC_NCM_SIMPLE_STAT(m) CDC_NCM_STAT(__stringify(m), m) + +static const struct cdc_ncm_stats cdc_ncm_gstrings_stats[] = { + CDC_NCM_SIMPLE_STAT(tx_reason_ntb_full), + CDC_NCM_SIMPLE_STAT(tx_reason_ndp_full), + CDC_NCM_SIMPLE_STAT(tx_reason_timeout), + CDC_NCM_SIMPLE_STAT(tx_reason_max_datagram), + CDC_NCM_SIMPLE_STAT(tx_overhead), + CDC_NCM_SIMPLE_STAT(tx_ntbs), + CDC_NCM_SIMPLE_STAT(rx_overhead), + CDC_NCM_SIMPLE_STAT(rx_ntbs), +}; + +static int cdc_ncm_get_sset_count(struct net_device __always_unused *netdev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return ARRAY_SIZE(cdc_ncm_gstrings_stats); + default: + return -EOPNOTSUPP; + } +} + +static void cdc_ncm_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats __always_unused *stats, + u64 *data) +{ + struct usbnet *dev = netdev_priv(netdev); + struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; + int i; + char *p = NULL; + + for (i = 0; i < ARRAY_SIZE(cdc_ncm_gstrings_stats); i++) { + p = (char *)ctx + cdc_ncm_gstrings_stats[i].stat_offset; + data[i] = (cdc_ncm_gstrings_stats[i].sizeof_stat == sizeof(u64)) ?
*(u64 *)p : *(u32 *)p; + } +} + +static void cdc_ncm_get_strings(struct net_device __always_unused *netdev, u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < ARRAY_SIZE(cdc_ncm_gstrings_stats); i++) { + memcpy(p, cdc_ncm_gstrings_stats[i].stat_string, ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + } +} + static int cdc_ncm_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) { @@ -122,6 +184,9 @@ static const struct ethtool_ops cdc_ncm_ethtool_ops = { .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, + .get_sset_count = cdc_ncm_get_sset_count, + .get_strings = cdc_ncm_get_strings, + .get_ethtool_stats = cdc_ncm_get_ethtool_stats, .get_coalesce = cdc_ncm_get_coalesce, .set_coalesce = cdc_ncm_set_coalesce, }; @@ -862,6 +927,9 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) /* count total number of frames in this NTB */ ctx->tx_curr_frame_num = 0; + + /* recent payload counter for this skb_out */ + ctx->tx_curr_frame_payload = 0; } for (n = ctx->tx_curr_frame_num; n < ctx->tx_max_datagrams; n++) { @@ -899,6 +967,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) ctx->tx_rem_sign = sign; skb = NULL; ready2send = 1; + ctx->tx_reason_ntb_full++; /* count reason for transmitting */ } break; } @@ -912,12 +981,14 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) ndp16->dpe16[index].wDatagramIndex = cpu_to_le16(skb_out->len); ndp16->wLength = cpu_to_le16(ndplen + sizeof(struct usb_cdc_ncm_dpe16)); memcpy(skb_put(skb_out, skb->len), skb->data, skb->len); + ctx->tx_curr_frame_payload += skb->len; /* count real tx payload data */ dev_kfree_skb_any(skb); skb = NULL; /* send now if this NDP is full */ if (index >= CDC_NCM_DPT_DATAGRAMS_MAX) { ready2send = 1; + ctx->tx_reason_ndp_full++; /* count reason for transmitting */ break; } } @@ -947,6 +1018,8 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) goto exit_no_skb; } else { + if (n == ctx->tx_max_datagrams) + ctx->tx_reason_max_datagram++; /* count reason for transmitting */ /* frame goes out */ /* variables will be reset at next call */ } @@ -974,6 +1047,17 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) /* return skb */ ctx->tx_curr_skb = NULL; dev->net->stats.tx_packets += ctx->tx_curr_frame_num; + + /* keep private stats: framing overhead and number of NTBs */ + ctx->tx_overhead += skb_out->len - ctx->tx_curr_frame_payload; + ctx->tx_ntbs++; + + /* usbnet has already counted all the framing overhead. + * Adjust the stats so that the tx_bytes counter show real + * payload data instead. 
+ */ + dev->net->stats.tx_bytes -= skb_out->len - ctx->tx_curr_frame_payload; + return skb_out; exit_no_skb: @@ -1014,6 +1098,7 @@ static void cdc_ncm_txpath_bh(unsigned long param) cdc_ncm_tx_timeout_start(ctx); spin_unlock_bh(&ctx->mtx); } else if (dev->net != NULL) { + ctx->tx_reason_timeout++; /* count reason for transmitting */ spin_unlock_bh(&ctx->mtx); netif_tx_lock_bh(dev->net); usbnet_start_xmit(NULL, dev->net); @@ -1149,6 +1234,7 @@ int cdc_ncm_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in) struct usb_cdc_ncm_dpe16 *dpe16; int ndpoffset; int loopcount = 50; /* arbitrary max preventing infinite loop */ + u32 payload = 0; ndpoffset = cdc_ncm_rx_verify_nth16(ctx, skb_in); if (ndpoffset < 0) @@ -1201,6 +1287,7 @@ next_ndp: skb->data = ((u8 *)skb_in->data) + offset; skb_set_tail_pointer(skb, len); usbnet_skb_return(dev, skb); + payload += len; /* count payload bytes in this NTB */ } } err_ndp: @@ -1209,6 +1296,10 @@ err_ndp: if (ndpoffset && loopcount--) goto next_ndp; + /* update stats */ + ctx->rx_overhead += skb_in->len - payload; + ctx->rx_ntbs++; + return 1; error: return 0; diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 79de6724d398..88d2d7f1820f 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -116,6 +116,17 @@ struct cdc_ncm_ctx { u16 rx_seq; u16 connected; u16 min_tx_pkt; + + /* statistics */ + u32 tx_curr_frame_payload; + u32 tx_reason_ntb_full; + u32 tx_reason_ndp_full; + u32 tx_reason_timeout; + u32 tx_reason_max_datagram; + u64 tx_overhead; + u64 tx_ntbs; + u64 rx_overhead; + u64 rx_ntbs; }; u8 cdc_ncm_select_altsetting(struct usb_interface *intf); -- cgit v1.2.3-71-gd317 From 50f1cb1cc8f50fa88dbeaf990ae2bae91b9ff306 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 16 May 2014 21:48:26 +0200 Subject: net: cdc_ncm: use sane defaults for rx/tx buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lots of devices request much larger buffers than reasonable. This causes real problems for users of hosts with limited resources. Reducing the default buffer size to 16kB for such devices is a reasonable trade-off between allowing them to aggregate traffic and avoiding memory exhaustion on resource-constrained hosts. Signed-off-by: Bjørn Mork Signed-off-by: David S.
Miller --- drivers/net/usb/cdc_ncm.c | 12 ++++++++++-- include/linux/usb/cdc_ncm.h | 4 ++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index f4b439847d04..bb53abe1f3a1 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -492,10 +492,18 @@ static void cdc_ncm_fix_modulus(struct usbnet *dev) static int cdc_ncm_setup(struct usbnet *dev) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; + u32 def_rx, def_tx; + + /* be conservative when selecting intial buffer size to + * increase the number of hosts this will work for + */ + def_rx = min_t(u32, CDC_NCM_NTB_DEF_SIZE_RX, + le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize)); + def_tx = min_t(u32, CDC_NCM_NTB_DEF_SIZE_TX, + le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize)); /* clamp rx_max and tx_max and inform device */ - cdc_ncm_update_rxtx_max(dev, le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize), - le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize)); + cdc_ncm_update_rxtx_max(dev, def_rx, def_tx); /* sanitize the modulus and remainder values */ cdc_ncm_fix_modulus(dev); diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 88d2d7f1820f..cde506731c48 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -52,6 +52,10 @@ #define CDC_NCM_NTB_MAX_SIZE_TX 32768 /* bytes */ #define CDC_NCM_NTB_MAX_SIZE_RX 32768 /* bytes */ +/* Initial NTB length */ +#define CDC_NCM_NTB_DEF_SIZE_TX 16384 /* bytes */ +#define CDC_NCM_NTB_DEF_SIZE_RX 16384 /* bytes */ + /* Minimum value for MaxDatagramSize, ch. 6.2.9 */ #define CDC_NCM_MIN_DATAGRAM_SIZE 1514 /* bytes */ -- cgit v1.2.3-71-gd317 From fa83dbeee55865678025b6c1637ca08860209f87 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 16 May 2014 21:48:28 +0200 Subject: net: cdc_ncm: remove redundant "disconnected" flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling netif_carrier_{on,off} is sufficient. There is no need to duplicate the carrier state in a driver specific flag. Acked-by: Enrico Mioso Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 19 ++----------------- drivers/net/usb/huawei_cdc_ncm.c | 13 ------------- include/linux/usb/cdc_ncm.h | 1 - 3 files changed, 2 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 1d1ff2fa8ae1..783c4ed96395 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1364,11 +1364,10 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) * USB_CDC_NOTIFY_NETWORK_CONNECTION notification shall be * sent by device after USB_CDC_NOTIFY_SPEED_CHANGE. */ - ctx->connected = le16_to_cpu(event->wValue); netif_info(dev, link, dev->net, "network connection: %sconnected\n", - ctx->connected ? "" : "dis"); - usbnet_link_change(dev, ctx->connected, 0); + !!event->wValue ? 
"" : "dis"); + usbnet_link_change(dev, !!event->wValue, 0); break; case USB_CDC_NOTIFY_SPEED_CHANGE: @@ -1388,23 +1387,11 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) } } -static int cdc_ncm_check_connect(struct usbnet *dev) -{ - struct cdc_ncm_ctx *ctx; - - ctx = (struct cdc_ncm_ctx *)dev->data[0]; - if (ctx == NULL) - return 1; /* disconnected */ - - return !ctx->connected; -} - static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, - .check_connect = cdc_ncm_check_connect, .manage_power = usbnet_manage_power, .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, @@ -1418,7 +1405,6 @@ static const struct driver_info wwan_info = { | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, - .check_connect = cdc_ncm_check_connect, .manage_power = usbnet_manage_power, .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, @@ -1432,7 +1418,6 @@ static const struct driver_info wwan_noarp_info = { | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, - .check_connect = cdc_ncm_check_connect, .manage_power = usbnet_manage_power, .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c index 312178d7b698..f9822bc75425 100644 --- a/drivers/net/usb/huawei_cdc_ncm.c +++ b/drivers/net/usb/huawei_cdc_ncm.c @@ -172,24 +172,11 @@ err: return ret; } -static int huawei_cdc_ncm_check_connect(struct usbnet *usbnet_dev) -{ - struct cdc_ncm_ctx *ctx; - - ctx = (struct cdc_ncm_ctx *)usbnet_dev->data[0]; - - if (ctx == NULL) - return 1; /* disconnected */ - - return !ctx->connected; -} - static const struct driver_info huawei_cdc_ncm_info = { .description = "Huawei CDC NCM device", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN, .bind = huawei_cdc_ncm_bind, .unbind = huawei_cdc_ncm_unbind, - .check_connect = huawei_cdc_ncm_check_connect, .manage_power = huawei_cdc_ncm_manage_power, .rx_fixup = cdc_ncm_rx_fixup, .tx_fixup = cdc_ncm_tx_fixup, diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index cde506731c48..8c5e38819828 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -118,7 +118,6 @@ struct cdc_ncm_ctx { u16 tx_ndp_modulus; u16 tx_seq; u16 rx_seq; - u16 connected; u16 min_tx_pkt; /* statistics */ -- cgit v1.2.3-71-gd317 From a17597d3b418ca5a394d14724ccfc295cb3186c8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 16 May 2014 11:42:43 +0930 Subject: virtio-rng: fixes for device registration/unregistration There are several fixes in this patch (mostly because it's hard splitting them up): - Revert the name field in struct hwrng back to 'const'. Also, don't do an extra kmalloc for the name - just wasteful. - Deal with allocation failures properly. - Use IDA to allocate device number instead of brute forcing one. 
Signed-off-by: Sasha Levin Signed-off-by: Rusty Russell --- drivers/char/hw_random/virtio-rng.c | 41 +++++++++++++++++++++---------------- include/linux/hw_random.h | 2 +- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 5b25daa7f798..f3e71501de54 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -25,6 +25,7 @@ #include #include +static DEFINE_IDA(rng_index_ida); struct virtrng_info { struct virtio_device *vdev; @@ -33,6 +34,8 @@ struct virtrng_info { unsigned int data_avail; struct completion have_data; bool busy; + char name[25]; + int index; }; static void random_recv_done(struct virtqueue *vq) @@ -92,41 +95,45 @@ static void virtio_cleanup(struct hwrng *rng) static int probe_common(struct virtio_device *vdev) { - int err, i; + int err, index; struct virtrng_info *vi = NULL; vi = kzalloc(sizeof(struct virtrng_info), GFP_KERNEL); - vi->hwrng.name = kmalloc(40, GFP_KERNEL); + if (!vi) + return -ENOMEM; + + vi->index = index = ida_simple_get(&rng_index_ida, 0, 0, GFP_KERNEL); + if (index < 0) { + kfree(vi); + return index; + } + sprintf(vi->name, "virtio_rng.%d", index); init_completion(&vi->have_data); - vi->hwrng.read = virtio_read; - vi->hwrng.cleanup = virtio_cleanup; - vi->hwrng.priv = (unsigned long)vi; + vi->hwrng = (struct hwrng) { + .read = virtio_read, + .cleanup = virtio_cleanup, + .priv = (unsigned long)vi, + .name = vi->name, + }; vdev->priv = vi; /* We expect a single virtqueue. */ vi->vq = virtio_find_single_vq(vdev, random_recv_done, "input"); if (IS_ERR(vi->vq)) { err = PTR_ERR(vi->vq); - kfree(vi->hwrng.name); vi->vq = NULL; kfree(vi); - vi = NULL; + ida_simple_remove(&rng_index_ida, index); return err; } - i = 0; - do { - sprintf(vi->hwrng.name, "virtio_rng.%d", i++); - err = hwrng_register(&vi->hwrng); - } while (err == -EEXIST); - + err = hwrng_register(&vi->hwrng); if (err) { vdev->config->del_vqs(vdev); - kfree(vi->hwrng.name); vi->vq = NULL; kfree(vi); - vi = NULL; + ida_simple_remove(&rng_index_ida, index); return err; } @@ -140,10 +147,8 @@ static void remove_common(struct virtio_device *vdev) vi->busy = false; hwrng_unregister(&vi->hwrng); vdev->config->del_vqs(vdev); - kfree(vi->hwrng.name); - vi->vq = NULL; + ida_simple_remove(&rng_index_ida, vi->index); kfree(vi); - vi = NULL; } static int virtrng_probe(struct virtio_device *vdev) diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 02d9c87be54c..b4b0eef5fddf 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -31,7 +31,7 @@ * @priv: Private data, for use by the RNG driver. */ struct hwrng { - char *name; + const char *name; int (*init)(struct hwrng *rng); void (*cleanup)(struct hwrng *rng); int (*data_present)(struct hwrng *rng, int wait); -- cgit v1.2.3-71-gd317 From 678e30df2e5664619e06fcfea5490a476826d8fe Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 19 May 2014 01:25:59 +0100 Subject: ethtool: Expand documentation of ethtool_ops::{get,set}_rxfh() Some corner-cases are not explained properly. 
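To illustrate the corner-cases being documented, a driver implementing the pair might look like the following sketch; struct example_priv and the table/key sizes are invented, and the const-qualified set_rxfh signature anticipates a later patch in this series:

        #include <linux/ethtool.h>
        #include <linux/netdevice.h>
        #include <linux/string.h>

        struct example_priv {           /* hypothetical driver state */
                u32 indir[128];
                u8 key[40];
        };

        static int example_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key)
        {
                struct example_priv *priv = netdev_priv(netdev);

                /* either argument may be NULL when only the other is wanted */
                if (indir)
                        memcpy(indir, priv->indir, sizeof(priv->indir));
                if (key)
                        memcpy(key, priv->key, sizeof(priv->key));
                return 0;
        }

        static int example_set_rxfh(struct net_device *netdev, const u32 *indir,
                                    const u8 *key)
        {
                struct example_priv *priv = netdev_priv(netdev);

                /* a NULL argument means "leave that attribute unchanged" */
                if (indir)
                        memcpy(priv->indir, indir, sizeof(priv->indir));
                if (key)
                        memcpy(priv->key, key, sizeof(priv->key));
                /* a real driver would reprogram the hardware here */
                return 0;
        }

The key point the expanded documentation makes is that each callback must tolerate a NULL indir or key pointer rather than assuming both are always supplied.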
Signed-off-by: Ben Hutchings --- include/linux/ethtool.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 212f537fc686..886e127d51a6 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -162,15 +162,16 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * Will not be called if @get_rxfh_indir_size returns zero. * @get_rxfh: Get the contents of the RX flow hash indirection table and hash * key. - * Will not be called if @get_rxfh_indir_size and @get_rxfh_key_size - * returns zero. + * Will only be called if one or both of @get_rxfh_indir_size and + * @get_rxfh_key_size are implemented and return non-zero. * Returns a negative error code or zero. * @set_rxfh_indir: Set the contents of the RX flow hash indirection table. * Will not be called if @get_rxfh_indir_size returns zero. - * @set_rxfh: Set the contents of the RX flow hash indirection table and - * hash key. - * Will not be called if @get_rxfh_indir_size and @get_rxfh_key_size - * returns zero. + * @set_rxfh: Set the contents of the RX flow hash indirection table and/or + * hash key. Either or both arguments may be %NULL if that attribute + * is not to be changed. + * Will only be called if one or both of @get_rxfh_indir_size and + * @get_rxfh_key_size are implemented and return non-zero. * Returns a negative error code or zero. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or @@ -244,8 +245,8 @@ struct ethtool_ops { int (*reset)(struct net_device *, u32 *); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); - int (*get_rxfh)(struct net_device *, u32 *, u8 *); - int (*set_rxfh)(struct net_device *, u32 *, u8 *); + int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key); + int (*set_rxfh)(struct net_device *, u32 *indir, u8 *key); int (*get_rxfh_indir)(struct net_device *, u32 *); int (*set_rxfh_indir)(struct net_device *, const u32 *); void (*get_channels)(struct net_device *, struct ethtool_channels *); -- cgit v1.2.3-71-gd317 From 61d88c6811f216de4ec26aafe24e650dc1aeb00e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 19 May 2014 01:29:42 +0100 Subject: ethtool: Disallow ETHTOOL_SRSSH with both indir table and hash key unchanged This would be a no-op, so there is no reason to request it. This also allows conversion of the current implementations of ethtool_ops::{get,set}_rxfh_indir to ethtool_ops::{get,set}_rxfh with no change other than their parameters. Signed-off-by: Ben Hutchings --- include/linux/ethtool.h | 4 ++-- net/core/ethtool.c | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 886e127d51a6..de687a97c6e7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -168,8 +168,8 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * @set_rxfh_indir: Set the contents of the RX flow hash indirection table. * Will not be called if @get_rxfh_indir_size returns zero. * @set_rxfh: Set the contents of the RX flow hash indirection table and/or - * hash key. Either or both arguments may be %NULL if that attribute - * is not to be changed. + * hash key. In case only the indirection table or hash key is to be + * changed, the other argument will be %NULL. 
* Will only be called if one or both of @get_rxfh_indir_size and * @get_rxfh_key_size are implemented and return non-zero. * Returns a negative error code or zero. diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 7156fe5ca876..b8857348bdf3 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -802,11 +802,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EFAULT; /* If either indir or hash key is valid, proceed further. + * It is not valid to request that both be unchanged. */ if ((user_indir_size && user_indir_size != ETH_RXFH_INDIR_NO_CHANGE && user_indir_size != dev_indir_size) || - (user_key_size && (user_key_size != dev_key_size))) + (user_key_size && (user_key_size != dev_key_size)) || + (user_indir_size == ETH_RXFH_INDIR_NO_CHANGE && + user_key_size == 0)) return -EINVAL; if (user_indir_size != ETH_RXFH_INDIR_NO_CHANGE) -- cgit v1.2.3-71-gd317 From 33cb0fa7888510b5bd2096352b200cfe29db10fe Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 15 May 2014 02:01:23 +0100 Subject: ethtool, be2net: constify array pointer parameters to ethtool_ops::set_rxfh Signed-off-by: Ben Hutchings --- drivers/net/ethernet/emulex/benet/be_cmds.c | 2 +- drivers/net/ethernet/emulex/benet/be_cmds.h | 2 +- drivers/net/ethernet/emulex/benet/be_ethtool.c | 3 ++- include/linux/ethtool.h | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 476752d0a6a4..7b59da241ccb 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -2033,7 +2033,7 @@ int be_cmd_reset_function(struct be_adapter *adapter) } int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, - u32 rss_hash_opts, u16 table_size, u8 *rss_hkey) + u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey) { struct be_mcc_wrb *wrb; struct be_cmd_req_rss_config *req; diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index 228d4b611084..451f3138b8fb 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -2068,7 +2068,7 @@ int be_cmd_query_fw_cfg(struct be_adapter *adapter, u32 *port_num, u32 *function_mode, u32 *function_caps, u16 *asic_rev); int be_cmd_reset_function(struct be_adapter *adapter); int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, - u32 rss_hash_opts, u16 table_size, u8 *rss_hkey); + u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey); int be_process_mcc(struct be_adapter *adapter); int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, u8 beacon, u8 status, u8 state); diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index 970ae337daac..e2da4d20dd3d 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -1117,7 +1117,8 @@ static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey) return 0; } -static int be_set_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey) +static int be_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *hkey) { int rc = 0, i, j; struct be_adapter *adapter = netdev_priv(netdev); diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index de687a97c6e7..874fde01d398 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -246,7 +246,8 @@ struct ethtool_ops { u32 
(*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key); - int (*set_rxfh)(struct net_device *, u32 *indir, u8 *key); + int (*set_rxfh)(struct net_device *, const u32 *indir, + const u8 *key); int (*get_rxfh_indir)(struct net_device *, u32 *); int (*set_rxfh_indir)(struct net_device *, const u32 *); void (*get_channels)(struct net_device *, struct ethtool_channels *); -- cgit v1.2.3-71-gd317 From d3091298570006fa538ec9beacbfb1098964962e Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 16 May 2014 23:26:05 +0200 Subject: sparc: fix sparse warnings in smp_32.c + smp_64.c Fix the following warnings: smp_32.c:177:5: warning: symbol 'setup_profiling_timer' was not declared. Should it be static? smp_64.c:1202:5: warning: symbol 'setup_profiling_timer' was not declared. Should it be static? smp_64.c:989:6: warning: symbol 'kgdb_roundup_cpus' was not declared. Should it be static? Add a prototype for setup_profiling_timer to include/linux/profile.h. Add the missing include to smp_64.c. Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/kernel/smp_32.c | 1 + arch/sparc/kernel/smp_64.c | 1 + include/linux/profile.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 9d3297d8d730..7958242d63c5 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index df91e78dbd95..afc71bf719b1 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/profile.h b/include/linux/profile.h index aaad3861beb8..b537a25ffa17 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -44,6 +44,7 @@ extern int prof_on __read_mostly; int profile_init(void); int profile_setup(char *str); void profile_tick(int type); +int setup_profiling_timer(unsigned int multiplier); /* * Add multiple profiler hits to a given address: -- cgit v1.2.3-71-gd317 From 3b36fbb01dc50f58e7803006f5a99683daf26c8c Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Sun, 18 May 2014 22:44:35 -0700 Subject: Input: pixcir_i2c_ts - initialize interrupt mode and power mode Introduce helper functions to configure the power and interrupt registers. Default to IDLE mode on probe, as the device supports auto wakeup to ACTIVE mode on detecting a finger touch. Configure the interrupt mode and polarity at start-up. Power down on device closure or module removal.
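For illustration, a minimal sketch of the SMBus read-modify-write pattern these helpers use, assuming only a bound struct i2c_client and the register/bit names this patch adds to pixcir_ts.h (see the header hunk below); this is not the patch's own helper, just the underlying idiom:

#include <linux/i2c.h>
#include <linux/input/pixcir_ts.h>

/* Sketch: select IDLE mode while preserving the remaining register bits */
static int pixcir_enter_idle_sketch(struct i2c_client *client)
{
	int ret = i2c_smbus_read_byte_data(client, PIXCIR_REG_POWER_MODE);

	if (ret < 0)
		return ret;

	ret &= ~PIXCIR_POWER_MODE_MASK;
	ret |= PIXCIR_POWER_IDLE | PIXCIR_POWER_ALLOW_IDLE;

	return i2c_smbus_write_byte_data(client, PIXCIR_REG_POWER_MODE, ret);
}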
Signed-off-by: Roger Quadros Acked-by: Mugunthan V N Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/pixcir_i2c_ts.c | 182 ++++++++++++++++++++++++++++-- include/linux/input/pixcir_ts.h | 42 +++++++ 2 files changed, 216 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/pixcir_i2c_ts.c b/drivers/input/touchscreen/pixcir_i2c_ts.c index 5d36243bdc79..6c6f6dacb858 100644 --- a/drivers/input/touchscreen/pixcir_i2c_ts.c +++ b/drivers/input/touchscreen/pixcir_i2c_ts.c @@ -29,7 +29,7 @@ struct pixcir_i2c_ts_data { struct i2c_client *client; struct input_dev *input; const struct pixcir_ts_platform_data *chip; - bool exiting; + bool running; }; static void pixcir_ts_poscheck(struct pixcir_i2c_ts_data *data) @@ -88,7 +88,7 @@ static irqreturn_t pixcir_ts_isr(int irq, void *dev_id) { struct pixcir_i2c_ts_data *tsdata = dev_id; - while (!tsdata->exiting) { + while (tsdata->running) { pixcir_ts_poscheck(tsdata); if (tsdata->chip->attb_read_val()) @@ -100,6 +100,164 @@ static irqreturn_t pixcir_ts_isr(int irq, void *dev_id) return IRQ_HANDLED; } +static int pixcir_set_power_mode(struct pixcir_i2c_ts_data *ts, + enum pixcir_power_mode mode) +{ + struct device *dev = &ts->client->dev; + int ret; + + ret = i2c_smbus_read_byte_data(ts->client, PIXCIR_REG_POWER_MODE); + if (ret < 0) { + dev_err(dev, "%s: can't read reg 0x%x : %d\n", + __func__, PIXCIR_REG_POWER_MODE, ret); + return ret; + } + + ret &= ~PIXCIR_POWER_MODE_MASK; + ret |= mode; + + /* Always AUTO_IDLE */ + ret |= PIXCIR_POWER_ALLOW_IDLE; + + ret = i2c_smbus_write_byte_data(ts->client, PIXCIR_REG_POWER_MODE, ret); + if (ret < 0) { + dev_err(dev, "%s: can't write reg 0x%x : %d\n", + __func__, PIXCIR_REG_POWER_MODE, ret); + return ret; + } + + return 0; +} + +/* + * Set the interrupt mode for the device i.e. ATTB line behaviour + * + * @polarity : 1 for active high, 0 for active low. 
+ */ +static int pixcir_set_int_mode(struct pixcir_i2c_ts_data *ts, + enum pixcir_int_mode mode, bool polarity) +{ + struct device *dev = &ts->client->dev; + int ret; + + ret = i2c_smbus_read_byte_data(ts->client, PIXCIR_REG_INT_MODE); + if (ret < 0) { + dev_err(dev, "%s: can't read reg 0x%x : %d\n", + __func__, PIXCIR_REG_INT_MODE, ret); + return ret; + } + + ret &= ~PIXCIR_INT_MODE_MASK; + ret |= mode; + + if (polarity) + ret |= PIXCIR_INT_POL_HIGH; + else + ret &= ~PIXCIR_INT_POL_HIGH; + + ret = i2c_smbus_write_byte_data(ts->client, PIXCIR_REG_INT_MODE, ret); + if (ret < 0) { + dev_err(dev, "%s: can't write reg 0x%x : %d\n", + __func__, PIXCIR_REG_INT_MODE, ret); + return ret; + } + + return 0; +} + +/* + * Enable/disable interrupt generation + */ +static int pixcir_int_enable(struct pixcir_i2c_ts_data *ts, bool enable) +{ + struct device *dev = &ts->client->dev; + int ret; + + ret = i2c_smbus_read_byte_data(ts->client, PIXCIR_REG_INT_MODE); + if (ret < 0) { + dev_err(dev, "%s: can't read reg 0x%x : %d\n", + __func__, PIXCIR_REG_INT_MODE, ret); + return ret; + } + + if (enable) + ret |= PIXCIR_INT_ENABLE; + else + ret &= ~PIXCIR_INT_ENABLE; + + ret = i2c_smbus_write_byte_data(ts->client, PIXCIR_REG_INT_MODE, ret); + if (ret < 0) { + dev_err(dev, "%s: can't write reg 0x%x : %d\n", + __func__, PIXCIR_REG_INT_MODE, ret); + return ret; + } + + return 0; +} + +static int pixcir_start(struct pixcir_i2c_ts_data *ts) +{ + struct device *dev = &ts->client->dev; + int error; + + /* LEVEL_TOUCH interrupt with active low polarity */ + error = pixcir_set_int_mode(ts, PIXCIR_INT_LEVEL_TOUCH, 0); + if (error) { + dev_err(dev, "Failed to set interrupt mode: %d\n", error); + return error; + } + + ts->running = true; + mb(); /* Update status before IRQ can fire */ + + /* enable interrupt generation */ + error = pixcir_int_enable(ts, true); + if (error) { + dev_err(dev, "Failed to enable interrupt generation: %d\n", + error); + return error; + } + + return 0; +} + +static int pixcir_stop(struct pixcir_i2c_ts_data *ts) +{ + int error; + + /* Disable interrupt generation */ + error = pixcir_int_enable(ts, false); + if (error) { + dev_err(&ts->client->dev, + "Failed to disable interrupt generation: %d\n", + error); + return error; + } + + /* Exit ISR if running, no more report parsing */ + ts->running = false; + mb(); /* update status before we synchronize irq */ + + /* Wait till running ISR is complete */ + synchronize_irq(ts->client->irq); + + return 0; +} + +static int pixcir_input_open(struct input_dev *dev) +{ + struct pixcir_i2c_ts_data *ts = input_get_drvdata(dev); + + return pixcir_start(ts); +} + +static void pixcir_input_close(struct input_dev *dev) +{ + struct pixcir_i2c_ts_data *ts = input_get_drvdata(dev); + + pixcir_stop(ts); +} + #ifdef CONFIG_PM_SLEEP static int pixcir_i2c_ts_suspend(struct device *dev) { @@ -156,6 +314,8 @@ static int pixcir_i2c_ts_probe(struct i2c_client *client, input->name = client->name; input->id.bustype = BUS_I2C; + input->open = pixcir_input_open; + input->close = pixcir_input_close; input->dev.parent = &client->dev; __set_bit(EV_KEY, input->evbit); @@ -176,11 +336,22 @@ static int pixcir_i2c_ts_probe(struct i2c_client *client, return error; } + /* Always be in IDLE mode to save power, device supports auto wake */ + error = pixcir_set_power_mode(tsdata, PIXCIR_POWER_IDLE); + if (error) { + dev_err(dev, "Failed to set IDLE mode\n"); + return error; + } + + /* Stop device till opened */ + error = pixcir_stop(tsdata); + if (error) + return error; + error = 
input_register_device(input); if (error) return error; - i2c_set_clientdata(client, tsdata); device_init_wakeup(&client->dev, 1); return 0; @@ -188,13 +359,8 @@ static int pixcir_i2c_ts_probe(struct i2c_client *client, static int pixcir_i2c_ts_remove(struct i2c_client *client) { - struct pixcir_i2c_ts_data *tsdata = i2c_get_clientdata(client); - device_init_wakeup(&client->dev, 0); - tsdata->exiting = true; - mb(); - return 0; } diff --git a/include/linux/input/pixcir_ts.h b/include/linux/input/pixcir_ts.h index 7163d91c0373..7942804464d3 100644 --- a/include/linux/input/pixcir_ts.h +++ b/include/linux/input/pixcir_ts.h @@ -1,6 +1,48 @@ #ifndef _PIXCIR_I2C_TS_H #define _PIXCIR_I2C_TS_H +/* + * Register map + */ +#define PIXCIR_REG_POWER_MODE 51 +#define PIXCIR_REG_INT_MODE 52 + +/* + * Power modes: + * active: max scan speed + * idle: lower scan speed with automatic transition to active on touch + * halt: datasheet says sleep but this is more like halt as the chip + * clocks are cut and it can only be brought out of this mode + * using the RESET pin. + */ +enum pixcir_power_mode { + PIXCIR_POWER_ACTIVE, + PIXCIR_POWER_IDLE, + PIXCIR_POWER_HALT, +}; + +#define PIXCIR_POWER_MODE_MASK 0x03 +#define PIXCIR_POWER_ALLOW_IDLE (1UL << 2) + +/* + * Interrupt modes: + * periodical: interrupt is asserted periodically + * diff coordinates: interrupt is asserted when coordinates change + * level on touch: interrupt level asserted during touch + * pulse on touch: interrupt pulse asserted during touch + */ +enum pixcir_int_mode { + PIXCIR_INT_PERIODICAL, + PIXCIR_INT_DIFF_COORD, + PIXCIR_INT_LEVEL_TOUCH, + PIXCIR_INT_PULSE_TOUCH, +}; + +#define PIXCIR_INT_MODE_MASK 0x03 +#define PIXCIR_INT_ENABLE (1UL << 3) +#define PIXCIR_INT_POL_HIGH (1UL << 2) + struct pixcir_ts_platform_data { int (*attb_read_val)(void); int x_max; -- cgit v1.2.3-71-gd317 From 0dfc8d41bfa091a61354eea73199a5af0eaae9c0 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Sun, 18 May 2014 22:46:43 -0700 Subject: Input: pixcir_i2c_ts - get rid of pdata->attb_read_val() Get rid of the attb_read_val() platform hook. Instead, read the ATTB gpio directly from the driver. Fail if a valid ATTB gpio is not provided by platform data.
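As a migration illustration (a hypothetical board file; the GPIO number and axis limits are invented), the callback hook is replaced by a plain GPIO number in platform data:

#include <linux/input/pixcir_ts.h>

/* Hypothetical board: ATTB wired to GPIO 25 */
static struct pixcir_ts_platform_data example_pixcir_pdata = {
	.x_max		= 480,
	.y_max		= 320,
	.gpio_attb	= 25,	/* replaces the attb_read_val() hook */
};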
Signed-off-by: Roger Quadros Acked-by: Mugunthan V N Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/pixcir_i2c_ts.c | 16 +++++++++++++++- include/linux/input/pixcir_ts.h | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/pixcir_i2c_ts.c b/drivers/input/touchscreen/pixcir_i2c_ts.c index 6c6f6dacb858..f2c0ae18e24a 100644 --- a/drivers/input/touchscreen/pixcir_i2c_ts.c +++ b/drivers/input/touchscreen/pixcir_i2c_ts.c @@ -24,6 +24,7 @@ #include #include #include +#include struct pixcir_i2c_ts_data { struct i2c_client *client; @@ -87,11 +88,12 @@ static void pixcir_ts_poscheck(struct pixcir_i2c_ts_data *data) static irqreturn_t pixcir_ts_isr(int irq, void *dev_id) { struct pixcir_i2c_ts_data *tsdata = dev_id; + const struct pixcir_ts_platform_data *pdata = tsdata->chip; while (tsdata->running) { pixcir_ts_poscheck(tsdata); - if (tsdata->chip->attb_read_val()) + if (gpio_get_value(pdata->gpio_attb)) break; msleep(20); @@ -298,6 +300,11 @@ static int pixcir_i2c_ts_probe(struct i2c_client *client, return -EINVAL; } + if (!gpio_is_valid(pdata->gpio_attb)) { + dev_err(dev, "Invalid gpio_attb in pdata\n"); + return -EINVAL; + } + tsdata = devm_kzalloc(dev, sizeof(*tsdata), GFP_KERNEL); if (!tsdata) return -ENOMEM; @@ -328,6 +335,13 @@ static int pixcir_i2c_ts_probe(struct i2c_client *client, input_set_drvdata(input, tsdata); + error = devm_gpio_request_one(dev, pdata->gpio_attb, + GPIOF_DIR_IN, "pixcir_i2c_attb"); + if (error) { + dev_err(dev, "Failed to request ATTB gpio\n"); + return error; + } + error = devm_request_threaded_irq(dev, client->irq, NULL, pixcir_ts_isr, IRQF_TRIGGER_FALLING | IRQF_ONESHOT, client->name, tsdata); diff --git a/include/linux/input/pixcir_ts.h b/include/linux/input/pixcir_ts.h index 7942804464d3..160cf353aa39 100644 --- a/include/linux/input/pixcir_ts.h +++ b/include/linux/input/pixcir_ts.h @@ -44,9 +44,9 @@ enum pixcir_int_mode { #define PIXCIR_INT_POL_HIGH (1UL << 2) struct pixcir_ts_platform_data { - int (*attb_read_val)(void); int x_max; int y_max; + int gpio_attb; /* GPIO connected to ATTB line */ }; #endif -- cgit v1.2.3-71-gd317 From 2cefdb1f0a27150755ef2730bafc58bf2ed16571 Mon Sep 17 00:00:00 2001 From: Nick Dyer Date: Sun, 18 May 2014 22:59:20 -0700 Subject: Input: atmel_mxt_ts - remove unnecessary platform data It is not necessary to download these values to the maXTouch chip on every probe, since they are stored in NVRAM. Keeping them in sync with the config array/file makes life difficult when tuning the device, and requires a new kernel build for minor tweaks. These parameters represent only a tiny subset of the available configuration options; tracking all of these options in platform data would be an endless task. In addition, different versions of maXTouch chips may have these values in different places, or may not even have them at all. Having these values also makes life more complex for device tree and other platforms, where having to define a static configuration isn't helpful.
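After this change a board file only describes geometry, orientation and IRQ wiring. A hedged sketch of the commonly set remaining fields (the values are placeholders, not a real board):

#include <linux/i2c/atmel_mxt_ts.h>
#include <linux/interrupt.h>

/* Placeholder values: touch tuning now lives in the chip's NVRAM config */
static struct mxt_platform_data example_mxt_pdata = {
	.x_size   = 800,
	.y_size   = 480,
	.orient   = MXT_DIAGONAL,
	.irqflags = IRQF_TRIGGER_FALLING,
};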
Signed-off-by: Nick Dyer Acked-by: Benson Leung Acked-by: Yufeng Shen Signed-off-by: Dmitry Torokhov --- arch/arm/mach-s5pv210/mach-goni.c | 5 ---- drivers/input/touchscreen/atmel_mxt_ts.c | 50 ------------------------------- drivers/platform/chrome/chromeos_laptop.c | 10 ------- include/linux/i2c/atmel_mxt_ts.h | 6 +--- 4 files changed, 1 insertion(+), 70 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-s5pv210/mach-goni.c b/arch/arm/mach-s5pv210/mach-goni.c index b41a38a75844..e549ecf0e5dc 100644 --- a/arch/arm/mach-s5pv210/mach-goni.c +++ b/arch/arm/mach-s5pv210/mach-goni.c @@ -239,13 +239,8 @@ static void __init goni_radio_init(void) /* TSP */ static struct mxt_platform_data qt602240_platform_data = { - .x_line = 17, - .y_line = 11, .x_size = 800, .y_size = 480, - .blen = 0x21, - .threshold = 0x28, - .voltage = 2800000, /* 2.8V */ .orient = MXT_DIAGONAL, .irqflags = IRQF_TRIGGER_FALLING, }; diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index a70400754e92..7eb515caf215 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -685,54 +685,6 @@ static int mxt_make_highchg(struct mxt_data *data) return 0; } -static void mxt_handle_pdata(struct mxt_data *data) -{ - const struct mxt_platform_data *pdata = data->pdata; - u8 voltage; - - /* Set touchscreen lines */ - mxt_write_object(data, MXT_TOUCH_MULTI_T9, MXT_TOUCH_XSIZE, - pdata->x_line); - mxt_write_object(data, MXT_TOUCH_MULTI_T9, MXT_TOUCH_YSIZE, - pdata->y_line); - - /* Set touchscreen orient */ - mxt_write_object(data, MXT_TOUCH_MULTI_T9, MXT_TOUCH_ORIENT, - pdata->orient); - - /* Set touchscreen burst length */ - mxt_write_object(data, MXT_TOUCH_MULTI_T9, - MXT_TOUCH_BLEN, pdata->blen); - - /* Set touchscreen threshold */ - mxt_write_object(data, MXT_TOUCH_MULTI_T9, - MXT_TOUCH_TCHTHR, pdata->threshold); - - /* Set touchscreen resolution */ - mxt_write_object(data, MXT_TOUCH_MULTI_T9, - MXT_TOUCH_XRANGE_LSB, (pdata->x_size - 1) & 0xff); - mxt_write_object(data, MXT_TOUCH_MULTI_T9, - MXT_TOUCH_XRANGE_MSB, (pdata->x_size - 1) >> 8); - mxt_write_object(data, MXT_TOUCH_MULTI_T9, - MXT_TOUCH_YRANGE_LSB, (pdata->y_size - 1) & 0xff); - mxt_write_object(data, MXT_TOUCH_MULTI_T9, - MXT_TOUCH_YRANGE_MSB, (pdata->y_size - 1) >> 8); - - /* Set touchscreen voltage */ - if (pdata->voltage) { - if (pdata->voltage < MXT_VOLTAGE_DEFAULT) { - voltage = (MXT_VOLTAGE_DEFAULT - pdata->voltage) / - MXT_VOLTAGE_STEP; - voltage = 0xff - voltage + 1; - } else - voltage = (pdata->voltage - MXT_VOLTAGE_DEFAULT) / - MXT_VOLTAGE_STEP; - - mxt_write_object(data, MXT_SPT_CTECONFIG_T28, - MXT_CTE_VOLTAGE, voltage); - } -} - static int mxt_get_info(struct mxt_data *data) { struct i2c_client *client = data->client; @@ -840,8 +792,6 @@ static int mxt_initialize(struct mxt_data *data) if (error) goto err_free_object_table; - mxt_handle_pdata(data); - /* Backup to memory */ mxt_write_object(data, MXT_GEN_COMMAND_T6, MXT_COMMAND_BACKUPNV, diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c index 7f3aad0e115c..2559a0407c58 100644 --- a/drivers/platform/chrome/chromeos_laptop.c +++ b/drivers/platform/chrome/chromeos_laptop.c @@ -85,13 +85,8 @@ static struct i2c_board_info tsl2563_als_device = { }; static struct mxt_platform_data atmel_224s_tp_platform_data = { - .x_line = 18, - .y_line = 12, .x_size = 102*20, .y_size = 68*20, - .blen = 0x80, /* Gain setting is in upper 4 bits */ - .threshold = 0x32, - .voltage 
= 0, /* 3.3V */ .orient = MXT_VERTICAL_FLIP, .irqflags = IRQF_TRIGGER_FALLING, .is_tp = true, @@ -110,13 +105,8 @@ static struct i2c_board_info atmel_224s_tp_device = { }; static struct mxt_platform_data atmel_1664s_platform_data = { - .x_line = 32, - .y_line = 50, .x_size = 1700, .y_size = 2560, - .blen = 0x89, /* Gain setting is in upper 4 bits */ - .threshold = 0x28, - .voltage = 0, /* 3.3V */ .orient = MXT_ROTATED_90_COUNTER, .irqflags = IRQF_TRIGGER_FALLING, .is_tp = false, diff --git a/include/linux/i2c/atmel_mxt_ts.h b/include/linux/i2c/atmel_mxt_ts.h index 99e379b74398..eff0cdc08843 100644 --- a/include/linux/i2c/atmel_mxt_ts.h +++ b/include/linux/i2c/atmel_mxt_ts.h @@ -33,14 +33,10 @@ struct mxt_platform_data { const u8 *config; size_t config_length; - unsigned int x_line; - unsigned int y_line; unsigned int x_size; unsigned int y_size; - unsigned int blen; - unsigned int threshold; - unsigned int voltage; unsigned char orient; + unsigned long irqflags; bool is_tp; const unsigned int key_map[MXT_NUM_GPIO]; -- cgit v1.2.3-71-gd317 From fb5e4c3ee140b29e1935b4bbb19c319177bed231 Mon Sep 17 00:00:00 2001 From: Nick Dyer Date: Sun, 18 May 2014 23:00:15 -0700 Subject: Input: atmel_mxt_ts - improve T19 GPIO keys handling * The mapping of the GPIO numbers into the T19 status byte varies between different maXTouch chips. Some have up to 7 GPIOs. Allowing a keycode array of up to 8 items is simpler and more generic. So replace the #define with a configurable number of keys, which also allows the removal of is_tp. * Rename the platform data parameters to include "t19" to prevent confusion with the T15 key array. * Probe aborts early when pdata is NULL, so there is no need to check. * Move "int i" to the beginning of the function (mixed declarations and code). * Use API calls rather than __set_bit(). * Remove the unused dev variable.
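With this scheme, bit i of the T19 status byte maps directly to t19_keymap[i] (buttons are active low). A board-file sketch mirroring the chromeos_laptop example in the diff below, where only the sixth GPIO carries a physical button:

#include <linux/input.h>
#include <linux/i2c/atmel_mxt_ts.h>

/* One keycode per T19 status bit; unused bits stay KEY_RESERVED */
static const unsigned int example_t19_keys[] = {
	KEY_RESERVED, KEY_RESERVED, KEY_RESERVED,
	KEY_RESERVED, KEY_RESERVED, BTN_LEFT,
};

static struct mxt_platform_data example_tp_pdata = {
	.t19_num_keys = ARRAY_SIZE(example_t19_keys),
	.t19_keymap   = example_t19_keys,
};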
Signed-off-by: Nick Dyer Acked-by: Yufeng Shen Reviewed-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/atmel_mxt_ts.c | 44 ++++++++++++------------------- drivers/platform/chrome/chromeos_laptop.c | 17 +++++++----- include/linux/i2c/atmel_mxt_ts.h | 7 ++--- 3 files changed, 30 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 7eb515caf215..65df362cf327 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -180,12 +180,6 @@ #define MXT_FWRESET_TIME 175 /* msec */ -/* MXT_SPT_GPIOPWM_T19 field */ -#define MXT_GPIO0_MASK 0x04 -#define MXT_GPIO1_MASK 0x08 -#define MXT_GPIO2_MASK 0x10 -#define MXT_GPIO3_MASK 0x20 - /* Command to unlock bootloader */ #define MXT_UNLOCK_CMD_MSB 0xaa #define MXT_UNLOCK_CMD_LSB 0xdc @@ -250,7 +244,6 @@ struct mxt_data { const struct mxt_platform_data *pdata; struct mxt_object *object_table; struct mxt_info info; - bool is_tp; unsigned int irq; unsigned int max_x; @@ -515,15 +508,16 @@ static int mxt_write_object(struct mxt_data *data, static void mxt_input_button(struct mxt_data *data, struct mxt_message *message) { struct input_dev *input = data->input_dev; + const struct mxt_platform_data *pdata = data->pdata; bool button; int i; /* Active-low switch */ - for (i = 0; i < MXT_NUM_GPIO; i++) { - if (data->pdata->key_map[i] == KEY_RESERVED) + for (i = 0; i < pdata->t19_num_keys; i++) { + if (pdata->t19_keymap[i] == KEY_RESERVED) continue; - button = !(message->message[0] & MXT_GPIO0_MASK << i); - input_report_key(input, data->pdata->key_map[i], button); + button = !(message->message[0] & (1 << i)); + input_report_key(input, pdata->t19_keymap[i], button); } } @@ -1084,6 +1078,8 @@ static int mxt_probe(struct i2c_client *client, struct input_dev *input_dev; int error; unsigned int num_mt_slots; + unsigned int mt_flags = 0; + int i; if (!pdata) return -EINVAL; @@ -1096,10 +1092,7 @@ static int mxt_probe(struct i2c_client *client, goto err_free_mem; } - data->is_tp = pdata && pdata->is_tp; - - input_dev->name = (data->is_tp) ? 
"Atmel maXTouch Touchpad" : - "Atmel maXTouch Touchscreen"; + input_dev->name = "Atmel maXTouch Touchscreen"; snprintf(data->phys, sizeof(data->phys), "i2c-%u-%04x/input0", client->adapter->nr, client->addr); @@ -1125,20 +1118,15 @@ static int mxt_probe(struct i2c_client *client, __set_bit(EV_KEY, input_dev->evbit); __set_bit(BTN_TOUCH, input_dev->keybit); - if (data->is_tp) { - int i; - __set_bit(INPUT_PROP_POINTER, input_dev->propbit); + if (pdata->t19_num_keys) { __set_bit(INPUT_PROP_BUTTONPAD, input_dev->propbit); - for (i = 0; i < MXT_NUM_GPIO; i++) - if (pdata->key_map[i] != KEY_RESERVED) - __set_bit(pdata->key_map[i], input_dev->keybit); + for (i = 0; i < pdata->t19_num_keys; i++) + if (pdata->t19_keymap[i] != KEY_RESERVED) + input_set_capability(input_dev, EV_KEY, + pdata->t19_keymap[i]); - __set_bit(BTN_TOOL_FINGER, input_dev->keybit); - __set_bit(BTN_TOOL_DOUBLETAP, input_dev->keybit); - __set_bit(BTN_TOOL_TRIPLETAP, input_dev->keybit); - __set_bit(BTN_TOOL_QUADTAP, input_dev->keybit); - __set_bit(BTN_TOOL_QUINTTAP, input_dev->keybit); + mt_flags |= INPUT_MT_POINTER; input_abs_set_res(input_dev, ABS_X, MXT_PIXELS_PER_MM); input_abs_set_res(input_dev, ABS_Y, MXT_PIXELS_PER_MM); @@ -1146,6 +1134,8 @@ static int mxt_probe(struct i2c_client *client, MXT_PIXELS_PER_MM); input_abs_set_res(input_dev, ABS_MT_POSITION_Y, MXT_PIXELS_PER_MM); + + input_dev->name = "Atmel maXTouch Touchpad"; } /* For single touch */ @@ -1158,7 +1148,7 @@ static int mxt_probe(struct i2c_client *client, /* For multi touch */ num_mt_slots = data->T9_reportid_max - data->T9_reportid_min + 1; - error = input_mt_init_slots(input_dev, num_mt_slots, 0); + error = input_mt_init_slots(input_dev, num_mt_slots, mt_flags); if (error) goto err_free_object; input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c index 2559a0407c58..8b7523ab62e5 100644 --- a/drivers/platform/chrome/chromeos_laptop.c +++ b/drivers/platform/chrome/chromeos_laptop.c @@ -84,16 +84,22 @@ static struct i2c_board_info tsl2563_als_device = { I2C_BOARD_INFO("tsl2563", TAOS_ALS_I2C_ADDR), }; +static int mxt_t19_keys[] = { + KEY_RESERVED, + KEY_RESERVED, + KEY_RESERVED, + KEY_RESERVED, + KEY_RESERVED, + BTN_LEFT +}; + static struct mxt_platform_data atmel_224s_tp_platform_data = { .x_size = 102*20, .y_size = 68*20, .orient = MXT_VERTICAL_FLIP, .irqflags = IRQF_TRIGGER_FALLING, - .is_tp = true, - .key_map = { KEY_RESERVED, - KEY_RESERVED, - KEY_RESERVED, - BTN_LEFT }, + .t19_num_keys = ARRAY_SIZE(mxt_t19_keys), + .t19_keymap = mxt_t19_keys, .config = NULL, .config_length = 0, }; @@ -109,7 +115,6 @@ static struct mxt_platform_data atmel_1664s_platform_data = { .y_size = 2560, .orient = MXT_ROTATED_90_COUNTER, .irqflags = IRQF_TRIGGER_FALLING, - .is_tp = false, .config = NULL, .config_length = 0, }; diff --git a/include/linux/i2c/atmel_mxt_ts.h b/include/linux/i2c/atmel_mxt_ts.h index eff0cdc08843..d26080dc606c 100644 --- a/include/linux/i2c/atmel_mxt_ts.h +++ b/include/linux/i2c/atmel_mxt_ts.h @@ -15,9 +15,6 @@ #include -/* For key_map array */ -#define MXT_NUM_GPIO 4 - /* Orient */ #define MXT_NORMAL 0x0 #define MXT_DIAGONAL 0x1 @@ -38,8 +35,8 @@ struct mxt_platform_data { unsigned char orient; unsigned long irqflags; - bool is_tp; - const unsigned int key_map[MXT_NUM_GPIO]; + u8 t19_num_keys; + const unsigned int *t19_keymap; }; #endif /* __LINUX_ATMEL_MXT_TS_H */ -- cgit v1.2.3-71-gd317 From c3f78043d5aea39205a14c580babd87fbdcfa148 Mon Sep 17 00:00:00 2001 
From: Nick Dyer Date: Sun, 18 May 2014 23:04:46 -0700 Subject: Input: atmel_mxt_ts - implement CRC check for configuration data The configuration is stored in NVRAM on the maXTouch chip. When the device is reset, it reports a CRC of the stored configuration values. Therefore it isn't necessary to send the configuration on each probe; we can check that the CRC matches and avoid a time-consuming backup/reset cycle. Signed-off-by: Nick Dyer Acked-by: Benson Leung Acked-by: Yufeng Shen Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/atmel_mxt_ts.c | 60 +++++++++++++++++++++++++++----- include/linux/i2c/atmel_mxt_ts.h | 1 + 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 278f364ec6e1..61f9ef221d12 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -188,6 +188,7 @@ #define MXT_BACKUP_TIME 50 /* msec */ #define MXT_RESET_TIME 200 /* msec */ #define MXT_RESET_TIMEOUT 3000 /* msec */ +#define MXT_CRC_TIMEOUT 1000 /* msec */ #define MXT_FW_RESET_TIME 3000 /* msec */ #define MXT_FW_CHG_TIMEOUT 300 /* msec */ @@ -259,6 +260,7 @@ struct mxt_data { unsigned int max_x; unsigned int max_y; bool in_bootloader; + u32 config_crc; /* Cached parameters from object table */ u8 T6_reportid; @@ -272,6 +274,9 @@ struct mxt_data { /* for reset handling */ struct completion reset_completion; + + /* for config update handling */ + struct completion crc_completion; }; static size_t mxt_obj_size(const struct mxt_object *obj) @@ -636,7 +641,7 @@ static void mxt_input_touchevent(struct mxt_data *data, } } -static unsigned mxt_extract_T6_csum(const u8 *csum) +static u32 mxt_extract_T6_csum(const u8 *csum) { return csum[0] | (csum[1] << 8) | (csum[2] << 16); } @@ -654,6 +659,7 @@ static irqreturn_t mxt_process_messages_until_invalid(struct mxt_data *data) struct device *dev = &data->client->dev; u8 reportid; bool update_input = false; + u32 crc; do { if (mxt_read_message(data, &message)) { @@ -665,9 +671,15 @@ static irqreturn_t mxt_process_messages_until_invalid(struct mxt_data *data) if (reportid == data->T6_reportid) { u8 status = payload[0]; - unsigned csum = mxt_extract_T6_csum(&payload[1]); + + crc = mxt_extract_T6_csum(&payload[1]); + if (crc != data->config_crc) { + data->config_crc = crc; + complete(&data->crc_completion); + } + dev_dbg(dev, "Status: %02x Config Checksum: %06x\n", - status, csum); + status, data->config_crc); if (status & MXT_T6_STATUS_RESET) complete(&data->reset_completion); @@ -757,6 +769,24 @@ static int mxt_soft_reset(struct mxt_data *data) return 0; } +static void mxt_update_crc(struct mxt_data *data, u8 cmd, u8 value) +{ + /* + * On failure, CRC is set to 0 and config will always be + * downloaded. + */ + data->config_crc = 0; + reinit_completion(&data->crc_completion); + + mxt_t6_command(data, cmd, value, true); + + /* + * Wait for crc message. On failure, CRC is set to 0 and config will + * always be downloaded.
+ */ + mxt_wait_for_completion(data, &data->crc_completion, MXT_CRC_TIMEOUT); +} + static int mxt_check_reg_init(struct mxt_data *data) { const struct mxt_platform_data *pdata = data->pdata; @@ -771,6 +801,16 @@ static int mxt_check_reg_init(struct mxt_data *data) return 0; } + mxt_update_crc(data, MXT_COMMAND_REPORTALL, 1); + + if (data->config_crc == pdata->config_crc) { + dev_info(dev, "Config CRC 0x%06X: OK\n", data->config_crc); + return 0; + } + + dev_info(dev, "Config CRC 0x%06X: does not match 0x%06X\n", + data->config_crc, pdata->config_crc); + for (i = 0; i < data->info.object_num; i++) { object = data->object_table + i; @@ -790,6 +830,14 @@ static int mxt_check_reg_init(struct mxt_data *data) index += size; } + mxt_update_crc(data, MXT_COMMAND_BACKUPNV, MXT_BACKUP_VALUE); + + ret = mxt_soft_reset(data); + if (ret) + return ret; + + dev_info(dev, "Config successfully updated\n"); + return 0; } @@ -929,11 +977,6 @@ static int mxt_initialize(struct mxt_data *data) goto err_free_object_table; } - error = mxt_t6_command(data, MXT_COMMAND_BACKUPNV, - MXT_BACKUP_VALUE, false); - if (!error) - mxt_soft_reset(data); - /* Update matrix size at info struct */ error = mxt_read_reg(client, MXT_MATRIX_X_SIZE, &val); if (error) @@ -1263,6 +1306,7 @@ static int mxt_probe(struct i2c_client *client, init_completion(&data->bl_completion); init_completion(&data->reset_completion); + init_completion(&data->crc_completion); mxt_calc_resolution(data); diff --git a/include/linux/i2c/atmel_mxt_ts.h b/include/linux/i2c/atmel_mxt_ts.h index d26080dc606c..9f92135b6620 100644 --- a/include/linux/i2c/atmel_mxt_ts.h +++ b/include/linux/i2c/atmel_mxt_ts.h @@ -29,6 +29,7 @@ struct mxt_platform_data { const u8 *config; size_t config_length; + u32 config_crc; unsigned int x_size; unsigned int y_size; -- cgit v1.2.3-71-gd317 From fd1159318e55e901cf269de90163b19fd62938cb Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 17 May 2014 00:03:54 +0400 Subject: can: add Renesas R-Car CAN driver Add support for the CAN controller found in Renesas R-Car SoCs. Signed-off-by: Sergei Shtylyov Signed-off-by: Marc Kleine-Budde --- drivers/net/can/Kconfig | 10 + drivers/net/can/Makefile | 1 + drivers/net/can/rcar_can.c | 876 ++++++++++++++++++++++++++++++++++ include/linux/can/platform/rcar_can.h | 17 + 4 files changed, 904 insertions(+) create mode 100644 drivers/net/can/rcar_can.c create mode 100644 include/linux/can/platform/rcar_can.h (limited to 'include/linux') diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index ac67afa7735c..714b18790caf 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -119,6 +119,16 @@ config CAN_GRCAN endian syntheses of the cores would need some modifications on the hardware level to work. +config CAN_RCAR + tristate "Renesas R-Car CAN controller" + depends on ARM + ---help--- + Say Y here if you want to use the CAN controller found on Renesas + R-Car SoCs. + + To compile this driver as a module, choose M here: the module will + be called rcar_can.
+ source "drivers/net/can/mscan/Kconfig" source "drivers/net/can/sja1000/Kconfig" diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile index c42058868b0f..90f538c73f8c 100644 --- a/drivers/net/can/Makefile +++ b/drivers/net/can/Makefile @@ -25,5 +25,6 @@ obj-$(CONFIG_CAN_JANZ_ICAN3) += janz-ican3.o obj-$(CONFIG_CAN_FLEXCAN) += flexcan.o obj-$(CONFIG_PCH_CAN) += pch_can.o obj-$(CONFIG_CAN_GRCAN) += grcan.o +obj-$(CONFIG_CAN_RCAR) += rcar_can.o ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG diff --git a/drivers/net/can/rcar_can.c b/drivers/net/can/rcar_can.c new file mode 100644 index 000000000000..5268d216ecfa --- /dev/null +++ b/drivers/net/can/rcar_can.c @@ -0,0 +1,876 @@ +/* Renesas R-Car CAN device driver + * + * Copyright (C) 2013 Cogent Embedded, Inc. + * Copyright (C) 2013 Renesas Solutions Corp. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RCAR_CAN_DRV_NAME "rcar_can" + +/* Mailbox configuration: + * mailbox 60 - 63 - Rx FIFO mailboxes + * mailbox 56 - 59 - Tx FIFO mailboxes + * non-FIFO mailboxes are not used + */ +#define RCAR_CAN_N_MBX 64 /* Number of mailboxes in non-FIFO mode */ +#define RCAR_CAN_RX_FIFO_MBX 60 /* Mailbox - window to Rx FIFO */ +#define RCAR_CAN_TX_FIFO_MBX 56 /* Mailbox - window to Tx FIFO */ +#define RCAR_CAN_FIFO_DEPTH 4 + +/* Mailbox registers structure */ +struct rcar_can_mbox_regs { + u32 id; /* IDE and RTR bits, SID and EID */ + u8 stub; /* Not used */ + u8 dlc; /* Data Length Code - bits [0..3] */ + u8 data[8]; /* Data Bytes */ + u8 tsh; /* Time Stamp Higher Byte */ + u8 tsl; /* Time Stamp Lower Byte */ +}; + +struct rcar_can_regs { + struct rcar_can_mbox_regs mb[RCAR_CAN_N_MBX]; /* Mailbox registers */ + u32 mkr_2_9[8]; /* Mask Registers 2-9 */ + u32 fidcr[2]; /* FIFO Received ID Compare Register */ + u32 mkivlr1; /* Mask Invalid Register 1 */ + u32 mier1; /* Mailbox Interrupt Enable Register 1 */ + u32 mkr_0_1[2]; /* Mask Registers 0-1 */ + u32 mkivlr0; /* Mask Invalid Register 0*/ + u32 mier0; /* Mailbox Interrupt Enable Register 0 */ + u8 pad_440[0x3c0]; + u8 mctl[64]; /* Message Control Registers */ + u16 ctlr; /* Control Register */ + u16 str; /* Status register */ + u8 bcr[3]; /* Bit Configuration Register */ + u8 clkr; /* Clock Select Register */ + u8 rfcr; /* Receive FIFO Control Register */ + u8 rfpcr; /* Receive FIFO Pointer Control Register */ + u8 tfcr; /* Transmit FIFO Control Register */ + u8 tfpcr; /* Transmit FIFO Pointer Control Register */ + u8 eier; /* Error Interrupt Enable Register */ + u8 eifr; /* Error Interrupt Factor Judge Register */ + u8 recr; /* Receive Error Count Register */ + u8 tecr; /* Transmit Error Count Register */ + u8 ecsr; /* Error Code Store Register */ + u8 cssr; /* Channel Search Support Register */ + u8 mssr; /* Mailbox Search Status Register */ + u8 msmr; /* Mailbox Search Mode Register */ + u16 tsr; /* Time Stamp Register */ + u8 afsr; /* Acceptance Filter Support Register */ + u8 pad_857; + u8 tcr; /* Test Control Register */ + u8 pad_859[7]; + u8 ier; /* Interrupt Enable Register */ + u8 isr; /* Interrupt Status Register */ + u8 pad_862; + u8 mbsmr; /* Mailbox Search Mask Register */ +}; + +struct rcar_can_priv { + struct can_priv can; /* Must be the first 
member! */ + struct net_device *ndev; + struct napi_struct napi; + struct rcar_can_regs __iomem *regs; + struct clk *clk; + u8 tx_dlc[RCAR_CAN_FIFO_DEPTH]; + u32 tx_head; + u32 tx_tail; + u8 clock_select; + u8 ier; +}; + +static const struct can_bittiming_const rcar_can_bittiming_const = { + .name = RCAR_CAN_DRV_NAME, + .tseg1_min = 4, + .tseg1_max = 16, + .tseg2_min = 2, + .tseg2_max = 8, + .sjw_max = 4, + .brp_min = 1, + .brp_max = 1024, + .brp_inc = 1, +}; + +/* Control Register bits */ +#define RCAR_CAN_CTLR_BOM (3 << 11) /* Bus-Off Recovery Mode Bits */ +#define RCAR_CAN_CTLR_BOM_ENT (1 << 11) /* Entry to halt mode */ + /* at bus-off entry */ +#define RCAR_CAN_CTLR_SLPM (1 << 10) +#define RCAR_CAN_CTLR_CANM (3 << 8) /* Operating Mode Select Bit */ +#define RCAR_CAN_CTLR_CANM_HALT (1 << 9) +#define RCAR_CAN_CTLR_CANM_RESET (1 << 8) +#define RCAR_CAN_CTLR_CANM_FORCE_RESET (3 << 8) +#define RCAR_CAN_CTLR_MLM (1 << 3) /* Message Lost Mode Select */ +#define RCAR_CAN_CTLR_IDFM (3 << 1) /* ID Format Mode Select Bits */ +#define RCAR_CAN_CTLR_IDFM_MIXED (1 << 2) /* Mixed ID mode */ +#define RCAR_CAN_CTLR_MBM (1 << 0) /* Mailbox Mode select */ + +/* Status Register bits */ +#define RCAR_CAN_STR_RSTST (1 << 8) /* Reset Status Bit */ + +/* FIFO Received ID Compare Registers 0 and 1 bits */ +#define RCAR_CAN_FIDCR_IDE (1 << 31) /* ID Extension Bit */ +#define RCAR_CAN_FIDCR_RTR (1 << 30) /* Remote Transmission Request Bit */ + +/* Receive FIFO Control Register bits */ +#define RCAR_CAN_RFCR_RFEST (1 << 7) /* Receive FIFO Empty Status Flag */ +#define RCAR_CAN_RFCR_RFE (1 << 0) /* Receive FIFO Enable */ + +/* Transmit FIFO Control Register bits */ +#define RCAR_CAN_TFCR_TFUST (7 << 1) /* Transmit FIFO Unsent Message */ + /* Number Status Bits */ +#define RCAR_CAN_TFCR_TFUST_SHIFT 1 /* Offset of Transmit FIFO Unsent */ + /* Message Number Status Bits */ +#define RCAR_CAN_TFCR_TFE (1 << 0) /* Transmit FIFO Enable */ + +#define RCAR_CAN_N_RX_MKREGS1 2 /* Number of mask registers */ + /* for Rx mailboxes 0-31 */ +#define RCAR_CAN_N_RX_MKREGS2 8 + +/* Bit Configuration Register settings */ +#define RCAR_CAN_BCR_TSEG1(x) (((x) & 0x0f) << 20) +#define RCAR_CAN_BCR_BPR(x) (((x) & 0x3ff) << 8) +#define RCAR_CAN_BCR_SJW(x) (((x) & 0x3) << 4) +#define RCAR_CAN_BCR_TSEG2(x) ((x) & 0x07) + +/* Mailbox and Mask Registers bits */ +#define RCAR_CAN_IDE (1 << 31) +#define RCAR_CAN_RTR (1 << 30) +#define RCAR_CAN_SID_SHIFT 18 + +/* Mailbox Interrupt Enable Register 1 bits */ +#define RCAR_CAN_MIER1_RXFIE (1 << 28) /* Receive FIFO Interrupt Enable */ +#define RCAR_CAN_MIER1_TXFIE (1 << 24) /* Transmit FIFO Interrupt Enable */ + +/* Interrupt Enable Register bits */ +#define RCAR_CAN_IER_ERSIE (1 << 5) /* Error (ERS) Interrupt Enable Bit */ +#define RCAR_CAN_IER_RXFIE (1 << 4) /* Reception FIFO Interrupt */ + /* Enable Bit */ +#define RCAR_CAN_IER_TXFIE (1 << 3) /* Transmission FIFO Interrupt */ + /* Enable Bit */ +/* Interrupt Status Register bits */ +#define RCAR_CAN_ISR_ERSF (1 << 5) /* Error (ERS) Interrupt Status Bit */ +#define RCAR_CAN_ISR_RXFF (1 << 4) /* Reception FIFO Interrupt */ + /* Status Bit */ +#define RCAR_CAN_ISR_TXFF (1 << 3) /* Transmission FIFO Interrupt */ + /* Status Bit */ + +/* Error Interrupt Enable Register bits */ +#define RCAR_CAN_EIER_BLIE (1 << 7) /* Bus Lock Interrupt Enable */ +#define RCAR_CAN_EIER_OLIE (1 << 6) /* Overload Frame Transmit */ + /* Interrupt Enable */ +#define RCAR_CAN_EIER_ORIE (1 << 5) /* Receive Overrun Interrupt Enable */ +#define RCAR_CAN_EIER_BORIE (1 << 4) /* 
Bus-Off Recovery Interrupt Enable */ +#define RCAR_CAN_EIER_BOEIE (1 << 3) /* Bus-Off Entry Interrupt Enable */ +#define RCAR_CAN_EIER_EPIE (1 << 2) /* Error Passive Interrupt Enable */ +#define RCAR_CAN_EIER_EWIE (1 << 1) /* Error Warning Interrupt Enable */ +#define RCAR_CAN_EIER_BEIE (1 << 0) /* Bus Error Interrupt Enable */ + +/* Error Interrupt Factor Judge Register bits */ +#define RCAR_CAN_EIFR_BLIF (1 << 7) /* Bus Lock Detect Flag */ +#define RCAR_CAN_EIFR_OLIF (1 << 6) /* Overload Frame Transmission */ + /* Detect Flag */ +#define RCAR_CAN_EIFR_ORIF (1 << 5) /* Receive Overrun Detect Flag */ +#define RCAR_CAN_EIFR_BORIF (1 << 4) /* Bus-Off Recovery Detect Flag */ +#define RCAR_CAN_EIFR_BOEIF (1 << 3) /* Bus-Off Entry Detect Flag */ +#define RCAR_CAN_EIFR_EPIF (1 << 2) /* Error Passive Detect Flag */ +#define RCAR_CAN_EIFR_EWIF (1 << 1) /* Error Warning Detect Flag */ +#define RCAR_CAN_EIFR_BEIF (1 << 0) /* Bus Error Detect Flag */ + +/* Error Code Store Register bits */ +#define RCAR_CAN_ECSR_EDPM (1 << 7) /* Error Display Mode Select Bit */ +#define RCAR_CAN_ECSR_ADEF (1 << 6) /* ACK Delimiter Error Flag */ +#define RCAR_CAN_ECSR_BE0F (1 << 5) /* Bit Error (dominant) Flag */ +#define RCAR_CAN_ECSR_BE1F (1 << 4) /* Bit Error (recessive) Flag */ +#define RCAR_CAN_ECSR_CEF (1 << 3) /* CRC Error Flag */ +#define RCAR_CAN_ECSR_AEF (1 << 2) /* ACK Error Flag */ +#define RCAR_CAN_ECSR_FEF (1 << 1) /* Form Error Flag */ +#define RCAR_CAN_ECSR_SEF (1 << 0) /* Stuff Error Flag */ + +#define RCAR_CAN_NAPI_WEIGHT 4 +#define MAX_STR_READS 0x100 + +static void tx_failure_cleanup(struct net_device *ndev) +{ + int i; + + for (i = 0; i < RCAR_CAN_FIFO_DEPTH; i++) + can_free_echo_skb(ndev, i); +} + +static void rcar_can_error(struct net_device *ndev) +{ + struct rcar_can_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &ndev->stats; + struct can_frame *cf; + struct sk_buff *skb; + u8 eifr, txerr = 0, rxerr = 0; + + /* Propagate the error condition to the CAN stack */ + skb = alloc_can_err_skb(ndev, &cf); + + eifr = readb(&priv->regs->eifr); + if (eifr & (RCAR_CAN_EIFR_EWIF | RCAR_CAN_EIFR_EPIF)) { + txerr = readb(&priv->regs->tecr); + rxerr = readb(&priv->regs->recr); + if (skb) { + cf->can_id |= CAN_ERR_CRTL; + cf->data[6] = txerr; + cf->data[7] = rxerr; + } + } + if (eifr & RCAR_CAN_EIFR_BEIF) { + int rx_errors = 0, tx_errors = 0; + u8 ecsr; + + netdev_dbg(priv->ndev, "Bus error interrupt:\n"); + if (skb) { + cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_PROT; + cf->data[2] = CAN_ERR_PROT_UNSPEC; + } + ecsr = readb(&priv->regs->ecsr); + if (ecsr & RCAR_CAN_ECSR_ADEF) { + netdev_dbg(priv->ndev, "ACK Delimiter Error\n"); + tx_errors++; + writeb(~RCAR_CAN_ECSR_ADEF, &priv->regs->ecsr); + if (skb) + cf->data[3] |= CAN_ERR_PROT_LOC_ACK_DEL; + } + if (ecsr & RCAR_CAN_ECSR_BE0F) { + netdev_dbg(priv->ndev, "Bit Error (dominant)\n"); + tx_errors++; + writeb(~RCAR_CAN_ECSR_BE0F, &priv->regs->ecsr); + if (skb) + cf->data[2] |= CAN_ERR_PROT_BIT0; + } + if (ecsr & RCAR_CAN_ECSR_BE1F) { + netdev_dbg(priv->ndev, "Bit Error (recessive)\n"); + tx_errors++; + writeb(~RCAR_CAN_ECSR_BE1F, &priv->regs->ecsr); + if (skb) + cf->data[2] |= CAN_ERR_PROT_BIT1; + } + if (ecsr & RCAR_CAN_ECSR_CEF) { + netdev_dbg(priv->ndev, "CRC Error\n"); + rx_errors++; + writeb(~RCAR_CAN_ECSR_CEF, &priv->regs->ecsr); + if (skb) + cf->data[3] |= CAN_ERR_PROT_LOC_CRC_SEQ; + } + if (ecsr & RCAR_CAN_ECSR_AEF) { + netdev_dbg(priv->ndev, "ACK Error\n"); + tx_errors++; + writeb(~RCAR_CAN_ECSR_AEF, &priv->regs->ecsr); + if (skb) { + 
cf->can_id |= CAN_ERR_ACK; + cf->data[3] |= CAN_ERR_PROT_LOC_ACK; + } + } + if (ecsr & RCAR_CAN_ECSR_FEF) { + netdev_dbg(priv->ndev, "Form Error\n"); + rx_errors++; + writeb(~RCAR_CAN_ECSR_FEF, &priv->regs->ecsr); + if (skb) + cf->data[2] |= CAN_ERR_PROT_FORM; + } + if (ecsr & RCAR_CAN_ECSR_SEF) { + netdev_dbg(priv->ndev, "Stuff Error\n"); + rx_errors++; + writeb(~RCAR_CAN_ECSR_SEF, &priv->regs->ecsr); + if (skb) + cf->data[2] |= CAN_ERR_PROT_STUFF; + } + + priv->can.can_stats.bus_error++; + ndev->stats.rx_errors += rx_errors; + ndev->stats.tx_errors += tx_errors; + writeb(~RCAR_CAN_EIFR_BEIF, &priv->regs->eifr); + } + if (eifr & RCAR_CAN_EIFR_EWIF) { + netdev_dbg(priv->ndev, "Error warning interrupt\n"); + priv->can.state = CAN_STATE_ERROR_WARNING; + priv->can.can_stats.error_warning++; + /* Clear interrupt condition */ + writeb(~RCAR_CAN_EIFR_EWIF, &priv->regs->eifr); + if (skb) + cf->data[1] = txerr > rxerr ? CAN_ERR_CRTL_TX_WARNING : + CAN_ERR_CRTL_RX_WARNING; + } + if (eifr & RCAR_CAN_EIFR_EPIF) { + netdev_dbg(priv->ndev, "Error passive interrupt\n"); + priv->can.state = CAN_STATE_ERROR_PASSIVE; + priv->can.can_stats.error_passive++; + /* Clear interrupt condition */ + writeb(~RCAR_CAN_EIFR_EPIF, &priv->regs->eifr); + if (skb) + cf->data[1] = txerr > rxerr ? CAN_ERR_CRTL_TX_PASSIVE : + CAN_ERR_CRTL_RX_PASSIVE; + } + if (eifr & RCAR_CAN_EIFR_BOEIF) { + netdev_dbg(priv->ndev, "Bus-off entry interrupt\n"); + tx_failure_cleanup(ndev); + priv->ier = RCAR_CAN_IER_ERSIE; + writeb(priv->ier, &priv->regs->ier); + priv->can.state = CAN_STATE_BUS_OFF; + /* Clear interrupt condition */ + writeb(~RCAR_CAN_EIFR_BOEIF, &priv->regs->eifr); + can_bus_off(ndev); + if (skb) + cf->can_id |= CAN_ERR_BUSOFF; + } + if (eifr & RCAR_CAN_EIFR_ORIF) { + netdev_dbg(priv->ndev, "Receive overrun error interrupt\n"); + ndev->stats.rx_over_errors++; + ndev->stats.rx_errors++; + writeb(~RCAR_CAN_EIFR_ORIF, &priv->regs->eifr); + if (skb) { + cf->can_id |= CAN_ERR_CRTL; + cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; + } + } + if (eifr & RCAR_CAN_EIFR_OLIF) { + netdev_dbg(priv->ndev, + "Overload Frame Transmission error interrupt\n"); + ndev->stats.rx_over_errors++; + ndev->stats.rx_errors++; + writeb(~RCAR_CAN_EIFR_OLIF, &priv->regs->eifr); + if (skb) { + cf->can_id |= CAN_ERR_PROT; + cf->data[2] |= CAN_ERR_PROT_OVERLOAD; + } + } + + if (skb) { + stats->rx_packets++; + stats->rx_bytes += cf->can_dlc; + netif_rx(skb); + } +} + +static void rcar_can_tx_done(struct net_device *ndev) +{ + struct rcar_can_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &ndev->stats; + u8 isr; + + while (1) { + u8 unsent = readb(&priv->regs->tfcr); + + unsent = (unsent & RCAR_CAN_TFCR_TFUST) >> + RCAR_CAN_TFCR_TFUST_SHIFT; + if (priv->tx_head - priv->tx_tail <= unsent) + break; + stats->tx_packets++; + stats->tx_bytes += priv->tx_dlc[priv->tx_tail % + RCAR_CAN_FIFO_DEPTH]; + priv->tx_dlc[priv->tx_tail % RCAR_CAN_FIFO_DEPTH] = 0; + can_get_echo_skb(ndev, priv->tx_tail % RCAR_CAN_FIFO_DEPTH); + priv->tx_tail++; + netif_wake_queue(ndev); + } + /* Clear interrupt */ + isr = readb(&priv->regs->isr); + writeb(isr & ~RCAR_CAN_ISR_TXFF, &priv->regs->isr); + can_led_event(ndev, CAN_LED_EVENT_TX); +} + +static irqreturn_t rcar_can_interrupt(int irq, void *dev_id) +{ + struct net_device *ndev = dev_id; + struct rcar_can_priv *priv = netdev_priv(ndev); + u8 isr; + + isr = readb(&priv->regs->isr); + if (!(isr & priv->ier)) + return IRQ_NONE; + + if (isr & RCAR_CAN_ISR_ERSF) + rcar_can_error(ndev); + + if (isr & RCAR_CAN_ISR_TXFF) + 
rcar_can_tx_done(ndev); + + if (isr & RCAR_CAN_ISR_RXFF) { + if (napi_schedule_prep(&priv->napi)) { + /* Disable Rx FIFO interrupts */ + priv->ier &= ~RCAR_CAN_IER_RXFIE; + writeb(priv->ier, &priv->regs->ier); + __napi_schedule(&priv->napi); + } + } + + return IRQ_HANDLED; +} + +static void rcar_can_set_bittiming(struct net_device *dev) +{ + struct rcar_can_priv *priv = netdev_priv(dev); + struct can_bittiming *bt = &priv->can.bittiming; + u32 bcr; + + bcr = RCAR_CAN_BCR_TSEG1(bt->phase_seg1 + bt->prop_seg - 1) | + RCAR_CAN_BCR_BPR(bt->brp - 1) | RCAR_CAN_BCR_SJW(bt->sjw - 1) | + RCAR_CAN_BCR_TSEG2(bt->phase_seg2 - 1); + /* Don't overwrite CLKR with 32-bit BCR access; CLKR has 8-bit access. + * All the registers are big-endian but they get byte-swapped on 32-bit + * read/write (but not on 8-bit, contrary to the manuals)... + */ + writel((bcr << 8) | priv->clock_select, &priv->regs->bcr); +} + +static void rcar_can_start(struct net_device *ndev) +{ + struct rcar_can_priv *priv = netdev_priv(ndev); + u16 ctlr; + int i; + + /* Set controller to known mode: + * - FIFO mailbox mode + * - accept all messages + * - overrun mode + * CAN is in sleep mode after MCU hardware or software reset. + */ + ctlr = readw(&priv->regs->ctlr); + ctlr &= ~RCAR_CAN_CTLR_SLPM; + writew(ctlr, &priv->regs->ctlr); + /* Go to reset mode */ + ctlr |= RCAR_CAN_CTLR_CANM_FORCE_RESET; + writew(ctlr, &priv->regs->ctlr); + for (i = 0; i < MAX_STR_READS; i++) { + if (readw(&priv->regs->str) & RCAR_CAN_STR_RSTST) + break; + } + rcar_can_set_bittiming(ndev); + ctlr |= RCAR_CAN_CTLR_IDFM_MIXED; /* Select mixed ID mode */ + ctlr |= RCAR_CAN_CTLR_BOM_ENT; /* Entry to halt mode automatically */ + /* at bus-off */ + ctlr |= RCAR_CAN_CTLR_MBM; /* Select FIFO mailbox mode */ + ctlr |= RCAR_CAN_CTLR_MLM; /* Overrun mode */ + writew(ctlr, &priv->regs->ctlr); + + /* Accept all SID and EID */ + writel(0, &priv->regs->mkr_2_9[6]); + writel(0, &priv->regs->mkr_2_9[7]); + /* In FIFO mailbox mode, write "0" to bits 24 to 31 */ + writel(0, &priv->regs->mkivlr1); + /* Accept all frames */ + writel(0, &priv->regs->fidcr[0]); + writel(RCAR_CAN_FIDCR_IDE | RCAR_CAN_FIDCR_RTR, &priv->regs->fidcr[1]); + /* Enable and configure FIFO mailbox interrupts */ + writel(RCAR_CAN_MIER1_RXFIE | RCAR_CAN_MIER1_TXFIE, &priv->regs->mier1); + + priv->ier = RCAR_CAN_IER_ERSIE | RCAR_CAN_IER_RXFIE | + RCAR_CAN_IER_TXFIE; + writeb(priv->ier, &priv->regs->ier); + + /* Accumulate error codes */ + writeb(RCAR_CAN_ECSR_EDPM, &priv->regs->ecsr); + /* Enable error interrupts */ + writeb(RCAR_CAN_EIER_EWIE | RCAR_CAN_EIER_EPIE | RCAR_CAN_EIER_BOEIE | + (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING ? 
+ RCAR_CAN_EIER_BEIE : 0) | RCAR_CAN_EIER_ORIE | + RCAR_CAN_EIER_OLIE, &priv->regs->eier); + priv->can.state = CAN_STATE_ERROR_ACTIVE; + + /* Go to operation mode */ + writew(ctlr & ~RCAR_CAN_CTLR_CANM, &priv->regs->ctlr); + for (i = 0; i < MAX_STR_READS; i++) { + if (!(readw(&priv->regs->str) & RCAR_CAN_STR_RSTST)) + break; + } + /* Enable Rx and Tx FIFO */ + writeb(RCAR_CAN_RFCR_RFE, &priv->regs->rfcr); + writeb(RCAR_CAN_TFCR_TFE, &priv->regs->tfcr); +} + +static int rcar_can_open(struct net_device *ndev) +{ + struct rcar_can_priv *priv = netdev_priv(ndev); + int err; + + err = clk_prepare_enable(priv->clk); + if (err) { + netdev_err(ndev, "clk_prepare_enable() failed, error %d\n", + err); + goto out; + } + err = open_candev(ndev); + if (err) { + netdev_err(ndev, "open_candev() failed, error %d\n", err); + goto out_clock; + } + napi_enable(&priv->napi); + err = request_irq(ndev->irq, rcar_can_interrupt, 0, ndev->name, ndev); + if (err) { + netdev_err(ndev, "error requesting interrupt %x\n", ndev->irq); + goto out_close; + } + can_led_event(ndev, CAN_LED_EVENT_OPEN); + rcar_can_start(ndev); + netif_start_queue(ndev); + return 0; +out_close: + napi_disable(&priv->napi); + close_candev(ndev); +out_clock: + clk_disable_unprepare(priv->clk); +out: + return err; +} + +static void rcar_can_stop(struct net_device *ndev) +{ + struct rcar_can_priv *priv = netdev_priv(ndev); + u16 ctlr; + int i; + + /* Go to (force) reset mode */ + ctlr = readw(&priv->regs->ctlr); + ctlr |= RCAR_CAN_CTLR_CANM_FORCE_RESET; + writew(ctlr, &priv->regs->ctlr); + for (i = 0; i < MAX_STR_READS; i++) { + if (readw(&priv->regs->str) & RCAR_CAN_STR_RSTST) + break; + } + writel(0, &priv->regs->mier0); + writel(0, &priv->regs->mier1); + writeb(0, &priv->regs->ier); + writeb(0, &priv->regs->eier); + /* Go to sleep mode */ + ctlr |= RCAR_CAN_CTLR_SLPM; + writew(ctlr, &priv->regs->ctlr); + priv->can.state = CAN_STATE_STOPPED; +} + +static int rcar_can_close(struct net_device *ndev) +{ + struct rcar_can_priv *priv = netdev_priv(ndev); + + netif_stop_queue(ndev); + rcar_can_stop(ndev); + free_irq(ndev->irq, ndev); + napi_disable(&priv->napi); + clk_disable_unprepare(priv->clk); + close_candev(ndev); + can_led_event(ndev, CAN_LED_EVENT_STOP); + return 0; +} + +static netdev_tx_t rcar_can_start_xmit(struct sk_buff *skb, + struct net_device *ndev) +{ + struct rcar_can_priv *priv = netdev_priv(ndev); + struct can_frame *cf = (struct can_frame *)skb->data; + u32 data, i; + + if (can_dropped_invalid_skb(ndev, skb)) + return NETDEV_TX_OK; + + if (cf->can_id & CAN_EFF_FLAG) /* Extended frame format */ + data = (cf->can_id & CAN_EFF_MASK) | RCAR_CAN_IDE; + else /* Standard frame format */ + data = (cf->can_id & CAN_SFF_MASK) << RCAR_CAN_SID_SHIFT; + + if (cf->can_id & CAN_RTR_FLAG) { /* Remote transmission request */ + data |= RCAR_CAN_RTR; + } else { + for (i = 0; i < cf->can_dlc; i++) + writeb(cf->data[i], + &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].data[i]); + } + + writel(data, &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].id); + + writeb(cf->can_dlc, &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].dlc); + + priv->tx_dlc[priv->tx_head % RCAR_CAN_FIFO_DEPTH] = cf->can_dlc; + can_put_echo_skb(skb, ndev, priv->tx_head % RCAR_CAN_FIFO_DEPTH); + priv->tx_head++; + /* Start Tx: write 0xff to the TFPCR register to increment + * the CPU-side pointer for the transmit FIFO to the next + * mailbox location + */ + writeb(0xff, &priv->regs->tfpcr); + /* Stop the queue if we've filled all FIFO entries */ + if (priv->tx_head - priv->tx_tail >= RCAR_CAN_FIFO_DEPTH) + 
netif_stop_queue(ndev); + + return NETDEV_TX_OK; +} + +static const struct net_device_ops rcar_can_netdev_ops = { + .ndo_open = rcar_can_open, + .ndo_stop = rcar_can_close, + .ndo_start_xmit = rcar_can_start_xmit, +}; + +static void rcar_can_rx_pkt(struct rcar_can_priv *priv) +{ + struct net_device_stats *stats = &priv->ndev->stats; + struct can_frame *cf; + struct sk_buff *skb; + u32 data; + u8 dlc; + + skb = alloc_can_skb(priv->ndev, &cf); + if (!skb) { + stats->rx_dropped++; + return; + } + + data = readl(&priv->regs->mb[RCAR_CAN_RX_FIFO_MBX].id); + if (data & RCAR_CAN_IDE) + cf->can_id = (data & CAN_EFF_MASK) | CAN_EFF_FLAG; + else + cf->can_id = (data >> RCAR_CAN_SID_SHIFT) & CAN_SFF_MASK; + + dlc = readb(&priv->regs->mb[RCAR_CAN_RX_FIFO_MBX].dlc); + cf->can_dlc = get_can_dlc(dlc); + if (data & RCAR_CAN_RTR) { + cf->can_id |= CAN_RTR_FLAG; + } else { + for (dlc = 0; dlc < cf->can_dlc; dlc++) + cf->data[dlc] = + readb(&priv->regs->mb[RCAR_CAN_RX_FIFO_MBX].data[dlc]); + } + + can_led_event(priv->ndev, CAN_LED_EVENT_RX); + + stats->rx_bytes += cf->can_dlc; + stats->rx_packets++; + netif_receive_skb(skb); +} + +static int rcar_can_rx_poll(struct napi_struct *napi, int quota) +{ + struct rcar_can_priv *priv = container_of(napi, + struct rcar_can_priv, napi); + int num_pkts; + + for (num_pkts = 0; num_pkts < quota; num_pkts++) { + u8 rfcr, isr; + + isr = readb(&priv->regs->isr); + /* Clear interrupt bit */ + if (isr & RCAR_CAN_ISR_RXFF) + writeb(isr & ~RCAR_CAN_ISR_RXFF, &priv->regs->isr); + rfcr = readb(&priv->regs->rfcr); + if (rfcr & RCAR_CAN_RFCR_RFEST) + break; + rcar_can_rx_pkt(priv); + /* Write 0xff to the RFPCR register to increment + * the CPU-side pointer for the receive FIFO + * to the next mailbox location + */ + writeb(0xff, &priv->regs->rfpcr); + } + /* All packets processed */ + if (num_pkts < quota) { + napi_complete(napi); + priv->ier |= RCAR_CAN_IER_RXFIE; + writeb(priv->ier, &priv->regs->ier); + } + return num_pkts; +} + +static int rcar_can_do_set_mode(struct net_device *ndev, enum can_mode mode) +{ + switch (mode) { + case CAN_MODE_START: + rcar_can_start(ndev); + netif_wake_queue(ndev); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int rcar_can_get_berr_counter(const struct net_device *dev, + struct can_berr_counter *bec) +{ + struct rcar_can_priv *priv = netdev_priv(dev); + int err; + + err = clk_prepare_enable(priv->clk); + if (err) + return err; + bec->txerr = readb(&priv->regs->tecr); + bec->rxerr = readb(&priv->regs->recr); + clk_disable_unprepare(priv->clk); + return 0; +} + +static int rcar_can_probe(struct platform_device *pdev) +{ + struct rcar_can_platform_data *pdata; + struct rcar_can_priv *priv; + struct net_device *ndev; + struct resource *mem; + void __iomem *addr; + int err = -ENODEV; + int irq; + + pdata = dev_get_platdata(&pdev->dev); + if (!pdata) { + dev_err(&pdev->dev, "No platform data provided!\n"); + goto fail; + } + + irq = platform_get_irq(pdev, 0); + if (!irq) { + dev_err(&pdev->dev, "No IRQ resource\n"); + goto fail; + } + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + addr = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(addr)) { + err = PTR_ERR(addr); + goto fail; + } + + ndev = alloc_candev(sizeof(struct rcar_can_priv), RCAR_CAN_FIFO_DEPTH); + if (!ndev) { + dev_err(&pdev->dev, "alloc_candev() failed\n"); + err = -ENOMEM; + goto fail; + } + + priv = netdev_priv(ndev); + + priv->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(priv->clk)) { + err = PTR_ERR(priv->clk); + dev_err(&pdev->dev, "cannot get 
clock: %d\n", err); + goto fail_clk; + } + + ndev->netdev_ops = &rcar_can_netdev_ops; + ndev->irq = irq; + ndev->flags |= IFF_ECHO; + priv->ndev = ndev; + priv->regs = addr; + priv->clock_select = pdata->clock_select; + priv->can.clock.freq = clk_get_rate(priv->clk); + priv->can.bittiming_const = &rcar_can_bittiming_const; + priv->can.do_set_mode = rcar_can_do_set_mode; + priv->can.do_get_berr_counter = rcar_can_get_berr_counter; + priv->can.ctrlmode_supported = CAN_CTRLMODE_BERR_REPORTING; + platform_set_drvdata(pdev, ndev); + SET_NETDEV_DEV(ndev, &pdev->dev); + + netif_napi_add(ndev, &priv->napi, rcar_can_rx_poll, + RCAR_CAN_NAPI_WEIGHT); + err = register_candev(ndev); + if (err) { + dev_err(&pdev->dev, "register_candev() failed, error %d\n", + err); + goto fail_candev; + } + + devm_can_led_init(ndev); + + dev_info(&pdev->dev, "device registered (reg_base=%p, irq=%u)\n", + priv->regs, ndev->irq); + + return 0; +fail_candev: + netif_napi_del(&priv->napi); +fail_clk: + free_candev(ndev); +fail: + return err; +} + +static int rcar_can_remove(struct platform_device *pdev) +{ + struct net_device *ndev = platform_get_drvdata(pdev); + struct rcar_can_priv *priv = netdev_priv(ndev); + + unregister_candev(ndev); + netif_napi_del(&priv->napi); + free_candev(ndev); + return 0; +} + +static int __maybe_unused rcar_can_suspend(struct device *dev) +{ + struct net_device *ndev = dev_get_drvdata(dev); + struct rcar_can_priv *priv = netdev_priv(ndev); + u16 ctlr; + + if (netif_running(ndev)) { + netif_stop_queue(ndev); + netif_device_detach(ndev); + } + ctlr = readw(&priv->regs->ctlr); + ctlr |= RCAR_CAN_CTLR_CANM_HALT; + writew(ctlr, &priv->regs->ctlr); + ctlr |= RCAR_CAN_CTLR_SLPM; + writew(ctlr, &priv->regs->ctlr); + priv->can.state = CAN_STATE_SLEEPING; + + clk_disable(priv->clk); + return 0; +} + +static int __maybe_unused rcar_can_resume(struct device *dev) +{ + struct net_device *ndev = dev_get_drvdata(dev); + struct rcar_can_priv *priv = netdev_priv(ndev); + u16 ctlr; + int err; + + err = clk_enable(priv->clk); + if (err) { + netdev_err(ndev, "clk_enable() failed, error %d\n", err); + return err; + } + + ctlr = readw(&priv->regs->ctlr); + ctlr &= ~RCAR_CAN_CTLR_SLPM; + writew(ctlr, &priv->regs->ctlr); + ctlr &= ~RCAR_CAN_CTLR_CANM; + writew(ctlr, &priv->regs->ctlr); + priv->can.state = CAN_STATE_ERROR_ACTIVE; + + if (netif_running(ndev)) { + netif_device_attach(ndev); + netif_start_queue(ndev); + } + return 0; +} + +static SIMPLE_DEV_PM_OPS(rcar_can_pm_ops, rcar_can_suspend, rcar_can_resume); + +static struct platform_driver rcar_can_driver = { + .driver = { + .name = RCAR_CAN_DRV_NAME, + .owner = THIS_MODULE, + .pm = &rcar_can_pm_ops, + }, + .probe = rcar_can_probe, + .remove = rcar_can_remove, +}; + +module_platform_driver(rcar_can_driver); + +MODULE_AUTHOR("Cogent Embedded, Inc."); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CAN driver for Renesas R-Car SoC"); +MODULE_ALIAS("platform:" RCAR_CAN_DRV_NAME); diff --git a/include/linux/can/platform/rcar_can.h b/include/linux/can/platform/rcar_can.h new file mode 100644 index 000000000000..0f4a2f3df504 --- /dev/null +++ b/include/linux/can/platform/rcar_can.h @@ -0,0 +1,17 @@ +#ifndef _CAN_PLATFORM_RCAR_CAN_H_ +#define _CAN_PLATFORM_RCAR_CAN_H_ + +#include + +/* Clock Select Register settings */ +enum CLKR { + CLKR_CLKP1 = 0, /* Peripheral clock (clkp1) */ + CLKR_CLKP2 = 1, /* Peripheral clock (clkp2) */ + CLKR_CLKEXT = 3 /* Externally input clock */ +}; + +struct rcar_can_platform_data { + enum CLKR clock_select; /* Clock source select */ +}; + 
+#endif /* !_CAN_PLATFORM_RCAR_CAN_H_ */ -- cgit v1.2.3-71-gd317 From 42193e3efb632c84d686acacd7b2327f2b1f8c63 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 15 May 2014 20:31:56 +0200 Subject: can: unify identifiers to ensure unique include processing Armin pointed me to the fact that the identifier which is used to ensure the unique include processing in linux/include/uapi/linux/can.h is CAN_H. This clashed with his own source, as identifiers in includes from libraries and APIs should start with an underscore '_'. This patch fixes the protection identifiers in all CAN-relevant includes. Reported-by: Armin Burchardt Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 6 +++--- include/linux/can/dev.h | 6 +++--- include/linux/can/led.h | 6 +++--- include/linux/can/platform/cc770.h | 6 +++--- include/linux/can/platform/mcp251x.h | 6 +++--- include/linux/can/platform/sja1000.h | 6 +++--- include/linux/can/platform/ti_hecc.h | 6 +++--- include/linux/can/skb.h | 6 +++--- include/uapi/linux/can.h | 6 +++--- include/uapi/linux/can/bcm.h | 6 +++--- include/uapi/linux/can/error.h | 6 +++--- include/uapi/linux/can/gw.h | 6 +++--- include/uapi/linux/can/netlink.h | 6 +++--- include/uapi/linux/can/raw.h | 6 +++--- 14 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index 78c6c52073ad..a0875001b13c 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -10,8 +10,8 @@ * */ -#ifndef CAN_CORE_H -#define CAN_CORE_H +#ifndef _CAN_CORE_H +#define _CAN_CORE_H #include #include @@ -58,4 +58,4 @@ extern void can_rx_unregister(struct net_device *dev, canid_t can_id, extern int can_send(struct sk_buff *skb, int loop); extern int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); -#endif /* CAN_CORE_H */ +#endif /* !_CAN_CORE_H */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 3ce5e526525f..6992afc6ba7f 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -10,8 +10,8 @@ * */ -#ifndef CAN_DEV_H -#define CAN_DEV_H +#ifndef _CAN_DEV_H +#define _CAN_DEV_H #include #include @@ -132,4 +132,4 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); -#endif /* CAN_DEV_H */ +#endif /* !_CAN_DEV_H */ diff --git a/include/linux/can/led.h b/include/linux/can/led.h index 9c1167baf273..e0475c5cbb92 100644 --- a/include/linux/can/led.h +++ b/include/linux/can/led.h @@ -6,8 +6,8 @@ * published by the Free Software Foundation. 
*/ -#ifndef CAN_LED_H -#define CAN_LED_H +#ifndef _CAN_LED_H +#define _CAN_LED_H #include #include @@ -48,4 +48,4 @@ static inline void can_led_notifier_exit(void) #endif -#endif +#endif /* !_CAN_LED_H */ diff --git a/include/linux/can/platform/cc770.h b/include/linux/can/platform/cc770.h index 7702641f87ee..78b2d44f04cf 100644 --- a/include/linux/can/platform/cc770.h +++ b/include/linux/can/platform/cc770.h @@ -1,5 +1,5 @@ -#ifndef _CAN_PLATFORM_CC770_H_ -#define _CAN_PLATFORM_CC770_H_ +#ifndef _CAN_PLATFORM_CC770_H +#define _CAN_PLATFORM_CC770_H /* CPU Interface Register (0x02) */ #define CPUIF_CEN 0x01 /* Clock Out Enable */ @@ -30,4 +30,4 @@ struct cc770_platform_data { u8 bcr; /* Bus Configuration Register */ }; -#endif /* !_CAN_PLATFORM_CC770_H_ */ +#endif /* !_CAN_PLATFORM_CC770_H */ diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h index dc029dba7a03..d44fcae274ff 100644 --- a/include/linux/can/platform/mcp251x.h +++ b/include/linux/can/platform/mcp251x.h @@ -1,5 +1,5 @@ -#ifndef __CAN_PLATFORM_MCP251X_H__ -#define __CAN_PLATFORM_MCP251X_H__ +#ifndef _CAN_PLATFORM_MCP251X_H +#define _CAN_PLATFORM_MCP251X_H /* * @@ -18,4 +18,4 @@ struct mcp251x_platform_data { unsigned long oscillator_frequency; }; -#endif /* __CAN_PLATFORM_MCP251X_H__ */ +#endif /* !_CAN_PLATFORM_MCP251X_H */ diff --git a/include/linux/can/platform/sja1000.h b/include/linux/can/platform/sja1000.h index 96f8fcc78d78..93570b61ec6c 100644 --- a/include/linux/can/platform/sja1000.h +++ b/include/linux/can/platform/sja1000.h @@ -1,5 +1,5 @@ -#ifndef _CAN_PLATFORM_SJA1000_H_ -#define _CAN_PLATFORM_SJA1000_H_ +#ifndef _CAN_PLATFORM_SJA1000_H +#define _CAN_PLATFORM_SJA1000_H /* clock divider register */ #define CDR_CLKOUT_MASK 0x07 @@ -32,4 +32,4 @@ struct sja1000_platform_data { u8 cdr; /* clock divider register */ }; -#endif /* !_CAN_PLATFORM_SJA1000_H_ */ +#endif /* !_CAN_PLATFORM_SJA1000_H */ diff --git a/include/linux/can/platform/ti_hecc.h b/include/linux/can/platform/ti_hecc.h index af17cb3f7a84..a52f47ca6c8a 100644 --- a/include/linux/can/platform/ti_hecc.h +++ b/include/linux/can/platform/ti_hecc.h @@ -1,5 +1,5 @@ -#ifndef __CAN_PLATFORM_TI_HECC_H__ -#define __CAN_PLATFORM_TI_HECC_H__ +#ifndef _CAN_PLATFORM_TI_HECC_H +#define _CAN_PLATFORM_TI_HECC_H /* * TI HECC (High End CAN Controller) driver platform header @@ -41,4 +41,4 @@ struct ti_hecc_platform_data { u32 version; void (*transceiver_switch) (int); }; -#endif +#endif /* !_CAN_PLATFORM_TI_HECC_H */ diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index f9bbbb472663..cc00d15c6107 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -7,8 +7,8 @@ * */ -#ifndef CAN_SKB_H -#define CAN_SKB_H +#ifndef _CAN_SKB_H +#define _CAN_SKB_H #include #include @@ -80,4 +80,4 @@ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) return skb; } -#endif /* CAN_SKB_H */ +#endif /* !_CAN_SKB_H */ diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 5d9d1d140718..41892f720057 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -42,8 +42,8 @@ * DAMAGE. 
*/ -#ifndef CAN_H -#define CAN_H +#ifndef _UAPI_CAN_H +#define _UAPI_CAN_H #include #include @@ -191,4 +191,4 @@ struct can_filter { #define CAN_INV_FILTER 0x20000000U /* to be set in can_filter.can_id */ -#endif /* CAN_H */ +#endif /* !_UAPI_CAN_H */ diff --git a/include/uapi/linux/can/bcm.h b/include/uapi/linux/can/bcm.h index 382251a1d214..89ddb9dc9bdf 100644 --- a/include/uapi/linux/can/bcm.h +++ b/include/uapi/linux/can/bcm.h @@ -41,8 +41,8 @@ * DAMAGE. */ -#ifndef CAN_BCM_H -#define CAN_BCM_H +#ifndef _UAPI_CAN_BCM_H +#define _UAPI_CAN_BCM_H #include #include @@ -95,4 +95,4 @@ enum { #define TX_RESET_MULTI_IDX 0x0200 #define RX_RTR_FRAME 0x0400 -#endif /* CAN_BCM_H */ +#endif /* !_UAPI_CAN_BCM_H */ diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h index b63204545320..c247446ab25a 100644 --- a/include/uapi/linux/can/error.h +++ b/include/uapi/linux/can/error.h @@ -41,8 +41,8 @@ * DAMAGE. */ -#ifndef CAN_ERROR_H -#define CAN_ERROR_H +#ifndef _UAPI_CAN_ERROR_H +#define _UAPI_CAN_ERROR_H #define CAN_ERR_DLC 8 /* dlc for error message frames */ @@ -120,4 +120,4 @@ /* controller specific additional information / data[5..7] */ -#endif /* CAN_ERROR_H */ +#endif /* _UAPI_CAN_ERROR_H */ diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h index 844c8964bdfe..3e6184cf2f6d 100644 --- a/include/uapi/linux/can/gw.h +++ b/include/uapi/linux/can/gw.h @@ -41,8 +41,8 @@ * DAMAGE. */ -#ifndef CAN_GW_H -#define CAN_GW_H +#ifndef _UAPI_CAN_GW_H +#define _UAPI_CAN_GW_H #include #include @@ -200,4 +200,4 @@ enum { * Beware of sending unpacked or aligned structs! */ -#endif +#endif /* !_UAPI_CAN_GW_H */ diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 7e2e1863db16..813d11f54977 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -15,8 +15,8 @@ * GNU General Public License for more details. */ -#ifndef CAN_NETLINK_H -#define CAN_NETLINK_H +#ifndef _UAPI_CAN_NETLINK_H +#define _UAPI_CAN_NETLINK_H #include @@ -130,4 +130,4 @@ enum { #define IFLA_CAN_MAX (__IFLA_CAN_MAX - 1) -#endif /* CAN_NETLINK_H */ +#endif /* !_UAPI_CAN_NETLINK_H */ diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index c7d8c334e0ce..78ec76fd89a6 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -42,8 +42,8 @@ * DAMAGE. */ -#ifndef CAN_RAW_H -#define CAN_RAW_H +#ifndef _UAPI_CAN_RAW_H +#define _UAPI_CAN_RAW_H #include @@ -59,4 +59,4 @@ enum { CAN_RAW_FD_FRAMES, /* allow CAN FD frames (default:off) */ }; -#endif +#endif /* !_UAPI_CAN_RAW_H */ -- cgit v1.2.3-71-gd317 From 5533e0114425dcdb878f11b291f2727af8667a7c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 19:33:07 -0400 Subject: cgroup: disallow debug controller on the default hierarchy The debug controller, as its name suggests, exposes cgroup core internals to userland to aid debugging. Unfortunately, except for the name, there's no provision to prevent its usage in production configurations and the controller is widely enabled and mounted leaking internal details to userland. Like most other debug information, the information exposed by debug isn't interesting even for debugging itself once the related parts are working reliably. This controller has no reason for existing. 
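Concretely, the suppression amounts to keeping one constant bitmask of inhibited controllers and testing it wherever the default hierarchy exposes or binds subsystems; a minimal sketch, where ss_usable_on_root() is a made-up helper name for illustration (the patch below open-codes the equivalent checks at each site):

	#include <linux/cgroup.h>

	/* Controllers that must not be used on the default hierarchy. */
	static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
	#ifdef CONFIG_CGROUP_DEBUG
		| (1 << debug_cgrp_id)
	#endif
		;

	/* Hypothetical helper: may subsystem @ssid be used on @root? */
	static bool ss_usable_on_root(struct cgroup_root *root, int ssid)
	{
		if (root == &cgrp_dfl_root &&
		    ((1 << ssid) & cgrp_dfl_root_inhibit_ss_mask))
			return false;	/* suppressed, e.g. the debug controller */
		return true;
	}
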
This patch implements cgrp_dfl_root_inhibit_ss_mask which can suppress specific subsystems on the default hierarchy and adds the debug subsystem to it so that it can be gradually deprecated as usages move towards the unified hierarchy. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ include/linux/cgroup_subsys.h | 11 +++++++---- kernel/cgroup.c | 21 ++++++++++++++++++--- 3 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4afe544d3547..8a111dd42d7a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -305,6 +305,8 @@ enum { * the flag is not created. * * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. */ CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 768fe44e19f0..98c4f9b12b03 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -7,10 +7,6 @@ SUBSYS(cpuset) #endif -#if IS_ENABLED(CONFIG_CGROUP_DEBUG) -SUBSYS(debug) -#endif - #if IS_ENABLED(CONFIG_CGROUP_SCHED) SUBSYS(cpu) #endif @@ -50,6 +46,13 @@ SUBSYS(net_prio) #if IS_ENABLED(CONFIG_CGROUP_HUGETLB) SUBSYS(hugetlb) #endif + +/* + * The following subsystems are not supported on the default hierarchy. + */ +#if IS_ENABLED(CONFIG_CGROUP_DEBUG) +SUBSYS(debug) +#endif /* * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 082bb842b11a..a5f75ac4e793 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -148,6 +148,13 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* some controllers are not supported in the default hierarchy */ +static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 +#ifdef CONFIG_CGROUP_DEBUG + | (1 << debug_cgrp_id) +#endif + ; + /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -1126,6 +1133,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; + unsigned int tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1143,7 +1151,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) return -EBUSY; } - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + /* skip creating root files on dfl_root for inhibited subsystems */ + tmp_ss_mask = ss_mask; + if (dst_root == &cgrp_dfl_root) + tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + + ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); if (ret) { if (dst_root != &cgrp_dfl_root) return ret; @@ -2426,7 +2439,8 @@ static int cgroup_root_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & + ~cgrp_dfl_root_inhibit_ss_mask); return 0; } @@ -2564,7 +2578,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { - if (ss->disabled || strcmp(tok + 1, ss->name)) + if (ss->disabled || strcmp(tok + 1, ss->name) || + ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) continue; if (*tok == '+') { -- cgit v1.2.3-71-gd317 From 4f4aa2ec24dc45881849833a439558d3a378028c Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sun, 18 May 2014 00:22:38 +0200 Subject: ssb: sprom: add dev_id field 
for value overriding standard ID Some devices may have different features despite sharing the same ID (e.g. PCI ID). For example 14e4:4331 is usually a dual band, but this can be "limited". Device with "pci/x/y/devid=0x4332" supports 2.4 GHz only. Similarly 0x4333 will mean support for 5 GHz only. Add an entry to the SPROM so the info described above can be extracted and stored. Signed-off-by: Rafał Miłecki Acked-by: Hauke Mehrtens Signed-off-by: John W. Linville --- arch/mips/bcm47xx/sprom.c | 1 + include/linux/ssb/ssb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/arch/mips/bcm47xx/sprom.c b/arch/mips/bcm47xx/sprom.c index a8b5408dd349..da4cdb16844e 100644 --- a/arch/mips/bcm47xx/sprom.c +++ b/arch/mips/bcm47xx/sprom.c @@ -168,6 +168,7 @@ static void nvram_read_alpha2(const char *prefix, const char *name, static void bcm47xx_fill_sprom_r1234589(struct ssb_sprom *sprom, const char *prefix, bool fallback) { + nvram_read_u16(prefix, NULL, "devid", &sprom->dev_id, 0, fallback); nvram_read_u8(prefix, NULL, "ledbh0", &sprom->gpio0, 0xff, fallback); nvram_read_u8(prefix, NULL, "ledbh1", &sprom->gpio1, 0xff, fallback); nvram_read_u8(prefix, NULL, "ledbh2", &sprom->gpio2, 0xff, fallback); diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 07ef9b82b66d..4568a5cc9ab8 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -33,6 +33,7 @@ struct ssb_sprom { u8 et1phyaddr; /* MII address for enet1 */ u8 et0mdcport; /* MDIO for enet0 */ u8 et1mdcport; /* MDIO for enet1 */ + u16 dev_id; /* Device ID overriding e.g. PCI ID */ u16 board_rev; /* Board revision number from SPROM. */ u16 board_num; /* Board number from SPROM. */ u16 board_type; /* Board type from SPROM. */ -- cgit v1.2.3-71-gd317 From 66507c7bc8895f0da6b4ad87e96d61a9f7d7a118 Mon Sep 17 00:00:00 2001 From: Kamal Dasu Date: Thu, 1 May 2014 20:51:19 -0400 Subject: mtd: nand: Add support to use nand_base poi databuf as bounce buffer nand_base can be passed kmap()'d buffers from highmem by filesystems like jffs2. This results in a failure to map the physical address of the DMA buffer on various controller drivers on different platforms. This change adds a chip option to use the preallocated databuf as a bounce buffer in nand_do_read_ops() and nand_do_write_ops(). This allows a specific NAND controller driver to set this option as needed. 
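For example, a controller driver whose DMA engine cannot address highmem pages would simply opt in when setting up its chip; a minimal sketch with a hypothetical driver name (the core then tests virt_addr_valid() per request and only bounces when necessary):

	#include <linux/mtd/nand.h>

	/* Hypothetical controller setup fragment: ask nand_base to bounce
	 * reads/writes through chip->buffers->databuf, because this DMA
	 * engine cannot map kmap()'d or vmalloc'd buffers. */
	static void foo_nand_init_chip(struct nand_chip *chip)
	{
		chip->options |= NAND_USE_BOUNCE_BUFFER;
	}
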
Signed-off-by: Kamal Dasu Signed-off-by: Brian Norris --- drivers/mtd/nand/nand_base.c | 37 +++++++++++++++++++++++++++++++------ include/linux/mtd/nand.h | 5 +++++ 2 files changed, 36 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 7853b9b0a05e..1b844b8c621f 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1500,6 +1501,7 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, mtd->oobavail : mtd->oobsize; uint8_t *bufpoi, *oob, *buf; + int use_bufpoi; unsigned int max_bitflips = 0; int retry_mode = 0; bool ecc_fail = false; @@ -1522,9 +1524,20 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, bytes = min(mtd->writesize - col, readlen); aligned = (bytes == mtd->writesize); + if (!aligned) + use_bufpoi = 1; + else if (chip->options & NAND_USE_BOUNCE_BUFFER) + use_bufpoi = !virt_addr_valid(buf); + else + use_bufpoi = 0; + /* Is the current page in the buffer? */ if (realpage != chip->pagebuf || oob) { - bufpoi = aligned ? buf : chip->buffers->databuf; + bufpoi = use_bufpoi ? chip->buffers->databuf : buf; + + if (use_bufpoi && aligned) + pr_debug("%s: using read bounce buffer for buf@%p\n", + __func__, buf); read_retry: chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page); @@ -1546,7 +1559,7 @@ read_retry: ret = chip->ecc.read_page(mtd, chip, bufpoi, oob_required, page); if (ret < 0) { - if (!aligned) + if (use_bufpoi) /* Invalidate page cache */ chip->pagebuf = -1; break; @@ -1555,7 +1568,7 @@ read_retry: max_bitflips = max_t(unsigned int, max_bitflips, ret); /* Transfer not aligned data */ - if (!aligned) { + if (use_bufpoi) { if (!NAND_HAS_SUBPAGE_READ(chip) && !oob && !(mtd->ecc_stats.failed - ecc_failures) && (ops->mode != MTD_OPS_RAW)) { @@ -2375,11 +2388,23 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, int bytes = mtd->writesize; int cached = writelen > bytes && page != blockmask; uint8_t *wbuf = buf; + int use_bufpoi; + int part_pagewr = (column || writelen < (mtd->writesize - 1)); + + if (part_pagewr) + use_bufpoi = 1; + else if (chip->options & NAND_USE_BOUNCE_BUFFER) + use_bufpoi = !virt_addr_valid(buf); + else + use_bufpoi = 0; - /* Partial page write? */ - if (unlikely(column || writelen < (mtd->writesize - 1))) { + /* Partial page write?, or need to use bounce buffer */ + if (use_bufpoi) { + pr_debug("%s: using write bounce buffer for buf@%p\n", + __func__, buf); cached = 0; - bytes = min_t(int, bytes - column, (int) writelen); + if (part_pagewr) + bytes = min_t(int, bytes - column, writelen); chip->pagebuf = -1; memset(chip->buffers->databuf, 0xff, mtd->writesize); memcpy(&chip->buffers->databuf[column], buf, bytes); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7a922e6c4e4b..2f0af2891f0f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -175,6 +175,11 @@ typedef enum { #define NAND_OWN_BUFFERS 0x00020000 /* Chip may not exist, so silence any errors in scan */ #define NAND_SCAN_SILENT_NODEV 0x00040000 +/* + * This option could be defined by controller drivers to protect against + * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers + */ +#define NAND_USE_BOUNCE_BUFFER 0x00080000 /* * Autodetect nand buswidth with readid/onfi. 
* This suppose the driver will configure the hardware in 8 bits mode -- cgit v1.2.3-71-gd317 From 27c9fd607587e6c3b517590df4cd35ac85f3d0bd Mon Sep 17 00:00:00 2001 From: pekon gupta Date: Mon, 19 May 2014 13:24:39 +0530 Subject: mtd: nand: omap: add support for BCH16_ECC - GPMC driver updates This patch adds support for BCH16_ECC in the GPMC (controller) driver: - extends configuration space to include BCH16 registers - extends parsing of DT binding for selecting BCH16 ecc-scheme Signed-off-by: Pekon Gupta Acked-by: Tony Lindgren Signed-off-by: Brian Norris --- arch/arm/mach-omap2/gpmc.c | 15 +++++++++++++++ include/linux/platform_data/mtd-nand-omap2.h | 5 +++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/gpmc.c b/arch/arm/mach-omap2/gpmc.c index ab43755364f5..9b27773db040 100644 --- a/arch/arm/mach-omap2/gpmc.c +++ b/arch/arm/mach-omap2/gpmc.c @@ -68,6 +68,9 @@ #define GPMC_ECC_BCH_RESULT_1 0x244 /* not available on OMAP2 */ #define GPMC_ECC_BCH_RESULT_2 0x248 /* not available on OMAP2 */ #define GPMC_ECC_BCH_RESULT_3 0x24c /* not available on OMAP2 */ +#define GPMC_ECC_BCH_RESULT_4 0x300 /* not available on OMAP2 */ +#define GPMC_ECC_BCH_RESULT_5 0x304 /* not available on OMAP2 */ +#define GPMC_ECC_BCH_RESULT_6 0x308 /* not available on OMAP2 */ /* GPMC ECC control settings */ #define GPMC_ECC_CTRL_ECCCLEAR 0x100 @@ -666,6 +669,12 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) GPMC_BCH_SIZE * i; reg->gpmc_bch_result3[i] = gpmc_base + GPMC_ECC_BCH_RESULT_3 + GPMC_BCH_SIZE * i; + reg->gpmc_bch_result4[i] = gpmc_base + GPMC_ECC_BCH_RESULT_4 + + i * GPMC_BCH_SIZE; + reg->gpmc_bch_result5[i] = gpmc_base + GPMC_ECC_BCH_RESULT_5 + + i * GPMC_BCH_SIZE; + reg->gpmc_bch_result6[i] = gpmc_base + GPMC_ECC_BCH_RESULT_6 + + i * GPMC_BCH_SIZE; } } @@ -1401,6 +1410,12 @@ static int gpmc_probe_nand_child(struct platform_device *pdev, else gpmc_nand_data->ecc_opt = OMAP_ECC_BCH8_CODE_HW_DETECTION_SW; + else if (!strcmp(s, "bch16")) + if (gpmc_nand_data->elm_of_node) + gpmc_nand_data->ecc_opt = + OMAP_ECC_BCH16_CODE_HW; + else + pr_err("%s: BCH16 requires ELM support\n", __func__); else pr_err("%s: ti,nand-ecc-opt invalid value\n", __func__); diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h index 3e9dd6676b97..660c029d694f 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -31,6 +31,8 @@ enum omap_ecc { OMAP_ECC_BCH8_CODE_HW_DETECTION_SW, /* 8-bit ECC calculation by GPMC, Error detection by ELM */ OMAP_ECC_BCH8_CODE_HW, + /* 16-bit ECC calculation by GPMC, Error detection by ELM */ + OMAP_ECC_BCH16_CODE_HW, }; struct gpmc_nand_regs { @@ -50,6 +52,9 @@ struct gpmc_nand_regs { void __iomem *gpmc_bch_result1[GPMC_BCH_NUM_REMAINDER]; void __iomem *gpmc_bch_result2[GPMC_BCH_NUM_REMAINDER]; void __iomem *gpmc_bch_result3[GPMC_BCH_NUM_REMAINDER]; + void __iomem *gpmc_bch_result4[GPMC_BCH_NUM_REMAINDER]; + void __iomem *gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER]; + void __iomem *gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER]; }; struct omap_nand_platform_data { -- cgit v1.2.3-71-gd317 From 2be589e4b28457f148640dc6addf6da24af64b7f Mon Sep 17 00:00:00 2001 From: pekon gupta Date: Mon, 19 May 2014 13:24:40 +0530 Subject: mtd: nand: omap: add support for BCH16_ECC - ELM driver updates The ELM hardware engine is used to detect ECC errors for BCHx ecc-schemes (like BCH4/BCH8/BCH16). 
This patch extends the configuration of ELM registers to add support for the BCH16_HW ecc-scheme. Signed-off-by: Pekon Gupta Signed-off-by: Brian Norris --- drivers/mtd/devices/elm.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/platform_data/elm.h | 3 ++- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/elm.c b/drivers/mtd/devices/elm.c index 0a037b15c11b..7df86948e6d4 100644 --- a/drivers/mtd/devices/elm.c +++ b/drivers/mtd/devices/elm.c @@ -213,6 +213,28 @@ static void elm_load_syndrome(struct elm_info *info, val = cpu_to_be32(*(u32 *) &ecc[0]) >> 12; elm_write_reg(info, offset, val); break; + case BCH16_ECC: + val = cpu_to_be32(*(u32 *) &ecc[22]); + elm_write_reg(info, offset, val); + offset += 4; + val = cpu_to_be32(*(u32 *) &ecc[18]); + elm_write_reg(info, offset, val); + offset += 4; + val = cpu_to_be32(*(u32 *) &ecc[14]); + elm_write_reg(info, offset, val); + offset += 4; + val = cpu_to_be32(*(u32 *) &ecc[10]); + elm_write_reg(info, offset, val); + offset += 4; + val = cpu_to_be32(*(u32 *) &ecc[6]); + elm_write_reg(info, offset, val); + offset += 4; + val = cpu_to_be32(*(u32 *) &ecc[2]); + elm_write_reg(info, offset, val); + offset += 4; + val = cpu_to_be32(*(u32 *) &ecc[0]) >> 16; + elm_write_reg(info, offset, val); + break; default: pr_err("invalid config bch_type\n"); } @@ -436,6 +458,13 @@ static int elm_context_save(struct elm_info *info) for (i = 0; i < ERROR_VECTOR_MAX; i++) { offset = i * SYNDROME_FRAGMENT_REG_SIZE; switch (bch_type) { + case BCH16_ECC: + regs->elm_syndrome_fragment_6[i] = elm_read_reg(info, + ELM_SYNDROME_FRAGMENT_6 + offset); + regs->elm_syndrome_fragment_5[i] = elm_read_reg(info, + ELM_SYNDROME_FRAGMENT_5 + offset); + regs->elm_syndrome_fragment_4[i] = elm_read_reg(info, + ELM_SYNDROME_FRAGMENT_4 + offset); case BCH8_ECC: regs->elm_syndrome_fragment_3[i] = elm_read_reg(info, ELM_SYNDROME_FRAGMENT_3 + offset); @@ -474,6 +503,13 @@ static int elm_context_restore(struct elm_info *info) for (i = 0; i < ERROR_VECTOR_MAX; i++) { offset = i * SYNDROME_FRAGMENT_REG_SIZE; switch (bch_type) { + case BCH16_ECC: + elm_write_reg(info, ELM_SYNDROME_FRAGMENT_6 + offset, + regs->elm_syndrome_fragment_6[i]); + elm_write_reg(info, ELM_SYNDROME_FRAGMENT_5 + offset, + regs->elm_syndrome_fragment_5[i]); + elm_write_reg(info, ELM_SYNDROME_FRAGMENT_4 + offset, + regs->elm_syndrome_fragment_4[i]); case BCH8_ECC: elm_write_reg(info, ELM_SYNDROME_FRAGMENT_3 + offset, regs->elm_syndrome_fragment_3[i]); diff --git a/include/linux/platform_data/elm.h b/include/linux/platform_data/elm.h index 4edb40676b3f..780d1e97f620 100644 --- a/include/linux/platform_data/elm.h +++ b/include/linux/platform_data/elm.h @@ -21,6 +21,7 @@ enum bch_ecc { BCH4_ECC = 0, BCH8_ECC, + BCH16_ECC, }; /* ELM support 8 error syndrome process */ @@ -38,7 +39,7 @@ struct elm_errorvec { bool error_reported; bool error_uncorrectable; int error_count; - int error_loc[ERROR_VECTOR_MAX]; + int error_loc[16]; }; void elm_decode_bch_error_page(struct device *dev, u8 *ecc_calc, -- cgit v1.2.3-71-gd317 From 7d10d2610cc02d432168ca0c5d964cd9e85c1b06 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Mon, 19 May 2014 09:21:09 +0200 Subject: net: cdc_ncm: fix 64bit division build error The upper timer_interval limit is arbitrary and much higher than anything usable in the real world. 
Reducing it from 15s to ~4s to make the timer_interval fit in a u32 does not make much difference. The limit is still outside the practical bounds. This eliminates the need for a 64bit timer_interval, fixing a build error related to 64bit division: drivers/built-in.o: In function `cdc_ncm_get_coalesce': ak8975.c:(.text+0x1ac994): undefined reference to `__aeabi_uldivmod' Reported-by: Stephen Rothwell Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 4 ++-- include/linux/usb/cdc_ncm.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 2d0caf1eea25..ad2a386a6e92 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -138,7 +138,7 @@ static int cdc_ncm_get_coalesce(struct net_device *netdev, ec->tx_max_coalesced_frames = ctx->tx_max / ctx->max_datagram_size; /* the timer will fire CDC_NCM_TIMER_PENDING_CNT times in a row */ - ec->tx_coalesce_usecs = (ctx->timer_interval * CDC_NCM_TIMER_PENDING_CNT) / NSEC_PER_USEC; + ec->tx_coalesce_usecs = ctx->timer_interval / (NSEC_PER_USEC / CDC_NCM_TIMER_PENDING_CNT); return 0; } @@ -164,7 +164,7 @@ static int cdc_ncm_set_coalesce(struct net_device *netdev, return -EINVAL; spin_lock_bh(&ctx->mtx); - ctx->timer_interval = ec->tx_coalesce_usecs * NSEC_PER_USEC / CDC_NCM_TIMER_PENDING_CNT; + ctx->timer_interval = ec->tx_coalesce_usecs * (NSEC_PER_USEC / CDC_NCM_TIMER_PENDING_CNT); if (!ctx->timer_interval) ctx->tx_timer_pending = 0; spin_unlock_bh(&ctx->mtx); diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 8c5e38819828..7c9b484735c5 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -78,7 +78,7 @@ #define CDC_NCM_TIMER_PENDING_CNT 2 #define CDC_NCM_TIMER_INTERVAL_USEC 400UL #define CDC_NCM_TIMER_INTERVAL_MIN 5UL -#define CDC_NCM_TIMER_INTERVAL_MAX (15UL * USEC_PER_SEC) +#define CDC_NCM_TIMER_INTERVAL_MAX (U32_MAX / NSEC_PER_USEC) #define cdc_ncm_comm_intf_is_mbim(x) ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \ (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE) @@ -104,7 +104,7 @@ struct cdc_ncm_ctx { spinlock_t mtx; atomic_t stop; - u64 timer_interval; + u32 timer_interval; u32 max_ndp_size; u32 tx_timer_pending; -- cgit v1.2.3-71-gd317 From 3796ce1d4d4b330a75005c5eda105603ce9d4071 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 19 May 2014 22:42:32 +0200 Subject: pwm: add period and polarity to struct pwm_lookup Add period and polarity members to struct pwm_lookup so that platforms using the lookup table can be treated the same way as those using the device tree. 
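A board file can then fill the two new members directly; a minimal sketch using designated initializers (the provider and consumer names are illustrative, borrowed from Documentation/pwm.txt, and a later patch in this series extends PWM_LOOKUP itself to take the same values):

	#include <linux/pwm.h>

	static struct pwm_lookup board_pwm_lookup[] = {
		{
			.provider = "tegra-pwm",
			.index    = 0,
			.dev_id   = "pwm-backlight",
			.con_id   = NULL,
			.period   = 50000,	/* ns */
			.polarity = PWM_POLARITY_NORMAL,
		},
	};

	static void __init board_init(void)
	{
		pwm_add_table(board_pwm_lookup, ARRAY_SIZE(board_pwm_lookup));
	}
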
Signed-off-by: Alexandre Belloni Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 8 +++++++- include/linux/pwm.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index a80471399c20..4b66bf09ee55 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -661,10 +661,16 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) } } + mutex_unlock(&pwm_lookup_lock); + if (chip) pwm = pwm_request_from_chip(chip, index, con_id ?: dev_id); + if (IS_ERR(pwm)) + return pwm; + + pwm_set_period(pwm, p->period); + pwm_set_polarity(pwm, p->polarity); - mutex_unlock(&pwm_lookup_lock); return pwm; } diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 4717f54051cb..2f45e2fe5b93 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -274,6 +274,8 @@ struct pwm_lookup { unsigned int index; const char *dev_id; const char *con_id; + unsigned int period; + enum pwm_polarity polarity; }; #define PWM_LOOKUP(_provider, _index, _dev_id, _con_id) \ -- cgit v1.2.3-71-gd317 From dc671157139918eaf61f73db1bd6dd02960b66e2 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 19 May 2014 22:42:34 +0200 Subject: pwm: renesas-tpu: remove unused struct tpu_pwm_platform_data The struct is not used anymore and the polarity initialization will be done using the PWM lookup table (or device tree). Signed-off-by: Alexandre Belloni Acked-by: Laurent Pinchart Acked-by: Simon Horman Signed-off-by: Thierry Reding --- drivers/pwm/pwm-renesas-tpu.c | 19 +++---------------- include/linux/platform_data/pwm-renesas-tpu.h | 16 ---------------- 2 files changed, 3 insertions(+), 32 deletions(-) delete mode 100644 include/linux/platform_data/pwm-renesas-tpu.h (limited to 'include/linux') diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index cc13ff4cfab0..3b71b42e89d5 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -21,13 +21,14 @@ #include #include #include -#include #include #include #include #include #include +#define TPU_CHANNEL_MAX 4 + #define TPU_TSTR 0x00 /* Timer start register (shared) */ #define TPU_TCRn 0x00 /* Timer control register */ @@ -87,7 +88,6 @@ struct tpu_pwm_device { struct tpu_device { struct platform_device *pdev; - enum pwm_polarity polarities[TPU_CHANNEL_MAX]; struct pwm_chip chip; spinlock_t lock; @@ -229,7 +229,7 @@ static int tpu_pwm_request(struct pwm_chip *chip, struct pwm_device *_pwm) pwm->tpu = tpu; pwm->channel = _pwm->hwpwm; - pwm->polarity = tpu->polarities[pwm->channel]; + pwm->polarity = PWM_POLARITY_NORMAL; pwm->prescaler = 0; pwm->period = 0; pwm->duty = 0; @@ -388,16 +388,6 @@ static const struct pwm_ops tpu_pwm_ops = { * Probe and remove */ -static void tpu_parse_pdata(struct tpu_device *tpu) -{ - struct tpu_pwm_platform_data *pdata = tpu->pdev->dev.platform_data; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(tpu->polarities); ++i) - tpu->polarities[i] = pdata ? pdata->channels[i].polarity - : PWM_POLARITY_NORMAL; -} - static int tpu_probe(struct platform_device *pdev) { struct tpu_device *tpu; @@ -411,9 +401,6 @@ static int tpu_probe(struct platform_device *pdev) spin_lock_init(&tpu->lock); tpu->pdev = pdev; - /* Initialize device configuration from platform data. */ - tpu_parse_pdata(tpu); - /* Map memory, get clock and pin control. 
*/ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); tpu->base = devm_ioremap_resource(&pdev->dev, res); diff --git a/include/linux/platform_data/pwm-renesas-tpu.h b/include/linux/platform_data/pwm-renesas-tpu.h deleted file mode 100644 index a7220b10ddab..000000000000 --- a/include/linux/platform_data/pwm-renesas-tpu.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __PWM_RENESAS_TPU_H__ -#define __PWM_RENESAS_TPU_H__ - -#include - -#define TPU_CHANNEL_MAX 4 - -struct tpu_pwm_channel_data { - enum pwm_polarity polarity; -}; - -struct tpu_pwm_platform_data { - struct tpu_pwm_channel_data channels[TPU_CHANNEL_MAX]; -}; - -#endif /* __PWM_RENESAS_TPU_H__ */ -- cgit v1.2.3-71-gd317 From 4284402924cc55e182008ca7e9d4fb1e891ff5ae Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 19 May 2014 22:42:37 +0200 Subject: pwm: modify PWM_LOOKUP to initialize all struct pwm_lookup members Now that PWM_LOOKUP is not used anymore, modify it to initialize all the members of struct pwm_lookup. Signed-off-by: Alexandre Belloni Signed-off-by: Thierry Reding --- Documentation/pwm.txt | 3 ++- include/linux/pwm.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/pwm.txt b/Documentation/pwm.txt index 0527f615b115..ca895fd211e4 100644 --- a/Documentation/pwm.txt +++ b/Documentation/pwm.txt @@ -19,7 +19,8 @@ should instead register a static mapping that can be used to match PWM consumers to providers, as given in the following example: static struct pwm_lookup board_pwm_lookup[] = { - PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL), + PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL, + 50000, PWM_POLARITY_NORMAL), }; static void __init board_init(void) diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 2f45e2fe5b93..e90628cac8fa 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -278,12 +278,14 @@ struct pwm_lookup { enum pwm_polarity polarity; }; -#define PWM_LOOKUP(_provider, _index, _dev_id, _con_id) \ +#define PWM_LOOKUP(_provider, _index, _dev_id, _con_id, _period, _polarity) \ { \ .provider = _provider, \ .index = _index, \ .dev_id = _dev_id, \ .con_id = _con_id, \ + .period = _period, \ + .polarity = _polarity \ } #if IS_ENABLED(CONFIG_PWM) -- cgit v1.2.3-71-gd317 From 5b3e507820c6e120bc2680c0d35f9d9d81fcb98d Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 14 May 2014 14:58:08 -0300 Subject: mtd: nand: pxa3xx: Use ECC strength and step size devicetree binding This commit adds support for the user to specify the ECC strength and step size through the devicetree. We keep the previous behavior when no DT parameter is provided. 
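On a legacy (non-DT) platform the same two knobs are available through platform data; a small sketch with made-up values (leaving both fields at 0 keeps the ONFI/ID-derived defaults):

	#include <linux/platform_data/mtd-nand-pxa3xx.h>

	/* Hypothetical board file: request 4-bit ECC per 512-byte step. */
	static struct pxa3xx_nand_platform_data board_nand_pdata = {
		.ecc_strength  = 4,
		.ecc_step_size = 512,
	};
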
Signed-off-by: Ezequiel Garcia Signed-off-by: Brian Norris --- drivers/mtd/nand/pxa3xx_nand.c | 17 +++++++++++++++-- include/linux/platform_data/mtd-nand-pxa3xx.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index 3b66a6460d67..2a9add06c2d5 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -1519,8 +1519,13 @@ KEEP_CONFIG: } } - ecc_strength = chip->ecc_strength_ds; - ecc_step = chip->ecc_step_ds; + if (pdata->ecc_strength && pdata->ecc_step_size) { + ecc_strength = pdata->ecc_strength; + ecc_step = pdata->ecc_step_size; + } else { + ecc_strength = chip->ecc_strength_ds; + ecc_step = chip->ecc_step_ds; + } /* Set default ECC strength requirements on non-ONFI devices */ if (ecc_strength < 1 && ecc_step < 1) { @@ -1729,6 +1734,14 @@ static int pxa3xx_nand_probe_dt(struct platform_device *pdev) of_property_read_u32(np, "num-cs", &pdata->num_cs); pdata->flash_bbt = of_get_nand_on_flash_bbt(np); + pdata->ecc_strength = of_get_nand_ecc_strength(np); + if (pdata->ecc_strength < 0) + pdata->ecc_strength = 0; + + pdata->ecc_step_size = of_get_nand_ecc_step_size(np); + if (pdata->ecc_step_size < 0) + pdata->ecc_step_size = 0; + pdev->dev.platform_data = pdata; return 0; diff --git a/include/linux/platform_data/mtd-nand-pxa3xx.h b/include/linux/platform_data/mtd-nand-pxa3xx.h index a94147124929..ac4ea2e641c7 100644 --- a/include/linux/platform_data/mtd-nand-pxa3xx.h +++ b/include/linux/platform_data/mtd-nand-pxa3xx.h @@ -58,6 +58,9 @@ struct pxa3xx_nand_platform_data { /* use an flash-based bad block table */ bool flash_bbt; + /* requested ECC strength and ECC step size */ + int ecc_strength, ecc_step_size; + const struct mtd_partition *parts[NUM_CHIP_SELECT]; unsigned int nr_parts[NUM_CHIP_SELECT]; -- cgit v1.2.3-71-gd317 From 5fe821a9dee241fa450703ab7015d970ee0cfb8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 19 May 2014 14:56:14 -0700 Subject: net: filter: cleanup invocation of internal BPF Kernel API for classic BPF socket filters is: sk_unattached_filter_create() - validate classic BPF, convert, JIT SK_RUN_FILTER() - run it sk_unattached_filter_destroy() - destroy socket filter Cleanup internal BPF kernel API as following: sk_filter_select_runtime() - final step of internal BPF creation. Try to JIT internal BPF program, if JIT is not available select interpreter SK_RUN_FILTER() - run it sk_filter_free() - free internal BPF program Disallow direct calls to BPF interpreter. Execution of the BPF program should be done with SK_RUN_FILTER() macro. Example of internal BPF create, run, destroy: struct sk_filter *fp; fp = kzalloc(sk_filter_size(prog_len), GFP_KERNEL); memcpy(fp->insni, prog, prog_len * sizeof(fp->insni[0])); fp->len = prog_len; sk_filter_select_runtime(fp); SK_RUN_FILTER(fp, ctx); sk_filter_free(fp); Sockets, seccomp, testsuite, tracing are using different ways to populate sk_filter, so first steps of program creation are not common. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/filter.h | 6 ++---- kernel/seccomp.c | 6 ++---- lib/test_bpf.c | 4 ++-- net/core/filter.c | 44 ++++++++++++++++++++++++++++---------------- 4 files changed, 34 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9d5ae0a2c954..7977b3958e25 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -184,10 +184,8 @@ static inline unsigned int sk_filter_size(unsigned int proglen) int sk_filter(struct sock *sk, struct sk_buff *skb); -u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, - const struct sock_filter_int *insni); -u32 sk_run_filter_int_skb(const struct sk_buff *ctx, - const struct sock_filter_int *insni); +void sk_filter_select_runtime(struct sk_filter *fp); +void sk_filter_free(struct sk_filter *fp); int sk_convert_filter(struct sock_filter *prog, int len, struct sock_filter_int *new_prog, int *new_len); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7e02d624cc50..1036b6f2fded 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -273,10 +273,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - filter->prog->bpf_func = (void *)sk_run_filter_int_seccomp; - /* JIT internal BPF into native HW instructions */ - bpf_int_jit_compile(filter->prog); + sk_filter_select_runtime(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -340,7 +338,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - bpf_jit_free(freeme->prog); + sk_filter_free(freeme->prog); kfree(freeme); } } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 3603ebcd5d65..e160934430eb 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1489,7 +1489,7 @@ static __init int test_bpf(void) memcpy(fp_ext->insns, tests[i].insns_int, fprog.len * 8); fp->len = fprog.len; - fp->bpf_func = sk_run_filter_int_skb; + sk_filter_select_runtime(fp); } else { err = sk_unattached_filter_create(&fp, &fprog); if (tests[i].data_type == EXPECTED_FAIL) { @@ -1516,7 +1516,7 @@ static __init int test_bpf(void) if (tests[i].data_type != SKB_INT) sk_unattached_filter_destroy(fp); else - kfree(fp); + sk_filter_free(fp); if (err) { pr_cont("FAIL %d\n", err); diff --git a/net/core/filter.c b/net/core/filter.c index 32c5b44c537e..7067cb240d3e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -153,7 +153,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * keep, 0 for none. @ctx is the data we are operating on, @insn is the * array of filter instructions. */ -unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; @@ -571,15 +571,6 @@ load_byte: return 0; } -u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, - const struct sock_filter_int *insni) - __attribute__ ((alias ("__sk_run_filter"))); - -u32 sk_run_filter_int_skb(const struct sk_buff *ctx, - const struct sock_filter_int *insni) - __attribute__ ((alias ("__sk_run_filter"))); -EXPORT_SYMBOL_GPL(sk_run_filter_int_skb); - /* Helper to find the offset of pkt_type in sk_buff structure. We want * to make sure its still a 3bit field starting at a byte boundary; * taken from arch/x86/net/bpf_jit_comp.c. 
@@ -1397,7 +1388,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); sk_release_orig_filter(fp); - bpf_jit_free(fp); + sk_filter_free(fp); } /** @@ -1497,7 +1488,6 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, goto out_err_free; } - fp->bpf_func = sk_run_filter_int_skb; fp->len = new_len; /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ @@ -1510,6 +1500,8 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, */ goto out_err_free; + sk_filter_select_runtime(fp); + kfree(old_prog); return fp; @@ -1528,6 +1520,29 @@ void __weak bpf_int_jit_compile(struct sk_filter *prog) { } +/** + * sk_filter_select_runtime - select execution runtime for BPF program + * @fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ + fp->bpf_func = (void *) __sk_run_filter; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp) +{ + bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_free); + static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, struct sock *sk) { @@ -1548,12 +1563,9 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, /* JIT compiler couldn't process this filter, so do the * internal BPF translation for the optimized interpreter. */ - if (!fp->jited) { + if (!fp->jited) fp = __sk_migrate_filter(fp, sk); - /* Probe if internal BPF can be jit-ed */ - bpf_int_jit_compile(fp); - } return fp; } -- cgit v1.2.3-71-gd317 From ba730340f96c01160b5f26f81e8fb38f8cb1821c Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Thu, 15 May 2014 18:15:31 +0400 Subject: dmaengine: fix comment typo Fix comment typo. Signed-off-by: Alexander Popov Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8300fb87b84a..cbb168e04dc1 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -292,7 +292,7 @@ struct dma_chan_dev { }; /** - * enum dma_slave_buswidth - defines bus with of the DMA slave + * enum dma_slave_buswidth - defines bus width of the DMA slave * device, source or target buses */ enum dma_slave_buswidth { -- cgit v1.2.3-71-gd317 From bf3b5ec66bd03d66e9ea729aaca013ea1047a797 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 12:55:30 +0100 Subject: mmc: sdio_irq: rework sdio irq handling Rather than the SDIO support spawning its own thread for handling card interrupts, use the generic IRQ infrastructure for this, triggering it from the host interface's interrupt handling directly. This avoids a race between the parent thread waiting to receive an interrupt response from the card and the slow startup of the sdio irq thread, which can occur as a result of high system load (e.g. while udev is running). 
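With the cap and helper added below, a host driver keeps SDIO dispatch in its own threaded handler; a minimal sketch (the foo_* name is hypothetical, and the driver would also set mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD at probe time):

	#include <linux/interrupt.h>
	#include <linux/mmc/host.h>

	/* Hypothetical host driver: instead of letting the core spawn
	 * ksdioirqd, run the card handlers from our own IRQ thread. */
	static irqreturn_t foo_host_thread_irq(int irq, void *dev_id)
	{
		struct mmc_host *mmc = dev_id;

		sdio_run_irqs(mmc);	/* claims the host, dispatches, releases */
		return IRQ_HANDLED;
	}
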
Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren [Ulf Hansson] Resolved conflict Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/core/sdio_irq.c | 41 +++++++++++++++++++++++++++++++---------- include/linux/mmc/host.h | 3 +++ 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c index aaa90460ed23..5cc13c8d35bb 100644 --- a/drivers/mmc/core/sdio_irq.c +++ b/drivers/mmc/core/sdio_irq.c @@ -90,6 +90,15 @@ static int process_sdio_pending_irqs(struct mmc_host *host) return ret; } +void sdio_run_irqs(struct mmc_host *host) +{ + mmc_claim_host(host); + host->sdio_irq_pending = true; + process_sdio_pending_irqs(host); + mmc_release_host(host); +} +EXPORT_SYMBOL_GPL(sdio_run_irqs); + static int sdio_irq_thread(void *_host) { struct mmc_host *host = _host; @@ -189,14 +198,20 @@ static int sdio_card_irq_get(struct mmc_card *card) WARN_ON(!host->claimed); if (!host->sdio_irqs++) { - atomic_set(&host->sdio_irq_thread_abort, 0); - host->sdio_irq_thread = - kthread_run(sdio_irq_thread, host, "ksdioirqd/%s", - mmc_hostname(host)); - if (IS_ERR(host->sdio_irq_thread)) { - int err = PTR_ERR(host->sdio_irq_thread); - host->sdio_irqs--; - return err; + if (!(host->caps2 & MMC_CAP2_SDIO_IRQ_NOTHREAD)) { + atomic_set(&host->sdio_irq_thread_abort, 0); + host->sdio_irq_thread = + kthread_run(sdio_irq_thread, host, + "ksdioirqd/%s", mmc_hostname(host)); + if (IS_ERR(host->sdio_irq_thread)) { + int err = PTR_ERR(host->sdio_irq_thread); + host->sdio_irqs--; + return err; + } + } else { + mmc_host_clk_hold(host); + host->ops->enable_sdio_irq(host, 1); + mmc_host_clk_release(host); } } @@ -211,8 +226,14 @@ static int sdio_card_irq_put(struct mmc_card *card) BUG_ON(host->sdio_irqs < 1); if (!--host->sdio_irqs) { - atomic_set(&host->sdio_irq_thread_abort, 1); - kthread_stop(host->sdio_irq_thread); + if (!(host->caps2 & MMC_CAP2_SDIO_IRQ_NOTHREAD)) { + atomic_set(&host->sdio_irq_thread_abort, 1); + kthread_stop(host->sdio_irq_thread); + } else { + mmc_host_clk_hold(host); + host->ops->enable_sdio_irq(host, 0); + mmc_host_clk_release(host); + } } return 0; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index cd595275e118..7960424d0bc0 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -282,6 +282,7 @@ struct mmc_host { #define MMC_CAP2_HS400_1_2V (1 << 16) /* Can support HS400 1.2V */ #define MMC_CAP2_HS400 (MMC_CAP2_HS400_1_8V | \ MMC_CAP2_HS400_1_2V) +#define MMC_CAP2_SDIO_IRQ_NOTHREAD (1 << 17) mmc_pm_flag_t pm_caps; /* supported pm features */ @@ -397,6 +398,8 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host) wake_up_process(host->sdio_irq_thread); } +void sdio_run_irqs(struct mmc_host *host); + #ifdef CONFIG_REGULATOR int mmc_regulator_get_ocrmask(struct regulator *supply); int mmc_regulator_set_ocr(struct mmc_host *mmc, -- cgit v1.2.3-71-gd317 From 781e989cf593c71d26bdca74f5e77b3651fc060e Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 12:55:46 +0100 Subject: mmc: sdhci: convert to new SDIO IRQ handling Use a generic threaded interrupt handler for SDIO interrupt handling, rather than allowing the SDIO core code to buggily spawn its own thread. This results in host drivers being more in control of how SDIO interrupts are acknowledged in the hardware, rather than having the internals of the SDIO core placed upon them, possibly resulting in sub-standard handling. 
At least one SDHCI implementation specifies a very specific sequence to deal with a card interrupt. Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 62 +++++++++++++++++++++++++++++------------------ include/linux/mmc/sdhci.h | 2 ++ 2 files changed, 41 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 4b6cca2130bc..4a0622d52ae5 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2428,10 +2428,10 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) static irqreturn_t sdhci_irq(int irq, void *dev_id) { - irqreturn_t result; + irqreturn_t result = IRQ_NONE; struct sdhci_host *host = dev_id; u32 intmask, mask, unexpected = 0; - int cardint = 0, max_loops = 16; + int max_loops = 16; spin_lock(&host->lock); @@ -2490,8 +2490,11 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) pr_err("%s: Card is consuming too much power!\n", mmc_hostname(host->mmc)); - if (intmask & SDHCI_INT_CARD_INT) - cardint = 1; + if (intmask & SDHCI_INT_CARD_INT) { + sdhci_enable_sdio_irq_nolock(host, false); + host->thread_isr |= SDHCI_INT_CARD_INT; + result = IRQ_WAKE_THREAD; + } intmask &= ~(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE | SDHCI_INT_CMD_MASK | SDHCI_INT_DATA_MASK | @@ -2503,17 +2506,10 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) sdhci_writel(host, intmask, SDHCI_INT_STATUS); } - result = IRQ_HANDLED; + if (result == IRQ_NONE) + result = IRQ_HANDLED; intmask = sdhci_readl(host, SDHCI_INT_STATUS); - - /* - * If we know we'll call the driver to signal SDIO IRQ, - * disregard further indications of Card Interrupt in - * the status to avoid a needless loop. - */ - if (cardint) - intmask &= ~SDHCI_INT_CARD_INT; } while (intmask && --max_loops); out: spin_unlock(&host->lock); @@ -2523,15 +2519,33 @@ out: mmc_hostname(host->mmc), unexpected); sdhci_dumpregs(host); } - /* - * We have to delay this as it calls back into the driver. - */ - if (cardint) - mmc_signal_sdio_irq(host->mmc); return result; } +static irqreturn_t sdhci_thread_irq(int irq, void *dev_id) +{ + struct sdhci_host *host = dev_id; + unsigned long flags; + u32 isr; + + spin_lock_irqsave(&host->lock, flags); + isr = host->thread_isr; + host->thread_isr = 0; + spin_unlock_irqrestore(&host->lock, flags); + + if (isr & SDHCI_INT_CARD_INT) { + sdio_run_irqs(host->mmc); + + spin_lock_irqsave(&host->lock, flags); + if (host->flags & SDHCI_SDIO_IRQ_ENABLED) + sdhci_enable_sdio_irq_nolock(host, true); + spin_unlock_irqrestore(&host->lock, flags); + } + + return isr ? 
IRQ_HANDLED : IRQ_NONE; +} + /*****************************************************************************\ * * * Suspend/resume * @@ -2601,8 +2615,9 @@ int sdhci_resume_host(struct sdhci_host *host) } if (!device_may_wakeup(mmc_dev(host->mmc))) { - ret = request_irq(host->irq, sdhci_irq, IRQF_SHARED, - mmc_hostname(host->mmc), host); + ret = request_threaded_irq(host->irq, sdhci_irq, + sdhci_thread_irq, IRQF_SHARED, + mmc_hostname(host->mmc), host); if (ret) return ret; } else { @@ -2681,7 +2696,7 @@ int sdhci_runtime_suspend_host(struct sdhci_host *host) sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); spin_unlock_irqrestore(&host->lock, flags); - synchronize_irq(host->irq); + synchronize_hardirq(host->irq); spin_lock_irqsave(&host->lock, flags); host->runtime_suspended = true; @@ -2937,6 +2952,7 @@ int sdhci_add_host(struct sdhci_host *host) mmc->max_busy_timeout = (1 << 27) / host->timeout_clk; mmc->caps |= MMC_CAP_SDIO_IRQ | MMC_CAP_ERASE | MMC_CAP_CMD23; + mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; if (host->quirks & SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12) host->flags |= SDHCI_AUTO_CMD12; @@ -3226,8 +3242,8 @@ int sdhci_add_host(struct sdhci_host *host) sdhci_init(host, 0); - ret = request_irq(host->irq, sdhci_irq, IRQF_SHARED, - mmc_hostname(mmc), host); + ret = request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq, + IRQF_SHARED, mmc_hostname(mmc), host); if (ret) { pr_err("%s: Failed to request IRQ %d: %d\n", mmc_hostname(mmc), host->irq, ret); diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 7be12b883485..d1aa97b77dd9 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -177,6 +177,8 @@ struct sdhci_host { unsigned int ocr_avail_mmc; u32 ocr_mask; /* available voltages */ + u32 thread_isr; + wait_queue_head_t buf_ready_int; /* Waitqueue for Buffer Read Ready interrupt */ unsigned int tuning_done; /* Condition flag set when CMD19 succeeds */ -- cgit v1.2.3-71-gd317 From 3560db8e247aa35bc6b287ec7ec51cd41abd512e Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 12:55:51 +0100 Subject: mmc: sdhci: push card_tasklet into threaded irq handler There's no requirement to have the card tasklet separate now that we have a threaded interrupt handler, so kill this and move the called code into the threaded part of the handler. 
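The result is the usual two-stage split; a condensed sketch of the flow after this change (simplified, with locking and status acknowledgement omitted, and the foo_* names hypothetical):

	static irqreturn_t foo_hardirq(int irq, void *dev_id)
	{
		struct sdhci_host *host = dev_id;
		u32 intmask = sdhci_readl(host, SDHCI_INT_STATUS);

		if (intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) {
			host->thread_isr |= intmask & (SDHCI_INT_CARD_INSERT |
						       SDHCI_INT_CARD_REMOVE);
			return IRQ_WAKE_THREAD;	/* defer the sleeping work */
		}
		return IRQ_HANDLED;
	}

	static irqreturn_t foo_threadirq(int irq, void *dev_id)
	{
		struct sdhci_host *host = dev_id;

		/* What the card tasklet used to do: */
		sdhci_card_event(host->mmc);
		mmc_detect_change(host->mmc, msecs_to_jiffies(200));
		return IRQ_HANDLED;
	}
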
Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 23 +++++++++-------------- include/linux/mmc/sdhci.h | 3 +-- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 4a0622d52ae5..8def3919b32c 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2128,15 +2128,6 @@ static const struct mmc_host_ops sdhci_ops = { * * \*****************************************************************************/ -static void sdhci_tasklet_card(unsigned long param) -{ - struct sdhci_host *host = (struct sdhci_host*)param; - - sdhci_card_event(host->mmc); - - mmc_detect_change(host->mmc, msecs_to_jiffies(200)); -} - static void sdhci_tasklet_finish(unsigned long param) { struct sdhci_host *host; @@ -2477,7 +2468,10 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) sdhci_writel(host, intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE), SDHCI_INT_STATUS); - tasklet_schedule(&host->card_tasklet); + + host->thread_isr |= intmask & (SDHCI_INT_CARD_INSERT | + SDHCI_INT_CARD_REMOVE); + result = IRQ_WAKE_THREAD; } if (intmask & SDHCI_INT_CMD_MASK) @@ -2534,6 +2528,11 @@ static irqreturn_t sdhci_thread_irq(int irq, void *dev_id) host->thread_isr = 0; spin_unlock_irqrestore(&host->lock, flags); + if (isr & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) { + sdhci_card_event(host->mmc); + mmc_detect_change(host->mmc, msecs_to_jiffies(200)); + } + if (isr & SDHCI_INT_CARD_INT) { sdio_run_irqs(host->mmc); @@ -3224,8 +3223,6 @@ int sdhci_add_host(struct sdhci_host *host) /* * Init tasklets. */ - tasklet_init(&host->card_tasklet, - sdhci_tasklet_card, (unsigned long)host); tasklet_init(&host->finish_tasklet, sdhci_tasklet_finish, (unsigned long)host); @@ -3290,7 +3287,6 @@ reset: free_irq(host->irq, host); #endif untasklet: - tasklet_kill(&host->card_tasklet); tasklet_kill(&host->finish_tasklet); return ret; @@ -3334,7 +3330,6 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) del_timer_sync(&host->timer); - tasklet_kill(&host->card_tasklet); tasklet_kill(&host->finish_tasklet); if (host->vmmc) { diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index d1aa97b77dd9..f1c8e14e8751 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -164,8 +164,7 @@ struct sdhci_host { dma_addr_t adma_addr; /* Mapped ADMA descr. table */ dma_addr_t align_addr; /* Mapped bounce buffer */ - struct tasklet_struct card_tasklet; /* Tasklet structures */ - struct tasklet_struct finish_tasklet; + struct tasklet_struct finish_tasklet; /* Tasklet structures */ struct timer_list timer; /* Timer for timeouts */ -- cgit v1.2.3-71-gd317 From b537f94ce19583de1882f539a5cc49aa99260aca Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 12:56:01 +0100 Subject: mmc: sdhci: more efficient interrupt enable register handling Rather than wasting cycles read-modify-writing the interrupt enable registers, cache the value locally instead. 
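The three-step update repeated throughout the diff below could equally be wrapped in a single helper; a sketch (the helper name is made up, and the patch open-codes the same steps at each call site):

	/* Update the cached interrupt mask and both hardware registers. */
	static void foo_sdhci_clear_set_irqs(struct sdhci_host *host,
					     u32 clear, u32 set)
	{
		host->ier = (host->ier & ~clear) | set;
		sdhci_writel(host, host->ier, SDHCI_INT_ENABLE);
		sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE);
	}
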
Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 98 +++++++++++++++++++++++------------------------ include/linux/mmc/sdhci.h | 3 ++ 2 files changed, 50 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 0ecbcc4c29d2..4a98ee29d136 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -131,27 +131,6 @@ static void sdhci_dumpregs(struct sdhci_host *host) * * \*****************************************************************************/ -static void sdhci_clear_set_irqs(struct sdhci_host *host, u32 clear, u32 set) -{ - u32 ier; - - ier = sdhci_readl(host, SDHCI_INT_ENABLE); - ier &= ~clear; - ier |= set; - sdhci_writel(host, ier, SDHCI_INT_ENABLE); - sdhci_writel(host, ier, SDHCI_SIGNAL_ENABLE); -} - -static void sdhci_unmask_irqs(struct sdhci_host *host, u32 irqs) -{ - sdhci_clear_set_irqs(host, 0, irqs); -} - -static void sdhci_mask_irqs(struct sdhci_host *host, u32 irqs) -{ - sdhci_clear_set_irqs(host, irqs, 0); -} - static void sdhci_set_card_detection(struct sdhci_host *host, bool enable) { u32 present, irqs; @@ -165,9 +144,12 @@ static void sdhci_set_card_detection(struct sdhci_host *host, bool enable) irqs = present ? SDHCI_INT_CARD_REMOVE : SDHCI_INT_CARD_INSERT; if (enable) - sdhci_unmask_irqs(host, irqs); + host->ier |= irqs; else - sdhci_mask_irqs(host, irqs); + host->ier &= ~irqs; + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); } static void sdhci_enable_card_detection(struct sdhci_host *host) @@ -183,17 +165,12 @@ static void sdhci_disable_card_detection(struct sdhci_host *host) static void sdhci_reset(struct sdhci_host *host, u8 mask) { unsigned long timeout; - u32 uninitialized_var(ier); - if (host->quirks & SDHCI_QUIRK_NO_CARD_NO_RESET) { if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT)) return; } - if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) - ier = sdhci_readl(host, SDHCI_INT_ENABLE); - if (host->ops->platform_reset_enter) host->ops->platform_reset_enter(host, mask); @@ -224,8 +201,10 @@ static void sdhci_reset(struct sdhci_host *host, u8 mask) if (host->ops->platform_reset_exit) host->ops->platform_reset_exit(host, mask); - if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) - sdhci_clear_set_irqs(host, SDHCI_INT_ALL_MASK, ier); + if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) { + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); + } if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { if ((host->ops->enable_dma) && (mask & SDHCI_RESET_ALL)) @@ -242,11 +221,14 @@ static void sdhci_init(struct sdhci_host *host, int soft) else sdhci_reset(host, SDHCI_RESET_ALL); - sdhci_clear_set_irqs(host, SDHCI_INT_ALL_MASK, - SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT | - SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_TIMEOUT | SDHCI_INT_INDEX | - SDHCI_INT_END_BIT | SDHCI_INT_CRC | SDHCI_INT_TIMEOUT | - SDHCI_INT_DATA_END | SDHCI_INT_RESPONSE); + host->ier = SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT | + SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_TIMEOUT | + SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC | + SDHCI_INT_TIMEOUT | SDHCI_INT_DATA_END | + SDHCI_INT_RESPONSE; + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); if (soft) { /* force clock reconfiguration */ @@ -721,9 
+703,12 @@ static void sdhci_set_transfer_irqs(struct sdhci_host *host) u32 dma_irqs = SDHCI_INT_DMA_END | SDHCI_INT_ADMA_ERROR; if (host->flags & SDHCI_REQ_USE_DMA) - sdhci_clear_set_irqs(host, pio_irqs, dma_irqs); + host->ier = (host->ier & ~pio_irqs) | dma_irqs; else - sdhci_clear_set_irqs(host, dma_irqs, pio_irqs); + host->ier = (host->ier & ~dma_irqs) | pio_irqs; + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); } static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd) @@ -1713,9 +1698,12 @@ static void sdhci_enable_sdio_irq_nolock(struct sdhci_host *host, int enable) { if (!(host->flags & SDHCI_DEVICE_DEAD)) { if (enable) - sdhci_unmask_irqs(host, SDHCI_INT_CARD_INT); + host->ier |= SDHCI_INT_CARD_INT; else - sdhci_mask_irqs(host, SDHCI_INT_CARD_INT); + host->ier &= ~SDHCI_INT_CARD_INT; + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); mmiowb(); } } @@ -1857,7 +1845,6 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) { struct sdhci_host *host; u16 ctrl; - u32 ier; int tuning_loop_counter = MAX_TUNING_LOOP; unsigned long timeout; int err = 0; @@ -1911,8 +1898,8 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) * to make sure we don't hit a controller bug, we _only_ * enable Buffer Read Ready interrupt here. */ - ier = sdhci_readl(host, SDHCI_INT_ENABLE); - sdhci_clear_set_irqs(host, ier, SDHCI_INT_DATA_AVAIL); + sdhci_writel(host, SDHCI_INT_DATA_AVAIL, SDHCI_INT_ENABLE); + sdhci_writel(host, SDHCI_INT_DATA_AVAIL, SDHCI_SIGNAL_ENABLE); /* * Issue CMD19 repeatedly till Execute Tuning is set to 0 or the number @@ -2047,7 +2034,8 @@ out: if (err && (host->flags & SDHCI_USING_RETUNING_TIMER)) err = 0; - sdhci_clear_set_irqs(host, SDHCI_INT_DATA_AVAIL, ier); + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); spin_unlock_irqrestore(&host->lock, flags); sdhci_runtime_pm_put(host); @@ -2460,10 +2448,12 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) * More testing are needed here to ensure it works * for other platforms though. */ - sdhci_mask_irqs(host, present ? SDHCI_INT_CARD_INSERT : - SDHCI_INT_CARD_REMOVE); - sdhci_unmask_irqs(host, present ? SDHCI_INT_CARD_REMOVE : - SDHCI_INT_CARD_INSERT); + host->ier &= ~(SDHCI_INT_CARD_INSERT | + SDHCI_INT_CARD_REMOVE); + host->ier |= present ? 
SDHCI_INT_CARD_REMOVE : + SDHCI_INT_CARD_INSERT; + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); sdhci_writel(host, intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE), SDHCI_INT_STATUS); @@ -2592,7 +2582,9 @@ int sdhci_suspend_host(struct sdhci_host *host) } if (!device_may_wakeup(mmc_dev(host->mmc))) { - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); + host->ier = 0; + sdhci_writel(host, 0, SDHCI_INT_ENABLE); + sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); free_irq(host->irq, host); } else { sdhci_enable_irq_wakeups(host); @@ -2691,7 +2683,9 @@ int sdhci_runtime_suspend_host(struct sdhci_host *host) } spin_lock_irqsave(&host->lock, flags); - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK & ~SDHCI_INT_CARD_INT); + host->ier &= SDHCI_INT_CARD_INT; + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); spin_unlock_irqrestore(&host->lock, flags); synchronize_hardirq(host->irq); @@ -3282,7 +3276,8 @@ int sdhci_add_host(struct sdhci_host *host) #ifdef SDHCI_USE_LEDS_CLASS reset: sdhci_reset(host, SDHCI_RESET_ALL); - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); + sdhci_writel(host, 0, SDHCI_INT_ENABLE); + sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); free_irq(host->irq, host); #endif untasklet: @@ -3324,7 +3319,8 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) if (!dead) sdhci_reset(host, SDHCI_RESET_ALL); - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); + sdhci_writel(host, 0, SDHCI_INT_ENABLE); + sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); free_irq(host->irq, host); del_timer_sync(&host->timer); diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index f1c8e14e8751..9361d8ef509d 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -178,6 +178,9 @@ struct sdhci_host { u32 thread_isr; + /* cached registers */ + u32 ier; + wait_queue_head_t buf_ready_int; /* Waitqueue for Buffer Read Ready interrupt */ unsigned int tuning_done; /* Condition flag set when CMD19 succeeds */ -- cgit v1.2.3-71-gd317 From 0718e59ae259f7c48155b4e852d8b0632d59028e Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 12:57:18 +0100 Subject: mmc: sdhci: move FSL ESDHC reset handling quirk into esdhc code The Freescale esdhc driver is the only driver which needs the interrupt registers restored after a reset. Move this quirk to be part of the ESDHC driver implementation. 
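A minimal sketch of the pattern the conversion above repeats at every call site (a hypothetical helper, not introduced by these patches; it assumes only the sdhci register accessors and the cached host->ier field already shown):

static void sdhci_write_irqs(struct sdhci_host *host)
{
        /* hypothetical helper: push the cached enables to both registers */
        sdhci_writel(host, host->ier, SDHCI_INT_ENABLE);
        sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE);
}

Caching the enables in host->ier is what allows the esdhc_reset() wrapper in the diff below to restore them after a controller reset without the SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET quirk.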
Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci-esdhc-imx.c | 10 +++++++++- drivers/mmc/host/sdhci-esdhc.h | 3 +-- drivers/mmc/host/sdhci.c | 5 ----- include/linux/mmc/sdhci.h | 2 -- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index b1d74fa33c5f..812c5772d900 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -876,6 +876,14 @@ static int esdhc_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) return esdhc_change_pinstate(host, uhs); } +static void esdhc_reset(struct sdhci_host *host, u8 mask) +{ + sdhci_reset(host, mask); + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); +} + static struct sdhci_ops sdhci_esdhc_ops = { .read_l = esdhc_readl_le, .read_w = esdhc_readw_le, @@ -888,7 +896,7 @@ static struct sdhci_ops sdhci_esdhc_ops = { .get_ro = esdhc_pltfm_get_ro, .set_bus_width = esdhc_pltfm_set_bus_width, .set_uhs_signaling = esdhc_set_uhs_signaling, - .reset = sdhci_reset, + .reset = esdhc_reset, }; static const struct sdhci_pltfm_data sdhci_esdhc_imx_pdata = { diff --git a/drivers/mmc/host/sdhci-esdhc.h b/drivers/mmc/host/sdhci-esdhc.h index a7d9f95a7b03..de69bddc3afc 100644 --- a/drivers/mmc/host/sdhci-esdhc.h +++ b/drivers/mmc/host/sdhci-esdhc.h @@ -22,8 +22,7 @@ SDHCI_QUIRK_NO_BUSY_IRQ | \ SDHCI_QUIRK_NONSTANDARD_CLOCK | \ SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | \ - SDHCI_QUIRK_PIO_NEEDS_DELAY | \ - SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) + SDHCI_QUIRK_PIO_NEEDS_DELAY) #define ESDHC_SYSTEM_CONTROL 0x2c #define ESDHC_CLOCK_MASK 0x0000fff0 diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 5e25147e92f7..074157e8e73d 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -203,11 +203,6 @@ static void sdhci_do_reset(struct sdhci_host *host, u8 mask) host->ops->reset(host, mask); - if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) { - sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); - sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); - } - if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { if ((host->ops->enable_dma) && (mask & SDHCI_RESET_ALL)) host->ops->enable_dma(host); diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 9361d8ef509d..02919ef99419 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -61,8 +61,6 @@ struct sdhci_host { #define SDHCI_QUIRK_NONSTANDARD_CLOCK (1<<17) /* Controller does not like fast PIO transfers */ #define SDHCI_QUIRK_PIO_NEEDS_DELAY (1<<18) -/* Controller losing signal/interrupt enable states after reset */ -#define SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET (1<<19) /* Controller has to be forced to use block size of 2048 bytes */ #define SDHCI_QUIRK_FORCE_BLK_SZ_2048 (1<<20) /* Controller cannot do multi-block transfers */ -- cgit v1.2.3-71-gd317 From 1771059cf5f9c09e37ef6315df8acf120f2642fc Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 12:58:55 +0100 Subject: mmc: sdhci: convert sdhci_set_clock() into a library function Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci-acpi.c | 2 ++ drivers/mmc/host/sdhci-bcm-kona.c | 1 + drivers/mmc/host/sdhci-bcm2835.c | 1 + drivers/mmc/host/sdhci-cns3xxx.c | 3 +-- 
drivers/mmc/host/sdhci-dove.c | 1 + drivers/mmc/host/sdhci-esdhc.h | 1 - drivers/mmc/host/sdhci-of-arasan.c | 1 + drivers/mmc/host/sdhci-of-hlwd.c | 1 + drivers/mmc/host/sdhci-pci.c | 1 + drivers/mmc/host/sdhci-pltfm.c | 1 + drivers/mmc/host/sdhci-pxav2.c | 1 + drivers/mmc/host/sdhci-pxav3.c | 1 + drivers/mmc/host/sdhci-s3c.c | 19 ++++++++++++++----- drivers/mmc/host/sdhci-sirf.c | 1 + drivers/mmc/host/sdhci-spear.c | 1 + drivers/mmc/host/sdhci-tegra.c | 1 + drivers/mmc/host/sdhci.c | 17 ++++++----------- drivers/mmc/host/sdhci.h | 1 + include/linux/mmc/sdhci.h | 2 -- 19 files changed, 36 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index aca84a682551..323e2a688563 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -102,12 +102,14 @@ static void sdhci_acpi_int_hw_reset(struct sdhci_host *host) } static const struct sdhci_ops sdhci_acpi_ops_dflt = { + .set_clock = sdhci_set_clock, .enable_dma = sdhci_acpi_enable_dma, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, }; static const struct sdhci_ops sdhci_acpi_ops_int = { + .set_clock = sdhci_set_clock, .enable_dma = sdhci_acpi_enable_dma, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, diff --git a/drivers/mmc/host/sdhci-bcm-kona.c b/drivers/mmc/host/sdhci-bcm-kona.c index 7b97bfab910d..e610811c09b0 100644 --- a/drivers/mmc/host/sdhci-bcm-kona.c +++ b/drivers/mmc/host/sdhci-bcm-kona.c @@ -206,6 +206,7 @@ static void sdhci_bcm_kona_init_74_clocks(struct sdhci_host *host, } static struct sdhci_ops sdhci_bcm_kona_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_bcm_kona_get_max_clk, .get_timeout_clock = sdhci_bcm_kona_get_timeout_clock, .platform_send_init_74_clocks = sdhci_bcm_kona_init_74_clocks, diff --git a/drivers/mmc/host/sdhci-bcm2835.c b/drivers/mmc/host/sdhci-bcm2835.c index 289b1c80d5fc..74906d6008e1 100644 --- a/drivers/mmc/host/sdhci-bcm2835.c +++ b/drivers/mmc/host/sdhci-bcm2835.c @@ -131,6 +131,7 @@ static const struct sdhci_ops bcm2835_sdhci_ops = { .read_l = bcm2835_sdhci_readl, .read_w = bcm2835_sdhci_readw, .read_b = bcm2835_sdhci_readb, + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_min_clock = bcm2835_sdhci_get_min_clock, .set_bus_width = sdhci_set_bus_width, diff --git a/drivers/mmc/host/sdhci-cns3xxx.c b/drivers/mmc/host/sdhci-cns3xxx.c index 416f4a4c2e35..587d73ef33ff 100644 --- a/drivers/mmc/host/sdhci-cns3xxx.c +++ b/drivers/mmc/host/sdhci-cns3xxx.c @@ -89,8 +89,7 @@ static const struct sdhci_pltfm_data sdhci_cns3xxx_pdata = { SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | SDHCI_QUIRK_INVERTED_WRITE_PROTECT | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN | - SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | - SDHCI_QUIRK_NONSTANDARD_CLOCK, + SDHCI_QUIRK_BROKEN_TIMEOUT_VAL, }; static int sdhci_cns3xxx_probe(struct platform_device *pdev) diff --git a/drivers/mmc/host/sdhci-dove.c b/drivers/mmc/host/sdhci-dove.c index 1408cc11d881..8ef4ab52f8e0 100644 --- a/drivers/mmc/host/sdhci-dove.c +++ b/drivers/mmc/host/sdhci-dove.c @@ -86,6 +86,7 @@ static u32 sdhci_dove_readl(struct sdhci_host *host, int reg) static const struct sdhci_ops sdhci_dove_ops = { .read_w = sdhci_dove_readw, .read_l = sdhci_dove_readl, + .set_clock = sdhci_set_clock, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, }; diff --git a/drivers/mmc/host/sdhci-esdhc.h b/drivers/mmc/host/sdhci-esdhc.h index de69bddc3afc..3497cfaf683c 100644 --- a/drivers/mmc/host/sdhci-esdhc.h +++ 
b/drivers/mmc/host/sdhci-esdhc.h @@ -20,7 +20,6 @@ #define ESDHC_DEFAULT_QUIRKS (SDHCI_QUIRK_FORCE_BLK_SZ_2048 | \ SDHCI_QUIRK_NO_BUSY_IRQ | \ - SDHCI_QUIRK_NONSTANDARD_CLOCK | \ SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | \ SDHCI_QUIRK_PIO_NEEDS_DELAY) diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index faef21740584..f0ee594f25d1 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -52,6 +52,7 @@ static unsigned int sdhci_arasan_get_timeout_clock(struct sdhci_host *host) } static struct sdhci_ops sdhci_arasan_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_arasan_get_timeout_clock, .set_bus_width = sdhci_set_bus_width, diff --git a/drivers/mmc/host/sdhci-of-hlwd.c b/drivers/mmc/host/sdhci-of-hlwd.c index fb01958cb18e..a4a1f0f2c0a0 100644 --- a/drivers/mmc/host/sdhci-of-hlwd.c +++ b/drivers/mmc/host/sdhci-of-hlwd.c @@ -58,6 +58,7 @@ static const struct sdhci_ops sdhci_hlwd_ops = { .write_l = sdhci_hlwd_writel, .write_w = sdhci_hlwd_writew, .write_b = sdhci_hlwd_writeb, + .set_clock = sdhci_set_clock, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, }; diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index 87f9dd91f68c..b3a28f6b170e 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -1078,6 +1078,7 @@ static void sdhci_pci_hw_reset(struct sdhci_host *host) } static const struct sdhci_ops sdhci_pci_ops = { + .set_clock = sdhci_set_clock, .enable_dma = sdhci_pci_enable_dma, .set_bus_width = sdhci_pci_set_bus_width, .reset = sdhci_reset, diff --git a/drivers/mmc/host/sdhci-pltfm.c b/drivers/mmc/host/sdhci-pltfm.c index bfbf467b61c7..1fb89f44bd58 100644 --- a/drivers/mmc/host/sdhci-pltfm.c +++ b/drivers/mmc/host/sdhci-pltfm.c @@ -45,6 +45,7 @@ unsigned int sdhci_pltfm_clk_get_max_clock(struct sdhci_host *host) EXPORT_SYMBOL_GPL(sdhci_pltfm_clk_get_max_clock); static const struct sdhci_ops sdhci_pltfm_ops = { + .set_clock = sdhci_set_clock, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, }; diff --git a/drivers/mmc/host/sdhci-pxav2.c b/drivers/mmc/host/sdhci-pxav2.c index 2eee0c8b88eb..db5257bf032e 100644 --- a/drivers/mmc/host/sdhci-pxav2.c +++ b/drivers/mmc/host/sdhci-pxav2.c @@ -112,6 +112,7 @@ static void pxav2_mmc_set_bus_width(struct sdhci_host *host, int width) } static const struct sdhci_ops pxav2_sdhci_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .set_bus_width = pxav2_mmc_set_bus_width, .reset = pxav2_reset, diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index 86564233ae93..8a40e079a57e 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -225,6 +225,7 @@ static int pxav3_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) } static const struct sdhci_ops pxav3_sdhci_ops = { + .set_clock = sdhci_set_clock, .set_uhs_signaling = pxav3_set_uhs_signaling, .platform_send_init_74_clocks = pxav3_gen_init_74_clocks, .get_max_clock = sdhci_pltfm_clk_get_max_clock, diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index 9d710b748b9c..9e6f1c52982c 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -55,6 +55,8 @@ struct sdhci_s3c { struct clk *clk_io; struct clk *clk_bus[MAX_BUS_CLK]; unsigned long clk_rates[MAX_BUS_CLK]; + + bool no_divider; }; /** @@ -67,6 +69,7 @@ struct sdhci_s3c { */ struct sdhci_s3c_drv_data { unsigned 
int sdhci_quirks; + bool no_divider; }; static inline struct sdhci_s3c *to_s3c(struct sdhci_host *host) @@ -116,7 +119,7 @@ static unsigned int sdhci_s3c_consider_clock(struct sdhci_s3c *ourhost, * If controller uses a non-standard clock division, find the best clock * speed possible with selected clock source and skip the division. */ - if (ourhost->host->quirks & SDHCI_QUIRK_NONSTANDARD_CLOCK) { + if (ourhost->no_divider) { rate = clk_round_rate(clksrc, wanted); return wanted - rate; } @@ -161,8 +164,10 @@ static void sdhci_s3c_set_clock(struct sdhci_host *host, unsigned int clock) host->mmc->actual_clock = 0; /* don't bother if the clock is going off. */ - if (clock == 0) + if (clock == 0) { + sdhci_set_clock(host, clock); return; + } for (src = 0; src < MAX_BUS_CLK; src++) { delta = sdhci_s3c_consider_clock(ourhost, src, clock); @@ -214,6 +219,8 @@ static void sdhci_s3c_set_clock(struct sdhci_host *host, unsigned int clock) if (clock < 25 * 1000000) ctrl |= (S3C_SDHCI_CTRL3_FCSEL3 | S3C_SDHCI_CTRL3_FCSEL2); writel(ctrl, host->ioaddr + S3C_SDHCI_CONTROL3); + + sdhci_set_clock(host, clock); } /** @@ -603,8 +610,10 @@ static int sdhci_s3c_probe(struct platform_device *pdev) /* Setup quirks for the controller */ host->quirks |= SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC; host->quirks |= SDHCI_QUIRK_NO_HISPD_BIT; - if (drv_data) + if (drv_data) { host->quirks |= drv_data->sdhci_quirks; + sc->no_divider = drv_data->no_divider; + } #ifndef CONFIG_MMC_SDHCI_S3C_DMA @@ -653,7 +662,7 @@ static int sdhci_s3c_probe(struct platform_device *pdev) * If controller does not have internal clock divider, * we can use overriding functions instead of default. */ - if (host->quirks & SDHCI_QUIRK_NONSTANDARD_CLOCK) { + if (sc->no_divider) { sdhci_s3c_ops.set_clock = sdhci_cmu_set_clock; sdhci_s3c_ops.get_min_clock = sdhci_cmu_get_min_clock; sdhci_s3c_ops.get_max_clock = sdhci_cmu_get_max_clock; @@ -794,7 +803,7 @@ static const struct dev_pm_ops sdhci_s3c_pmops = { #if defined(CONFIG_CPU_EXYNOS4210) || defined(CONFIG_SOC_EXYNOS4212) static struct sdhci_s3c_drv_data exynos4_sdhci_drv_data = { - .sdhci_quirks = SDHCI_QUIRK_NONSTANDARD_CLOCK, + .no_divider = true, }; #define EXYNOS4_SDHCI_DRV_DATA ((kernel_ulong_t)&exynos4_sdhci_drv_data) #else diff --git a/drivers/mmc/host/sdhci-sirf.c b/drivers/mmc/host/sdhci-sirf.c index 5d79e10e1ba2..3b775348b470 100644 --- a/drivers/mmc/host/sdhci-sirf.c +++ b/drivers/mmc/host/sdhci-sirf.c @@ -28,6 +28,7 @@ static unsigned int sdhci_sirf_get_max_clk(struct sdhci_host *host) } static struct sdhci_ops sdhci_sirf_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_sirf_get_max_clk, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index c2a2bedc8813..8bf64ab36720 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -38,6 +38,7 @@ struct spear_sdhci { /* sdhci ops */ static const struct sdhci_ops sdhci_pltfm_ops = { + .set_clock = sdhci_set_clock, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, }; diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 7754c0319fda..a0a8b5cc3b0c 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -153,6 +153,7 @@ static const struct sdhci_ops tegra_sdhci_ops = { .read_l = tegra_sdhci_readl, .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, + .set_clock = sdhci_set_clock, .set_bus_width = tegra_sdhci_set_bus_width, .reset = tegra_sdhci_reset, }; 
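With sdhci_set_clock() exported as a library function, each driver now names its clock handler explicitly in its sdhci_ops, and a driver with platform-specific requirements can layer on top of the library call the way sdhci-s3c does above. A hedged sketch of that layering, using a hypothetical "myplat" driver (illustration only, not one of the drivers converted here):

/* hypothetical driver, for illustration */
static void myplat_set_clock(struct sdhci_host *host, unsigned int clock)
{
        /* platform-specific clock-source selection would go here */

        /* then reuse the standard SDHCI divider programming */
        sdhci_set_clock(host, clock);
}

static const struct sdhci_ops myplat_sdhci_ops = {
        .set_clock      = myplat_set_clock,
        .set_bus_width  = sdhci_set_bus_width,
        .reset          = sdhci_reset,
};

The design choice shows in the core diff below: sdhci_do_set_ios() now calls host->ops->set_clock() unconditionally, so the SDHCI_QUIRK_NONSTANDARD_CLOCK escape hatch can be deleted and every host driver must supply the callback.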
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index d9b91fc17bb0..69e58d071b33 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1112,19 +1112,13 @@ static u16 sdhci_get_preset_value(struct sdhci_host *host) return preset; } -static void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) +void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) { int div = 0; /* Initialized for compiler warning */ int real_div = div, clk_mul = 1; u16 clk = 0; unsigned long timeout; - if (host->ops->set_clock) { - host->ops->set_clock(host, clock); - if (host->quirks & SDHCI_QUIRK_NONSTANDARD_CLOCK) - return; - } - host->mmc->actual_clock = 0; sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); @@ -1221,6 +1215,7 @@ clock_set: clk |= SDHCI_CLOCK_CARD_EN; sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); } +EXPORT_SYMBOL_GPL(sdhci_set_clock); static int sdhci_set_power(struct sdhci_host *host, unsigned short power) { @@ -1439,7 +1434,7 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) sdhci_enable_preset_value(host, false); if (!ios->clock || ios->clock != host->clock) { - sdhci_set_clock(host, ios->clock); + host->ops->set_clock(host, ios->clock); host->clock = ios->clock; } @@ -1510,7 +1505,7 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); /* Re-enable SD Clock */ - sdhci_set_clock(host, host->clock); + host->ops->set_clock(host, host->clock); } @@ -1555,7 +1550,7 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) } /* Re-enable SD Clock */ - sdhci_set_clock(host, host->clock); + host->ops->set_clock(host, host->clock); } else sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); @@ -2129,7 +2124,7 @@ static void sdhci_tasklet_finish(unsigned long param) /* Some controllers need this kick or reset won't work here */ if (host->quirks & SDHCI_QUIRK_CLOCK_BEFORE_RESET) /* This is to force an update */ - sdhci_set_clock(host, host->clock); + host->ops->set_clock(host, host->clock); /* Spec says we should do both at the same time, but Ricoh controllers do not like that. 
*/ diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 7d84cb3b0e00..ac20195f667b 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -400,6 +400,7 @@ static inline bool sdhci_sdio_irq_enabled(struct sdhci_host *host) return !!(host->flags & SDHCI_SDIO_IRQ_ENABLED); } +void sdhci_set_clock(struct sdhci_host *host, unsigned int clock); void sdhci_set_bus_width(struct sdhci_host *host, int width); void sdhci_reset(struct sdhci_host *host, u8 mask); diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 02919ef99419..72a90baf111f 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -57,8 +57,6 @@ struct sdhci_host { #define SDHCI_QUIRK_BROKEN_CARD_DETECTION (1<<15) /* Controller reports inverted write-protect state */ #define SDHCI_QUIRK_INVERTED_WRITE_PROTECT (1<<16) -/* Controller has nonstandard clock management */ -#define SDHCI_QUIRK_NONSTANDARD_CLOCK (1<<17) /* Controller does not like fast PIO transfers */ #define SDHCI_QUIRK_PIO_NEEDS_DELAY (1<<18) /* Controller has to be forced to use block size of 2048 bytes */ -- cgit v1.2.3-71-gd317 From d975f121011a58223c7936ab483c3374a83236c3 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 12:59:31 +0100 Subject: mmc: sdhci: cache timing information locally Rather than reading back the timing information from the registers, cache it locally. This allows implementations to translate the UHS timing by overriding the set_uhs_signaling() method as required without also having to emulate the SDHCI_HOST_CONTROL2 register. Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren [Ulf Hansson] Resolved conflict Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 23 ++++++++++++----------- include/linux/mmc/sdhci.h | 2 ++ 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 0073aae0adcb..956799c75df2 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1083,24 +1083,23 @@ static void sdhci_finish_command(struct sdhci_host *host) static u16 sdhci_get_preset_value(struct sdhci_host *host) { - u16 ctrl, preset = 0; + u16 preset = 0; - ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); - - switch (ctrl & SDHCI_CTRL_UHS_MASK) { - case SDHCI_CTRL_UHS_SDR12: + switch (host->timing) { + case MMC_TIMING_UHS_SDR12: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR12); break; - case SDHCI_CTRL_UHS_SDR25: + case MMC_TIMING_UHS_SDR25: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR25); break; - case SDHCI_CTRL_UHS_SDR50: + case MMC_TIMING_UHS_SDR50: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR50); break; - case SDHCI_CTRL_UHS_SDR104: + case MMC_TIMING_UHS_SDR104: + case MMC_TIMING_MMC_HS200: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR104); break; - case SDHCI_CTRL_UHS_DDR50: + case MMC_TIMING_UHS_DDR50: preset = sdhci_readw(host, SDHCI_PRESET_FOR_DDR50); break; default: @@ -1538,6 +1537,7 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); host->ops->set_uhs_signaling(host, ios->timing); + host->timing = ios->timing; if (!(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN) && ((ios->timing == MMC_TIMING_UHS_SDR12) || @@ -1842,12 +1842,13 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) * If the Host Controller supports the HS200 mode then the * tuning function has to be executed. 
*/ - if (((ctrl & SDHCI_CTRL_UHS_MASK) == SDHCI_CTRL_UHS_SDR50) && + if (host->timing == MMC_TIMING_UHS_SDR50 && (host->flags & SDHCI_SDR50_NEEDS_TUNING || host->flags & SDHCI_SDR104_NEEDS_TUNING)) requires_tuning_nonuhs = true; - if (((ctrl & SDHCI_CTRL_UHS_MASK) == SDHCI_CTRL_UHS_SDR104) || + if (host->timing == MMC_TIMING_MMC_HS200 || + host->timing == MMC_TIMING_UHS_SDR104 || requires_tuning_nonuhs) ctrl |= SDHCI_CTRL_EXEC_TUNING; else { diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 72a90baf111f..7f3efbab8732 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -172,6 +172,8 @@ struct sdhci_host { unsigned int ocr_avail_mmc; u32 ocr_mask; /* available voltages */ + unsigned timing; /* Current timing */ + u32 thread_isr; /* cached registers */ -- cgit v1.2.3-71-gd317 From da91a8f9c0f56d75b35bfe2e2456187ab55b3639 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Apr 2014 13:00:12 +0100 Subject: mmc: sdhci: track whether preset mode is currently enabled in hardware Track whether preset mode is currently enabled in hardware, and use that when making decisions elsewhere in the code rather than reading the register and checking the bit. Signed-off-by: Russell King Tested-by: Markus Pargmann Tested-by: Stephen Warren Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 44 ++++++++++++++++++++++++++------------------ include/linux/mmc/sdhci.h | 1 + 2 files changed, 27 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index effd9e5d1d81..447eef8217c7 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -205,9 +205,14 @@ static void sdhci_do_reset(struct sdhci_host *host, u8 mask) host->ops->reset(host, mask); - if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { - if ((host->ops->enable_dma) && (mask & SDHCI_RESET_ALL)) - host->ops->enable_dma(host); + if (mask & SDHCI_RESET_ALL) { + if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { + if (host->ops->enable_dma) + host->ops->enable_dma(host); + } + + /* Resetting the controller clears many */ + host->preset_enabled = false; } } @@ -1126,8 +1131,7 @@ void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) return; if (host->version >= SDHCI_SPEC_300) { - if (sdhci_readw(host, SDHCI_HOST_CONTROL2) & - SDHCI_CTRL_PRESET_VAL_ENABLE) { + if (host->preset_enabled) { u16 pre_val; clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); @@ -1493,13 +1497,13 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) (ios->timing == MMC_TIMING_UHS_SDR25)) ctrl |= SDHCI_CTRL_HISPD; - ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); - if (!(ctrl_2 & SDHCI_CTRL_PRESET_VAL_ENABLE)) { + if (!host->preset_enabled) { sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); /* * We only need to set Driver Strength if the * preset value enable is not set. */ + ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); ctrl_2 &= ~SDHCI_CTRL_DRV_TYPE_MASK; if (ios->drv_type == MMC_SET_DRIVER_TYPE_A) ctrl_2 |= SDHCI_CTRL_DRV_TYPE_A; @@ -2018,26 +2022,30 @@ out: static void sdhci_enable_preset_value(struct sdhci_host *host, bool enable) { - u16 ctrl; - /* Host Controller v3.00 defines preset value registers */ if (host->version < SDHCI_SPEC_300) return; - ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); - /* * We only enable or disable Preset Value if they are not already * enabled or disabled respectively. Otherwise, we bail out. 
*/ - if (enable && !(ctrl & SDHCI_CTRL_PRESET_VAL_ENABLE)) { - ctrl |= SDHCI_CTRL_PRESET_VAL_ENABLE; - sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); - host->flags |= SDHCI_PV_ENABLED; - } else if (!enable && (ctrl & SDHCI_CTRL_PRESET_VAL_ENABLE)) { - ctrl &= ~SDHCI_CTRL_PRESET_VAL_ENABLE; + if (host->preset_enabled != enable) { + u16 ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); + + if (enable) + ctrl |= SDHCI_CTRL_PRESET_VAL_ENABLE; + else + ctrl &= ~SDHCI_CTRL_PRESET_VAL_ENABLE; + sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); + + if (enable) + host->flags |= SDHCI_PV_ENABLED; + else + host->flags &= ~SDHCI_PV_ENABLED; + + host->preset_enabled = enable; } } diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 7f3efbab8732..08abe9941884 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -143,6 +143,7 @@ struct sdhci_host { bool runtime_suspended; /* Host is runtime suspended */ bool bus_on; /* Bus power prevents runtime suspend */ + bool preset_enabled; /* Preset is enabled */ struct mmc_request *mrq; /* Current request */ struct mmc_command *cmd; /* Current command */ -- cgit v1.2.3-71-gd317 From ee526d515ad12e9fee2d2dbfc7f626c0a5c7f417 Mon Sep 17 00:00:00 2001 From: Balaji T K Date: Fri, 9 May 2014 22:16:53 +0530 Subject: mmc: omap_hsmmc: split omap-dma header file Move the dmaengine consumer-specific declarations to omap-dmaengine.h to resolve a build failure seen with sh-allmodconfig: include/linux/omap-dma.h:171:8: error: expected identifier before numeric constant make[4]: *** [drivers/mmc/host/omap_hsmmc.o] Error 1 Cc: Russell King - ARM Linux Cc: Tony Lindgren Signed-off-by: Balaji T K Acked-by: Tony Lindgren Signed-off-by: Ulf Hansson Signed-off-by: Chris Ball --- drivers/mmc/host/omap_hsmmc.c | 2 +- include/linux/omap-dma.h | 19 +------------------ include/linux/omap-dmaengine.h | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 19 deletions(-) create mode 100644 include/linux/omap-dmaengine.h (limited to 'include/linux') diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index cba71d69a79c..6b7b75585926 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -31,7 +31,7 @@ #include #include #include -#include <linux/omap-dma.h> +#include <linux/omap-dmaengine.h> #include #include #include diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 41a13e70f41f..999f52d3d1e7 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -1,23 +1,6 @@ -/* - * OMAP DMA Engine support - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ #ifndef __LINUX_OMAP_DMA_H #define __LINUX_OMAP_DMA_H - -struct dma_chan; - -#if defined(CONFIG_DMA_OMAP) || defined(CONFIG_DMA_OMAP_MODULE) -bool omap_dma_filter_fn(struct dma_chan *, void *); -#else -static inline bool omap_dma_filter_fn(struct dma_chan *c, void *d) -{ - return false; -} -#endif +#include <linux/omap-dmaengine.h> /* * Legacy OMAP DMA handling defines and functions diff --git a/include/linux/omap-dmaengine.h b/include/linux/omap-dmaengine.h new file mode 100644 index 000000000000..2b0b6aa01922 --- /dev/null +++ b/include/linux/omap-dmaengine.h @@ -0,0 +1,21 @@ +/* + * OMAP DMA Engine support + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef __LINUX_OMAP_DMAENGINE_H +#define __LINUX_OMAP_DMAENGINE_H + +struct dma_chan; + +#if defined(CONFIG_DMA_OMAP) || defined(CONFIG_DMA_OMAP_MODULE) +bool omap_dma_filter_fn(struct dma_chan *, void *); +#else +static inline bool omap_dma_filter_fn(struct dma_chan *c, void *d) +{ + return false; +} +#endif +#endif /* __LINUX_OMAP_DMAENGINE_H */ -- cgit v1.2.3-71-gd317 From 73e4354444eef5251e5cdfd388ab02ef9f2e727e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 22 May 2014 16:42:41 +0800 Subject: workqueue: declare system_highpri_wq system_highpri_wq is exported to modules via EXPORT_SYMBOL_GPL(), but it was never declared in workqueue.h. So we add the declaration and a short description for it. tj: Minor comment tweak. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index d93d28b2ec73..b263b29bd98b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -340,6 +340,9 @@ enum { * short queue flush time. Don't queue works which can run for too * long. * + * system_highpri_wq is similar to system_wq but for work items which + * require WQ_HIGHPRI. + * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * @@ -358,6 +361,7 @@ enum { * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. */ extern struct workqueue_struct *system_wq; +extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; -- cgit v1.2.3-71-gd317 From 79bc251f0e0aea67bc230c530f7fa57f66f9cdf3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 22 May 2014 16:43:44 +0800 Subject: workqueue: remove unused WORK_CPU_END WORK_CPU_END is totally unused since 4e8b22bd1a37 ("workqueue: fix pool ID allocation leakage and remove BUILD_BUG_ON() in init_workqueues"). It should be removed. After it is removed, the comment "special cpu IDs" is no longer precise because only one special CPU ID (WORK_CPU_UNBOUND) is left, so we also change this comment to a description of WORK_CPU_UNBOUND. tj: Minor description and comment tweaks. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b263b29bd98b..b8aee9453f22 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -56,9 +56,8 @@ enum { WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS) - 1, WORK_NO_COLOR = WORK_NR_COLORS, - /* special cpu IDs */ + /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, - WORK_CPU_END = NR_CPUS + 1, /* * Reserve 7 bits off of pwq pointer w/ debugobjects turned off. -- cgit v1.2.3-71-gd317 From cafebac153ae54fd0aba5d4ad28af995532c5375 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 22 May 2014 16:43:56 +0800 Subject: workqueue: remove unused work_clear_pending() In 8930caba3dbd ("workqueue: disable irq while manipulating PENDING"), setting last CPU and clearing PENDING got merged into a single operation (set_work_cpu_and_clear_pending()), which left the internal routine work_clear_pending() unused. tj: Minor description tweak. 
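With system_highpri_wq now declared (the first workqueue patch above), module code can queue WQ_HIGHPRI work items without allocating a dedicated workqueue. A minimal usage sketch with hypothetical names, not taken from these patches:

#include <linux/workqueue.h>
#include <linux/printk.h>

static void my_highpri_fn(struct work_struct *work)
{
        pr_info("ran on the high-priority system workqueue\n");
}

static DECLARE_WORK(my_highpri_work, my_highpri_fn);

static void my_kick(void)
{
        /* hypothetical trigger; any context allowed to queue work will do */
        queue_work(system_highpri_wq, &my_highpri_work);
}

As the new header comment states, system_highpri_wq has the same short-work semantics as system_wq, only at WQ_HIGHPRI.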
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b8aee9453f22..a0cc2e95ed1b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -273,13 +273,6 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define delayed_work_pending(w) \ work_pending(&(w)->work) -/** - * work_clear_pending - for internal use only, mark a work item as not pending - * @work: The work item in question - */ -#define work_clear_pending(work) \ - clear_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) - /* * Workqueue flags and constants. For details, please refer to * Documentation/workqueue.txt. -- cgit v1.2.3-71-gd317 From ca8a22634381537c92b5a10308652e1c38fd9edf Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 22 May 2014 10:41:08 -0400 Subject: tcp: make cwnd-limited checks measurement-based, and gentler Experience with the recent e114a710aa50 ("tcp: fix cwnd limited checking to improve congestion control") has shown that there are common cases where that commit can cause cwnd to be much larger than necessary. This leads to TSO autosizing cooking skbs that are too large, among other things. The main problems seemed to be: (1) That commit attempted to predict the future behavior of the connection by looking at the write queue (if TSO or TSQ limit sending). That prediction sometimes overestimated future outstanding packets. (2) That commit always allowed cwnd to grow to twice the number of outstanding packets (even in congestion avoidance, where this is not needed). This commit improves both of these, by: (1) Switching to a measurement-based approach where we explicitly track the largest number of packets in flight during the past window ("max_packets_out"), and remember whether we were cwnd-limited at the moment we finished sending that flight. (2) Only allowing cwnd to grow to twice the number of outstanding packets ("max_packets_out") in slow start. In congestion avoidance mode we now only allow cwnd to grow if it was fully utilized. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 ++++-- include/net/tcp.h | 11 ++++++++--- net/ipv4/tcp_output.c | 37 +++++++++++++++++++++++-------------- 3 files changed, 35 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bc35e4709e8e..a0513210798f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -197,7 +197,8 @@ struct tcp_sock { u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ - syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ @@ -209,6 +210,8 @@ struct tcp_sock { u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ + u32 max_packets_out; /* max packets_out in last window */ + u32 max_packets_seq; /* right edge of max_packets_out flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. 
*/ @@ -230,7 +233,6 @@ struct tcp_sock { u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; - u32 lsnd_pending; /* packets inflight or unsent since last xmit */ u32 prior_cwnd; /* Congestion window at start of Recovery. */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index f5d6ca4a9d28..e80abe4486cb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -971,8 +971,9 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless - * it was fully used previously. But we allow cwnd to grow as long as the - * application has used half the cwnd. + * it was fully used previously. And that's exactly what we do in + * congestion avoidance mode. But in slow start we allow cwnd to grow + * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. @@ -985,7 +986,11 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_cwnd < 2 * tp->lsnd_pending; + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return tp->is_cwnd_limited; } static inline void tcp_check_probe_timer(struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3d61c52bdf79..d463c35db33d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1402,11 +1402,19 @@ static void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs) +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); - tp->lsnd_pending = tp->packets_out + unsent_segs; + /* Track the maximum number of outstanding packets in each + * window, and remember whether we were cwnd-limited then. + */ + if (!before(tp->snd_una, tp->max_packets_seq) || + tp->packets_out > tp->max_packets_out) { + tp->max_packets_out = tp->packets_out; + tp->max_packets_seq = tp->snd_nxt; + tp->is_cwnd_limited = is_cwnd_limited; + } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ @@ -1660,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. 
*/ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + bool *is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1724,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (!tp->tso_deferred) tp->tso_deferred = 1 | (jiffies << 1); + if (cong_win < send_win && cong_win < skb->len) + *is_cwnd_limited = true; + return true; send_now: @@ -1881,9 +1893,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, sent_pkts, unsent_segs = 0; + unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + bool is_cwnd_limited = false; sent_pkts = 0; @@ -1908,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { + is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -1924,8 +1938,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) - goto compute_unsent_segs; + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + break; } /* TCP Small Queues : @@ -1950,14 +1965,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * there is no smp_mb__after_set_bit() yet */ smp_mb__after_clear_bit(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) { - u32 unsent_bytes; - -compute_unsent_segs: - unsent_bytes = tp->write_seq - tp->snd_nxt; - unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now); + if (atomic_read(&sk->sk_wmem_alloc) > limit) break; - } } limit = mss_now; @@ -1997,7 +2006,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); - tcp_cwnd_validate(sk, unsent_segs); + tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); -- cgit v1.2.3-71-gd317 From da08143b85203b581f4a6461b149186b0e9592df Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Tue, 20 May 2014 08:29:25 +0200 Subject: vlan: more careful checksum features handling When combining real_dev's features and vlan_features, simple bitwise AND is used. This doesn't work well for checksum offloading features as if one set has NETIF_F_HW_CSUM and the other NETIF_F_IP_CSUM and/or NETIF_F_IPV6_CSUM, we end up with no checksum offloading. However, from the logical point of view (how can_checksum_protocol() works), NETIF_F_HW_CSUM contains the functionality of NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM so that the result should be IP/IPV6. Add helper function netdev_intersect_features() implementing this logic and use it in vlan_dev_fix_features(). Signed-off-by: Michal Kubecek Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 14 ++++++++++++++ net/8021q/vlan_dev.c | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2dea98cbbdba..f4ad247fd324 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3153,6 +3153,20 @@ const char *netdev_drivername(const struct net_device *dev); void linkwatch_run_queue(void); +static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, + netdev_features_t f2) +{ + if (f1 & NETIF_F_GEN_CSUM) + f1 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + if (f2 & NETIF_F_GEN_CSUM) + f2 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 &= f2; + if (f1 & NETIF_F_GEN_CSUM) + f1 &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + + return f1; +} + static inline netdev_features_t netdev_get_wanted_features( struct net_device *dev) { diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 8f025afa29fd..4181fb71ba77 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -678,9 +678,9 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; - features &= real_dev->vlan_features; + features = netdev_intersect_features(features, real_dev->vlan_features); features |= NETIF_F_RXCSUM; - features &= real_dev->features; + features = netdev_intersect_features(features, real_dev->features); features |= old_features & NETIF_F_SOFT_FEATURES; features |= NETIF_F_LLTX; -- cgit v1.2.3-71-gd317 From ea3429c77d4e34cb2983b90e49a5506fedf70b98 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 22 May 2014 09:47:50 -0700 Subject: of: mdio: remove of_phy_connect_fixed_link All in-tree drivers have been converted to use the new pair of functions: of_is_fixed_phy_link() plus of_phy_register_fixed_link(), we can now safely remove of_phy_connect_fixed_link. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 38 -------------------------------------- include/linux/of_mdio.h | 10 ---------- 2 files changed, 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 1def0bb5cb37..4c1e01ed16dc 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -245,44 +245,6 @@ struct phy_device *of_phy_connect(struct net_device *dev, } EXPORT_SYMBOL(of_phy_connect); -/** - * of_phy_connect_fixed_link - Parse fixed-link property and return a dummy phy - * @dev: pointer to net_device claiming the phy - * @hndlr: Link state callback for the network device - * @iface: PHY data interface type - * - * This function is a temporary stop-gap and will be removed soon. It is - * only to support the fs_enet, ucc_geth and gianfar Ethernet drivers. Do - * not call this function from new drivers. - */ -struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, - void (*hndlr)(struct net_device *), - phy_interface_t iface) -{ - struct device_node *net_np; - char bus_id[MII_BUS_ID_SIZE + 3]; - struct phy_device *phy; - const __be32 *phy_id; - int sz; - - if (!dev->dev.parent) - return NULL; - - net_np = dev->dev.parent->of_node; - if (!net_np) - return NULL; - - phy_id = of_get_property(net_np, "fixed-link", &sz); - if (!phy_id || sz < sizeof(*phy_id)) - return NULL; - - sprintf(bus_id, PHY_ID_FMT, "fixed-0", be32_to_cpu(phy_id[0])); - - phy = phy_connect(dev, bus_id, hndlr, iface); - return IS_ERR(phy) ? 
NULL : phy; -} -EXPORT_SYMBOL(of_phy_connect_fixed_link); - /** * of_phy_attach - Attach to a PHY without starting the state machine * @dev: pointer to net_device claiming the phy diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 0aa367e316cb..d449018d0726 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -22,9 +22,6 @@ extern struct phy_device *of_phy_connect(struct net_device *dev, struct phy_device *of_phy_attach(struct net_device *dev, struct device_node *phy_np, u32 flags, phy_interface_t iface); -extern struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, - void (*hndlr)(struct net_device *), - phy_interface_t iface); extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); @@ -59,13 +56,6 @@ static inline struct phy_device *of_phy_attach(struct net_device *dev, return NULL; } -static inline struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, - void (*hndlr)(struct net_device *), - phy_interface_t iface) -{ - return NULL; -} - static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np) { return NULL; -- cgit v1.2.3-71-gd317 From 16e4d93f6de7063800f3f5e68f064b0ff8fae9b7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 May 2014 13:40:22 -0400 Subject: NFSD: Ignore client's source port on RDMA transports An NFS/RDMA client's source port is meaningless for RDMA transports. The transport layer typically sets the source port value on the connection to a random ephemeral port. Currently, NFS server administrators must specify the "insecure" export option to enable clients to access exports via RDMA. But this means NFS clients can access such an export via IP using an ephemeral port, which may not be desirable. This patch eliminates the need to specify the "insecure" export option to allow NFS/RDMA clients access to an export. BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=250 Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc_xprt.c | 2 +- net/sunrpc/svcsock.c | 9 +++++++++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 7 +++++++ 4 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index b05963f09ebf..0cec1b94c670 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -24,6 +24,7 @@ struct svc_xprt_ops { void (*xpo_release_rqst)(struct svc_rqst *); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); + int (*xpo_secure_port)(struct svc_rqst *); }; struct svc_xprt_class { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 06c6ff0cb911..614956f1777e 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -793,7 +793,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) clear_bit(XPT_OLD, &xprt->xpt_flags); - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp); rqstp->rq_chandle.defer = svc_defer; if (serv->sv_stats) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 43bcb4699d69..0cb34f5d58dc 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -400,6 +400,12 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, release_sock(sock->sk); #endif } + +static int svc_sock_secure_port(struct svc_rqst *rqstp) +{ + return svc_port_is_privileged(svc_addr(rqstp)); +} + /* * INET callback when data has been received on the socket. 
*/ @@ -678,6 +684,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_udp_class = { @@ -1234,6 +1241,7 @@ static struct svc_xprt_ops svc_tcp_bc_ops = { .xpo_detach = svc_bc_tcp_sock_detach, .xpo_free = svc_bc_sock_free, .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_bc_class = { @@ -1272,6 +1280,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_class = { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 25688fa2207f..02db8d9cc994 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -65,6 +65,7 @@ static void dto_tasklet_func(unsigned long data); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static int svc_rdma_secure_port(struct svc_rqst *); static void rq_cq_reap(struct svcxprt_rdma *xprt); static void sq_cq_reap(struct svcxprt_rdma *xprt); @@ -82,6 +83,7 @@ static struct svc_xprt_ops svc_rdma_ops = { .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, }; struct svc_xprt_class svc_rdma_class = { @@ -1207,6 +1209,11 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +static int svc_rdma_secure_port(struct svc_rqst *rqstp) +{ + return 1; +} + /* * Attempt to register the kvec representing the RPC memory with the * device. -- cgit v1.2.3-71-gd317 From ef11ce24875a8a540adc185e7bce3d7d49c8296f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 12 May 2014 11:22:47 +1000 Subject: SUNRPC: track whether a request is coming from a loop-back interface. If an incoming NFS request is coming from the local host, then nfsd will need to perform some special handling. So detect that possibility and make the source visible in rq_local. Signed-off-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc.h | 1 + include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/sunrpc.h | 13 +++++++++++++ net/sunrpc/svcsock.c | 5 +++++ 4 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 04e763221246..a0dbbd1e00e9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -254,6 +254,7 @@ struct svc_rqst { u32 rq_prot; /* IP protocol */ unsigned short rq_secure : 1; /* secure port */ + unsigned short rq_local : 1; /* local request */ void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0cec1b94c670..7235040a19b2 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -64,6 +64,7 @@ struct svc_xprt { #define XPT_DETACHED 10 /* detached from tempsocks list */ #define XPT_LISTENER 11 /* listening endpoint */ #define XPT_CACHE_AUTH 12 /* cache auth info */ +#define XPT_LOCAL 13 /* connection from loopback interface */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index 14c9f6d1c5ff..f2b7cb540e61 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -43,6 +43,19 @@ static inline int rpc_reply_expected(struct rpc_task *task) (task->tk_msg.rpc_proc->p_decode != NULL); } +static inline int sock_is_loopback(struct sock *sk) +{ + struct dst_entry *dst; + int loopback = 0; + rcu_read_lock(); + dst = rcu_dereference(sk->sk_dst_cache); + if (dst && dst->dev && + (dst->dev->features & NETIF_F_LOOPBACK)) + loopback = 1; + rcu_read_unlock(); + return loopback; +} + int svc_send_common(struct socket *sock, struct xdr_buf *xdr, struct page *headpage, unsigned long headoffset, struct page *tailpage, unsigned long tailoffset); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0cb34f5d58dc..f3b8eb309d01 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -874,6 +874,10 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); + if (sock_is_loopback(newsock->sk)) + set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); + else + clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); if (serv->sv_stats) serv->sv_stats->nettcpconn++; @@ -1119,6 +1123,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; + rqstp->rq_local = !!test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; -- cgit v1.2.3-71-gd317 From ecc8fb11cdb37d108d4597ba0f6bdff77c6019af Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 22 May 2014 15:55:39 +0300 Subject: net/mlx4_core: Deprecate use_prio module parameter use_prio was added as part of an infrastructure for running FCoE in A0 mode. FCoE didn't get into Mellanox Upstream driver, and when it will, it won't be using A0 steering mode. Therefore we can safely deprecate this module parameter without hurting any existing user. CC: Carol Soto Signed-off-by: Amir Vadai Signed-off-by: David S. 
--- drivers/net/ethernet/mellanox/mlx4/main.c | 8 ++++---- include/linux/mlx4/device.h | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index a56f6012258d..08ff5dd9298f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -132,8 +132,7 @@ MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)"); static bool use_prio; module_param_named(use_prio, use_prio, bool, 0444); -MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports " - "(0/1, default 0)"); +MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports (deprecated)"); int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); @@ -290,7 +289,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.log_num_macs = log_num_mac; dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; - dev->caps.log_num_prios = use_prio ? 3 : 0; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; @@ -358,7 +356,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] = (1 << dev->caps.log_num_macs) * (1 << dev->caps.log_num_vlans) * - (1 << dev->caps.log_num_prios) * dev->caps.num_ports; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; @@ -2775,6 +2772,9 @@ static int __init mlx4_verify_params(void) pr_warning("mlx4_core: log_num_vlan - obsolete module param, using %d\n", MLX4_LOG_NUM_VLANS); + if (use_prio != 0) + pr_warn("mlx4_core: use_prio - obsolete module param, ignored\n"); + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); return -1; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c0468e6f0442..ca38871a585c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -449,7 +449,6 @@ struct mlx4_caps { int reserved_qps_base[MLX4_NUM_QP_REGION]; int log_num_macs; int log_num_vlans; - int log_num_prios; enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; u8 supported_type[MLX4_MAX_PORTS + 1]; u8 suggested_type[MLX4_MAX_PORTS + 1]; -- cgit v1.2.3-71-gd317 From ed616689a3d95eb6c9bdbb1ef74b0f50cbdf276a Mon Sep 17 00:00:00 2001 From: Sucheta Chakraborty Date: Thu, 22 May 2014 09:59:05 -0400 Subject: net-next:v4: Add support to configure SR-IOV VF minimum and maximum Tx rate through ip tool. o min_tx_rate puts a lower limit on the VF bandwidth. The VF is guaranteed a bandwidth of at least this value. max_tx_rate puts a cap on the VF bandwidth. The VF can have a bandwidth of up to this value. o A new handler set_vf_rate for attr IFLA_VF_RATE has been introduced which takes 4 arguments: netdev, VF number, min_tx_rate, max_tx_rate o ndo_set_vf_rate replaces the ndo_set_vf_tx_rate handler. o Drivers that currently implement ndo_set_vf_tx_rate should now call ndo_set_vf_rate instead and reject attempts to set a minimum bandwidth greater than 0 for IFLA_VF_TX_RATE when IFLA_VF_RATE is not yet implemented by the driver. o If the user enters only one of min_tx_rate or max_tx_rate, then userland should read back the other value from the driver and set both for IFLA_VF_RATE. Drivers that have not yet implemented IFLA_VF_RATE should always return min_tx_rate as 0 when read via the ip tool.
o If both IFLA_VF_TX_RATE and IFLA_VF_RATE options are specified, then IFLA_VF_RATE should override. o Idea is to have consistent display of rate values to user. o Usage example: - ./ip link set p4p1 vf 0 rate 900 ./ip link show p4p1 32: p4p1: mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 link/ether 00:0e:1e:08:b0:f0 brd ff:ff:ff:ff:ff:ff vf 0 MAC 3e:a0:ca:bd:ae:5a, tx rate 900 (Mbps), max_tx_rate 900Mbps vf 1 MAC f6:c6:7c:3f:3d:6c vf 2 MAC 56:32:43:98:d7:71 vf 3 MAC d6:be:c3:b5:85:ff vf 4 MAC ee:a9:9a:1e:19:14 vf 5 MAC 4a:d0:4c:07:52:18 vf 6 MAC 3a:76:44:93:62:f9 vf 7 MAC 82:e9:e7:e3:15:1a ./ip link set p4p1 vf 0 max_tx_rate 300 min_tx_rate 200 ./ip link show p4p1 32: p4p1: mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 link/ether 00:0e:1e:08:b0:f0 brd ff:ff:ff:ff:ff:ff vf 0 MAC 3e:a0:ca:bd:ae:5a, tx rate 300 (Mbps), max_tx_rate 300Mbps, min_tx_rate 200Mbps vf 1 MAC f6:c6:7c:3f:3d:6c vf 2 MAC 56:32:43:98:d7:71 vf 3 MAC d6:be:c3:b5:85:ff vf 4 MAC ee:a9:9a:1e:19:14 vf 5 MAC 4a:d0:4c:07:52:18 vf 6 MAC 3a:76:44:93:62:f9 vf 7 MAC 82:e9:e7:e3:15:1a ./ip link set p4p1 vf 0 max_tx_rate 600 rate 300 ./ip link show p4p1 32: p4p1: mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 link/ether 00:0e:1e:08:b0:f brd ff:ff:ff:ff:ff:ff vf 0 MAC 3e:a0:ca:bd:ae:5, tx rate 600 (Mbps), max_tx_rate 600Mbps, min_tx_rate 200Mbps vf 1 MAC f6:c6:7c:3f:3d:6c vf 2 MAC 56:32:43:98:d7:71 vf 3 MAC d6:be:c3:b5:85:ff vf 4 MAC ee:a9:9a:1e:19:14 vf 5 MAC 4a:d0:4c:07:52:18 vf 6 MAC 3a:76:44:93:62:f9 vf 7 MAC 82:e9:e7:e3:15:1a Signed-off-by: Sucheta Chakraborty Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 3 +- drivers/net/ethernet/emulex/benet/be_main.c | 21 +++++---- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 +++++++---- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 3 +- drivers/net/ethernet/intel/igb/igb_main.c | 20 ++++++--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 13 ++++-- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 3 +- drivers/net/ethernet/mellanox/mlx4/cmd.c | 11 ++--- drivers/net/ethernet/qlogic/qlcnic/qlcnic.h | 1 + drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h | 2 +- .../ethernet/qlogic/qlcnic/qlcnic_sriov_common.c | 1 + .../net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c | 52 +++++++++++++++------- drivers/net/ethernet/sfc/siena_sriov.c | 3 +- include/linux/if_link.h | 3 +- include/linux/netdevice.h | 8 ++-- include/uapi/linux/if_link.h | 9 +++- net/core/rtnetlink.c | 38 +++++++++++++--- 20 files changed, 156 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 81cc2d9831c2..8d0479d5be8e 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -2576,7 +2576,8 @@ int bnx2x_get_vf_config(struct net_device *dev, int vfidx, ivi->vf = vfidx; ivi->qos = 0; - ivi->tx_rate = 10000; /* always 10G. TBA take from link struct */ + ivi->max_tx_rate = 10000; /* always 10G. 
TBA take from link struct */ + ivi->min_tx_rate = 0; ivi->spoofchk = 1; /*always enabled */ if (vf->state == VF_ENABLED) { /* mac and vlan are in vlan_mac objects */ diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index dcc5e5c69743..4693d004a223 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1302,7 +1302,8 @@ static int be_get_vf_config(struct net_device *netdev, int vf, return -EINVAL; vi->vf = vf; - vi->tx_rate = vf_cfg->tx_rate; + vi->max_tx_rate = vf_cfg->tx_rate; + vi->min_tx_rate = 0; vi->vlan = vf_cfg->vlan_tag & VLAN_VID_MASK; vi->qos = vf_cfg->vlan_tag >> VLAN_PRIO_SHIFT; memcpy(&vi->mac, vf_cfg->mac_addr, ETH_ALEN); @@ -1342,7 +1343,8 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) return status; } -static int be_set_vf_tx_rate(struct net_device *netdev, int vf, int rate) +static int be_set_vf_tx_rate(struct net_device *netdev, int vf, + int min_tx_rate, int max_tx_rate) { struct be_adapter *adapter = netdev_priv(netdev); int status = 0; @@ -1353,18 +1355,21 @@ static int be_set_vf_tx_rate(struct net_device *netdev, int vf, int rate) if (vf >= adapter->num_vfs) return -EINVAL; - if (rate < 100 || rate > 10000) { + if (min_tx_rate) + return -EINVAL; + + if (max_tx_rate < 100 || max_tx_rate > 10000) { dev_err(&adapter->pdev->dev, - "tx rate must be between 100 and 10000 Mbps\n"); + "max tx rate must be between 100 and 10000 Mbps\n"); return -EINVAL; } - status = be_cmd_config_qos(adapter, rate / 10, vf + 1); + status = be_cmd_config_qos(adapter, max_tx_rate / 10, vf + 1); if (status) dev_err(&adapter->pdev->dev, - "tx rate %d on VF %d failed\n", rate, vf); + "max tx rate %d on VF %d failed\n", max_tx_rate, vf); else - adapter->vf_cfg[vf].tx_rate = rate; + adapter->vf_cfg[vf].tx_rate = max_tx_rate; return status; } static int be_set_vf_link_state(struct net_device *netdev, int vf, @@ -4257,7 +4262,7 @@ static const struct net_device_ops be_netdev_ops = { .ndo_vlan_rx_kill_vid = be_vlan_rem_vid, .ndo_set_vf_mac = be_set_vf_mac, .ndo_set_vf_vlan = be_set_vf_vlan, - .ndo_set_vf_tx_rate = be_set_vf_tx_rate, + .ndo_set_vf_rate = be_set_vf_tx_rate, .ndo_get_vf_config = be_get_vf_config, .ndo_set_vf_link_state = be_set_vf_link_state, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index e0e5c6a867b1..96f7fabd8758 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6724,7 +6724,7 @@ static const struct net_device_ops i40e_netdev_ops = { .ndo_set_features = i40e_set_features, .ndo_set_vf_mac = i40e_ndo_set_vf_mac, .ndo_set_vf_vlan = i40e_ndo_set_vf_port_vlan, - .ndo_set_vf_tx_rate = i40e_ndo_set_vf_bw, + .ndo_set_vf_rate = i40e_ndo_set_vf_bw, .ndo_get_vf_config = i40e_ndo_get_vf_config, .ndo_set_vf_link_state = i40e_ndo_set_vf_link_state, #ifdef CONFIG_I40E_VXLAN diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4d219566a04d..8564b0939dc4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2205,7 +2205,8 @@ error_pvid: * * configure vf tx rate **/ -int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int tx_rate) +int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate) { struct i40e_netdev_priv *np = 
netdev_priv(netdev); struct i40e_pf *pf = np->vsi->back; @@ -2221,6 +2222,12 @@ int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int tx_rate) goto error; } + if (min_tx_rate) { + dev_err(&pf->pdev->dev, "Invalid min tx rate (%d) (greater than 0) specified for vf %d.\n", + min_tx_rate, vf_id); + return -EINVAL; + } + vf = &(pf->vf[vf_id]); vsi = pf->vsi[vf->lan_vsi_index]; if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states)) { @@ -2243,23 +2250,23 @@ int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int tx_rate) break; } - if (tx_rate > speed) { - dev_err(&pf->pdev->dev, "Invalid tx rate %d specified for vf %d.", - tx_rate, vf->vf_id); + if (max_tx_rate > speed) { + dev_err(&pf->pdev->dev, "Invalid max tx rate %d specified for vf %d.", + max_tx_rate, vf->vf_id); ret = -EINVAL; goto error; } /* Tx rate credits are in values of 50Mbps, 0 is disabled*/ - ret = i40e_aq_config_vsi_bw_limit(&pf->hw, vsi->seid, tx_rate / 50, 0, - NULL); + ret = i40e_aq_config_vsi_bw_limit(&pf->hw, vsi->seid, max_tx_rate / 50, + 0, NULL); if (ret) { - dev_err(&pf->pdev->dev, "Unable to set tx rate, error code %d.\n", + dev_err(&pf->pdev->dev, "Unable to set max tx rate, error code %d.\n", ret); ret = -EIO; goto error; } - vf->tx_rate = tx_rate; + vf->tx_rate = max_tx_rate; error: return ret; } @@ -2301,7 +2308,8 @@ int i40e_ndo_get_vf_config(struct net_device *netdev, memcpy(&ivi->mac, vf->default_lan_addr.addr, ETH_ALEN); - ivi->tx_rate = vf->tx_rate; + ivi->max_tx_rate = vf->tx_rate; + ivi->min_tx_rate = 0; ivi->vlan = le16_to_cpu(vsi->info.pvid) & I40E_VLAN_MASK; ivi->qos = (le16_to_cpu(vsi->info.pvid) & I40E_PRIORITY_MASK) >> I40E_VLAN_PRIORITY_SHIFT; diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index ba3d1f8414be..5a559be4ba2c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -116,7 +116,8 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf); int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos); -int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int tx_rate); +int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate); int i40e_ndo_get_vf_config(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi); int i40e_ndo_set_vf_link_state(struct net_device *netdev, int vf_id, int link); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index bfcda8a455f4..1075b3f8c415 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -169,7 +169,7 @@ static void igb_restore_vf_multicasts(struct igb_adapter *adapter); static int igb_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac); static int igb_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); -static int igb_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate); +static int igb_ndo_set_vf_bw(struct net_device *, int, int, int); static int igb_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); static int igb_ndo_get_vf_config(struct net_device *netdev, int vf, @@ -2084,7 +2084,7 @@ static const struct net_device_ops igb_netdev_ops = { .ndo_vlan_rx_kill_vid = igb_vlan_rx_kill_vid, .ndo_set_vf_mac = igb_ndo_set_vf_mac, .ndo_set_vf_vlan = igb_ndo_set_vf_vlan, - .ndo_set_vf_tx_rate = igb_ndo_set_vf_bw, + 
.ndo_set_vf_rate = igb_ndo_set_vf_bw, .ndo_set_vf_spoofchk = igb_ndo_set_vf_spoofchk, .ndo_get_vf_config = igb_ndo_get_vf_config, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -7879,7 +7879,8 @@ static void igb_check_vf_rate_limit(struct igb_adapter *adapter) } } -static int igb_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate) +static int igb_ndo_set_vf_bw(struct net_device *netdev, int vf, + int min_tx_rate, int max_tx_rate) { struct igb_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; @@ -7888,15 +7889,19 @@ static int igb_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate) if (hw->mac.type != e1000_82576) return -EOPNOTSUPP; + if (min_tx_rate) + return -EINVAL; + actual_link_speed = igb_link_mbps(adapter->link_speed); if ((vf >= adapter->vfs_allocated_count) || (!(rd32(E1000_STATUS) & E1000_STATUS_LU)) || - (tx_rate < 0) || (tx_rate > actual_link_speed)) + (max_tx_rate < 0) || + (max_tx_rate > actual_link_speed)) return -EINVAL; adapter->vf_rate_link_speed = actual_link_speed; - adapter->vf_data[vf].tx_rate = (u16)tx_rate; - igb_set_vf_rate_limit(hw, vf, tx_rate, actual_link_speed); + adapter->vf_data[vf].tx_rate = (u16)max_tx_rate; + igb_set_vf_rate_limit(hw, vf, max_tx_rate, actual_link_speed); return 0; } @@ -7936,7 +7941,8 @@ static int igb_ndo_get_vf_config(struct net_device *netdev, return -EINVAL; ivi->vf = vf; memcpy(&ivi->mac, adapter->vf_data[vf].vf_mac_addresses, ETH_ALEN); - ivi->tx_rate = adapter->vf_data[vf].tx_rate; + ivi->max_tx_rate = adapter->vf_data[vf].tx_rate; + ivi->min_tx_rate = 0; ivi->vlan = adapter->vf_data[vf].pf_vlan; ivi->qos = adapter->vf_data[vf].pf_qos; ivi->spoofchk = adapter->vf_data[vf].spoofchk_enabled; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 8089ea9f2fba..a5332389620a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7926,7 +7926,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_do_ioctl = ixgbe_ioctl, .ndo_set_vf_mac = ixgbe_ndo_set_vf_mac, .ndo_set_vf_vlan = ixgbe_ndo_set_vf_vlan, - .ndo_set_vf_tx_rate = ixgbe_ndo_set_vf_bw, + .ndo_set_vf_rate = ixgbe_ndo_set_vf_bw, .ndo_set_vf_spoofchk = ixgbe_ndo_set_vf_spoofchk, .ndo_get_vf_config = ixgbe_ndo_get_vf_config, .ndo_get_stats64 = ixgbe_get_stats64, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index a01417c06620..3248e208c9dc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1222,7 +1222,8 @@ void ixgbe_check_vf_rate_limit(struct ixgbe_adapter *adapter) } } -int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate) +int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, + int max_tx_rate) { struct ixgbe_adapter *adapter = netdev_priv(netdev); int link_speed; @@ -1240,13 +1241,16 @@ int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate) if (link_speed != 10000) return -EINVAL; + if (min_tx_rate) + return -EINVAL; + /* rate limit cannot be less than 10Mbs or greater than link speed */ - if (tx_rate && ((tx_rate <= 10) || (tx_rate > link_speed))) + if (max_tx_rate && ((max_tx_rate <= 10) || (max_tx_rate > link_speed))) return -EINVAL; /* store values */ adapter->vf_rate_link_speed = link_speed; - adapter->vfinfo[vf].tx_rate = tx_rate; + adapter->vfinfo[vf].tx_rate = max_tx_rate; /* update hardware configuration */ 
ixgbe_set_vf_rate_limit(adapter, vf); @@ -1288,7 +1292,8 @@ int ixgbe_ndo_get_vf_config(struct net_device *netdev, return -EINVAL; ivi->vf = vf; memcpy(&ivi->mac, adapter->vfinfo[vf].vf_mac_addresses, ETH_ALEN); - ivi->tx_rate = adapter->vfinfo[vf].tx_rate; + ivi->max_tx_rate = adapter->vfinfo[vf].tx_rate; + ivi->min_tx_rate = 0; ivi->vlan = adapter->vfinfo[vf].pf_vlan; ivi->qos = adapter->vfinfo[vf].pf_qos; ivi->spoofchk = adapter->vfinfo[vf].spoofchk_enabled; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h index cea640147604..32c26d586c01 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h @@ -44,7 +44,8 @@ void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac); int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan, u8 qos); -int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int tx_rate); +int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, + int max_tx_rate); int ixgbe_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); int ixgbe_ndo_get_vf_config(struct net_device *netdev, int vf, struct ifla_vf_info *ivi); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 59c7fd406805..ca8e7cb5a8e4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2486,11 +2486,12 @@ int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_in ivf->mac[4] = ((s_info->mac >> (1*8)) & 0xff); ivf->mac[5] = ((s_info->mac) & 0xff); - ivf->vlan = s_info->default_vlan; - ivf->qos = s_info->default_qos; - ivf->tx_rate = s_info->tx_rate; - ivf->spoofchk = s_info->spoofchk; - ivf->linkstate = s_info->link_state; + ivf->vlan = s_info->default_vlan; + ivf->qos = s_info->default_qos; + ivf->max_tx_rate = s_info->tx_rate; + ivf->min_tx_rate = 0; + ivf->spoofchk = s_info->spoofchk; + ivf->linkstate = s_info->link_state; return 0; } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h index 6e7527e2b595..41abe6070466 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h @@ -1319,6 +1319,7 @@ struct qlcnic_eswitch { #define QL_STATUS_INVALID_PARAM -1 #define MAX_BW 100 /* % of link speed */ +#define MIN_BW 1 /* % of link speed */ #define MAX_VLAN_ID 4095 #define MIN_VLAN_ID 2 #define DEFAULT_MAC_LEARN 1 diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index f0a285359e66..f06ba90b4282 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -525,7 +525,7 @@ static const struct net_device_ops qlcnic_netdev_ops = { #endif #ifdef CONFIG_QLCNIC_SRIOV .ndo_set_vf_mac = qlcnic_sriov_set_vf_mac, - .ndo_set_vf_tx_rate = qlcnic_sriov_set_vf_tx_rate, + .ndo_set_vf_rate = qlcnic_sriov_set_vf_tx_rate, .ndo_get_vf_config = qlcnic_sriov_get_vf_config, .ndo_set_vf_vlan = qlcnic_sriov_set_vf_vlan, .ndo_set_vf_spoofchk = qlcnic_sriov_set_vf_spoofchk, diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h index 335b50f7bd3e..4677b2edccca 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h @@ -233,7 +233,7 @@ bool 
qlcnic_sriov_soft_flr_check(struct qlcnic_adapter *, void qlcnic_sriov_pf_reset(struct qlcnic_adapter *); int qlcnic_sriov_pf_reinit(struct qlcnic_adapter *); int qlcnic_sriov_set_vf_mac(struct net_device *, int, u8 *); -int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int); +int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int, int); int qlcnic_sriov_get_vf_config(struct net_device *, int , struct ifla_vf_info *); int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c index 498fa6350c8d..2bdd9deffb38 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c @@ -201,6 +201,7 @@ int qlcnic_sriov_init(struct qlcnic_adapter *adapter, int num_vfs) sriov->vf_info[i].vp = vp; vp->vlan_mode = QLC_GUEST_VLAN_MODE; vp->max_tx_bw = MAX_BW; + vp->min_tx_bw = MIN_BW; vp->spoofchk = false; random_ether_addr(vp->mac); dev_info(&adapter->pdev->dev, diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c index 6d2f72f114f2..a29538b86edf 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -1848,7 +1848,8 @@ int qlcnic_sriov_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) return 0; } -int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, int tx_rate) +int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, + int min_tx_rate, int max_tx_rate) { struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_sriov *sriov = adapter->ahw->sriov; @@ -1863,35 +1864,52 @@ int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, int tx_rate) if (vf >= sriov->num_vfs) return -EINVAL; - if (tx_rate >= 10000 || tx_rate < 100) { + vf_info = &sriov->vf_info[vf]; + vp = vf_info->vp; + vpid = vp->handle; + + if (!min_tx_rate) + min_tx_rate = QLC_VF_MIN_TX_RATE; + + if (max_tx_rate && + (max_tx_rate >= 10000 || max_tx_rate < min_tx_rate)) { netdev_err(netdev, - "Invalid Tx rate, allowed range is [%d - %d]", - QLC_VF_MIN_TX_RATE, QLC_VF_MAX_TX_RATE); + "Invalid max Tx rate, allowed range is [%d - %d]", + min_tx_rate, QLC_VF_MAX_TX_RATE); return -EINVAL; } - if (tx_rate == 0) - tx_rate = 10000; + if (!max_tx_rate) + max_tx_rate = 10000; - vf_info = &sriov->vf_info[vf]; - vp = vf_info->vp; - vpid = vp->handle; + if (min_tx_rate && + (min_tx_rate > max_tx_rate || min_tx_rate < QLC_VF_MIN_TX_RATE)) { + netdev_err(netdev, + "Invalid min Tx rate, allowed range is [%d - %d]", + QLC_VF_MIN_TX_RATE, max_tx_rate); + return -EINVAL; + } if (test_bit(QLC_BC_VF_STATE, &vf_info->state)) { if (qlcnic_sriov_get_vf_vport_info(adapter, &nic_info, vpid)) return -EIO; - nic_info.max_tx_bw = tx_rate / 100; + nic_info.max_tx_bw = max_tx_rate / 100; + nic_info.min_tx_bw = min_tx_rate / 100; nic_info.bit_offsets = BIT_0; if (qlcnic_sriov_pf_set_vport_info(adapter, &nic_info, vpid)) return -EIO; } - vp->max_tx_bw = tx_rate / 100; + vp->max_tx_bw = max_tx_rate / 100; + netdev_info(netdev, + "Setting Max Tx rate %d (Mbps), %d %% of PF bandwidth, for VF %d\n", + max_tx_rate, vp->max_tx_bw, vf); + vp->min_tx_bw = min_tx_rate / 100; netdev_info(netdev, - "Setting Tx rate %d (Mbps), %d %% of PF bandwidth, for VF %d\n", - tx_rate, vp->max_tx_bw, vf); + "Setting Min Tx rate %d (Mbps), %d %% of PF bandwidth, for VF %d\n", + min_tx_rate, 
vp->min_tx_bw, vf); return 0; } @@ -1990,9 +2008,13 @@ int qlcnic_sriov_get_vf_config(struct net_device *netdev, ivi->qos = vp->qos; ivi->spoofchk = vp->spoofchk; if (vp->max_tx_bw == MAX_BW) - ivi->tx_rate = 0; + ivi->max_tx_rate = 0; + else + ivi->max_tx_rate = vp->max_tx_bw * 100; + if (vp->min_tx_bw == MIN_BW) + ivi->min_tx_rate = 0; else - ivi->tx_rate = vp->max_tx_bw * 100; + ivi->min_tx_rate = vp->min_tx_bw * 100; ivi->vf = vf; return 0; diff --git a/drivers/net/ethernet/sfc/siena_sriov.c b/drivers/net/ethernet/sfc/siena_sriov.c index 9a9205e77896..43d2e64546ed 100644 --- a/drivers/net/ethernet/sfc/siena_sriov.c +++ b/drivers/net/ethernet/sfc/siena_sriov.c @@ -1633,7 +1633,8 @@ int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i, ivi->vf = vf_i; ether_addr_copy(ivi->mac, vf->addr.mac_addr); - ivi->tx_rate = 0; + ivi->max_tx_rate = 0; + ivi->min_tx_rate = 0; tci = ntohs(vf->addr.tci); ivi->vlan = tci & VLAN_VID_MASK; ivi->qos = (tci >> VLAN_PRIO_SHIFT) & 0x7; diff --git a/include/linux/if_link.h b/include/linux/if_link.h index a86784dec3d3..119130e9298b 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -10,8 +10,9 @@ struct ifla_vf_info { __u8 mac[32]; __u32 vlan; __u32 qos; - __u32 tx_rate; __u32 spoofchk; __u32 linkstate; + __u32 min_tx_rate; + __u32 max_tx_rate; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f4ad247fd324..9ec3a945caf2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -850,7 +850,8 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos); - * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate); + * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, + * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_config)(struct net_device *dev, * int vf, struct ifla_vf_info *ivf); @@ -1044,8 +1045,9 @@ struct net_device_ops { int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, int queue, u16 vlan, u8 qos); - int (*ndo_set_vf_tx_rate)(struct net_device *dev, - int vf, int rate); + int (*ndo_set_vf_rate)(struct net_device *dev, + int vf, int min_tx_rate, + int max_tx_rate); int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); int (*ndo_get_vf_config)(struct net_device *dev, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9a7f7ace6649..622e7910b8cc 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -399,9 +399,10 @@ enum { IFLA_VF_UNSPEC, IFLA_VF_MAC, /* Hardware queue specific attributes */ IFLA_VF_VLAN, - IFLA_VF_TX_RATE, /* TX Bandwidth Allocation */ + IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ + IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ __IFLA_VF_MAX, }; @@ -423,6 +424,12 @@ struct ifla_vf_tx_rate { __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ }; +struct ifla_vf_rate { + __u32 vf; + __u32 min_tx_rate; /* Min Bandwidth in Mbps */ + __u32 max_tx_rate; /* Max Bandwidth in Mbps */ +}; + struct ifla_vf_spoofchk { __u32 vf; __u32 setting; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9837bebf93ce..d6417464dc66 100644 --- 
a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -767,8 +767,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + - nla_total_size(sizeof(struct ifla_vf_tx_rate)) + - nla_total_size(sizeof(struct ifla_vf_spoofchk))); + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + + nla_total_size(sizeof(struct ifla_vf_rate))); return size; } else return 0; @@ -1034,6 +1034,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_info ivi; struct ifla_vf_mac vf_mac; struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_link_state vf_linkstate; @@ -1054,6 +1055,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, break; vf_mac.vf = vf_vlan.vf = + vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = ivi.vf; @@ -1061,7 +1063,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.tx_rate; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf = nla_nest_start(skb, IFLA_VF_INFO); @@ -1071,6 +1075,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, } if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), @@ -1177,6 +1183,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY, .len = sizeof(struct ifla_vf_spoofchk) }, + [IFLA_VF_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_rate) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1336,11 +1344,29 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) } case IFLA_VF_TX_RATE: { struct ifla_vf_tx_rate *ivt; + struct ifla_vf_info ivf; ivt = nla_data(vf); err = -EOPNOTSUPP; - if (ops->ndo_set_vf_tx_rate) - err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, - ivt->rate); + if (ops->ndo_get_vf_config) + err = ops->ndo_get_vf_config(dev, ivt->vf, + &ivf); + if (err) + break; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, + ivt->rate); + break; + } + case IFLA_VF_RATE: { + struct ifla_vf_rate *ivt; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, + ivt->max_tx_rate); break; } case IFLA_VF_SPOOFCHK: { -- cgit v1.2.3-71-gd317 From 1c19448c9ba6545b80ded18488a64a7f3d8e6998 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 23 May 2014 08:47:32 -0700 Subject: net: Make enabling of zero UDP6 csums more restrictive RFC 6935 permits zero checksums to be used in IPv6; however, this is recommended only for certain tunnel protocols, and it does not make checksums completely optional like they are in IPv4. This patch restricts the use of IPv6 zero checksums that was previously introduced.
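(Illustration, not part of the original posting: with the helpers this patch adds - described below - a tunnel implementation holding a kernel UDP socket could opt in to RFC 6935 zero checksums roughly like this. The function name is an assumption of the sketch; only udp_set_no_check6_tx/_rx come from the patch.)

/* Opt a tunnel's UDPv6 socket in to RFC 6935 zero checksums. */
static void tunnel_enable_zero_csum6(struct sock *sk)
{
	udp_set_no_check6_tx(sk, true);	/* send zero checksums on TX */
	udp_set_no_check6_rx(sk, true);	/* accept zero checksums on RX */
}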
no_check6_tx and no_check6_rx have been added to control the use of checksums in the UDP6 RX and TX paths. The normal sk_no_check_{rx,tx} settings are not used (this avoids ambiguity when dealing with a dual stack socket). Helper functions have been added (udp_set_no_check6_tx/_rx) which can be called by tunnel implementations to allow zero checksums (send on the socket, and accept them as valid). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 24 +++++++++++++++++++++++- include/uapi/linux/udp.h | 2 ++ net/ipv4/udp.c | 20 +++++++++++++++++++- net/ipv6/udp.c | 8 ++++---- 4 files changed, 48 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 42278bbf7a88..247cfdcc4b08 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -47,7 +47,9 @@ struct udp_sock { #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node int pending; /* Any pending frames ? */ unsigned int corkflag; /* Cork is required */ - __u16 encap_type; /* Is this an Encapsulation socket? */ + __u8 encap_type; /* Is this an Encapsulation socket? */ + unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ + no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -76,6 +78,26 @@ static inline struct udp_sock *udp_sk(const struct sock *sk) return (struct udp_sock *)sk; } +static inline void udp_set_no_check6_tx(struct sock *sk, bool val) +{ + udp_sk(sk)->no_check6_tx = val; +} + +static inline void udp_set_no_check6_rx(struct sock *sk, bool val) +{ + udp_sk(sk)->no_check6_rx = val; +} + +static inline bool udp_get_no_check6_tx(struct sock *sk) +{ + return udp_sk(sk)->no_check6_tx; +} + +static inline bool udp_get_no_check6_rx(struct sock *sk) +{ + return udp_sk(sk)->no_check6_rx; +} + #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index e2bcfd75a30d..16574ea18f0c 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -29,6 +29,8 @@ struct udphdr { /* UDP socket options */ #define UDP_CORK 1 /* Never send partially complete segments */ #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ +#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6 */ +#define UDP_NO_CHECK6_RX 102 /* Disable accepting checksum for UDP6 */ /* UDP encapsulation types */ #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 12c6175b29cd..e07d52b8617a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1968,7 +1968,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); - int val; + int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); @@ -1978,6 +1978,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (get_user(val, (int __user *)optval)) return -EFAULT; + valbool = val ? 1 : 0; + switch (optname) { case UDP_CORK: if (val != 0) { @@ -2007,6 +2009,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } break; + case UDP_NO_CHECK6_TX: + up->no_check6_tx = valbool; + break; + + case UDP_NO_CHECK6_RX: + up->no_check6_rx = valbool; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828).
*/ @@ -2089,6 +2099,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = up->encap_type; break; + case UDP_NO_CHECK6_TX: + val = up->no_check6_tx; + break; + + case UDP_NO_CHECK6_RX: + val = up->no_check6_rx; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b8db453133aa..60325236446a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -794,10 +794,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, dif = inet6_iif(skb); sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); while (sk) { - /* If zero checksum and sk_no_check is not on for + /* If zero checksum and no_check is not on for * the socket then skip it. */ - if (uh->check || sk->sk_no_check_rx) + if (uh->check || udp_sk(sk)->no_check6_rx) stack[count++] = sk; sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, @@ -887,7 +887,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; - if (!uh->check && !sk->sk_no_check_rx) { + if (!uh->check && !udp_sk(sk)->no_check6_rx) { sock_put(sk); udp6_csum_zero_error(skb); goto csum_error; @@ -1037,7 +1037,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) if (is_udplite) csum = udplite_csum_outgoing(sk, skb); - else if (sk->sk_no_check_tx) { /* UDP csum disabled */ + else if (up->no_check6_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ -- cgit v1.2.3-71-gd317 From 8556ce79d5986a87fee4c29300b4efee07c0f15e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 May 2014 18:43:57 +0200 Subject: net: filter: remove DL macro Let's get rid of this macro. After commit 5bcfedf06f7f ("net: filter: simplify label names from jump-table"), labels have become more readable due to the omission of the BPF_ prefix, but at the same time more generic, so that things like `git grep -n` would not find them. As a middle path, let's get rid of the DL macro as it's not strictly needed and would otherwise just hide the full name. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 3 - net/core/filter.c | 193 +++++++++++++++++++++++++------------------ 2 files changed, 99 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7977b3958e25..2b0056afd1f7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -37,9 +37,6 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ -/* Placeholder/dummy for 0 */ -#define BPF_0 0 - /* Register numbers */ enum { BPF_REG_0 = 0, diff --git a/net/core/filter.c b/net/core/filter.c index 7067cb240d3e..b3b0889fe089 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -160,95 +160,100 @@ static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *ins static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ...
*/ -#define DL(A, B, C) [BPF_##A|BPF_##B|BPF_##C] = &&A##_##B##_##C - DL(ALU, ADD, X), - DL(ALU, ADD, K), - DL(ALU, SUB, X), - DL(ALU, SUB, K), - DL(ALU, AND, X), - DL(ALU, AND, K), - DL(ALU, OR, X), - DL(ALU, OR, K), - DL(ALU, LSH, X), - DL(ALU, LSH, K), - DL(ALU, RSH, X), - DL(ALU, RSH, K), - DL(ALU, XOR, X), - DL(ALU, XOR, K), - DL(ALU, MUL, X), - DL(ALU, MUL, K), - DL(ALU, MOV, X), - DL(ALU, MOV, K), - DL(ALU, DIV, X), - DL(ALU, DIV, K), - DL(ALU, MOD, X), - DL(ALU, MOD, K), - DL(ALU, NEG, 0), - DL(ALU, END, TO_BE), - DL(ALU, END, TO_LE), - DL(ALU64, ADD, X), - DL(ALU64, ADD, K), - DL(ALU64, SUB, X), - DL(ALU64, SUB, K), - DL(ALU64, AND, X), - DL(ALU64, AND, K), - DL(ALU64, OR, X), - DL(ALU64, OR, K), - DL(ALU64, LSH, X), - DL(ALU64, LSH, K), - DL(ALU64, RSH, X), - DL(ALU64, RSH, K), - DL(ALU64, XOR, X), - DL(ALU64, XOR, K), - DL(ALU64, MUL, X), - DL(ALU64, MUL, K), - DL(ALU64, MOV, X), - DL(ALU64, MOV, K), - DL(ALU64, ARSH, X), - DL(ALU64, ARSH, K), - DL(ALU64, DIV, X), - DL(ALU64, DIV, K), - DL(ALU64, MOD, X), - DL(ALU64, MOD, K), - DL(ALU64, NEG, 0), - DL(JMP, CALL, 0), - DL(JMP, JA, 0), - DL(JMP, JEQ, X), - DL(JMP, JEQ, K), - DL(JMP, JNE, X), - DL(JMP, JNE, K), - DL(JMP, JGT, X), - DL(JMP, JGT, K), - DL(JMP, JGE, X), - DL(JMP, JGE, K), - DL(JMP, JSGT, X), - DL(JMP, JSGT, K), - DL(JMP, JSGE, X), - DL(JMP, JSGE, K), - DL(JMP, JSET, X), - DL(JMP, JSET, K), - DL(JMP, EXIT, 0), - DL(STX, MEM, B), - DL(STX, MEM, H), - DL(STX, MEM, W), - DL(STX, MEM, DW), - DL(STX, XADD, W), - DL(STX, XADD, DW), - DL(ST, MEM, B), - DL(ST, MEM, H), - DL(ST, MEM, W), - DL(ST, MEM, DW), - DL(LDX, MEM, B), - DL(LDX, MEM, H), - DL(LDX, MEM, W), - DL(LDX, MEM, DW), - DL(LD, ABS, W), - DL(LD, ABS, H), - DL(LD, ABS, B), - DL(LD, IND, W), - DL(LD, IND, H), - DL(LD, IND, B), -#undef DL + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = 
&&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, }; void *ptr; int off; @@ -290,10 +295,10 @@ select_insn: ALU(XOR, ^) ALU(MUL, *) #undef ALU - ALU_NEG_0: + ALU_NEG: A = (u32) -A; CONT; - ALU64_NEG_0: + ALU64_NEG: A = -A; CONT; ALU_MOV_X: @@ -382,7 +387,7 @@ select_insn: CONT; /* CALL */ - JMP_CALL_0: + JMP_CALL: /* Function call scratches BPF_R1-BPF_R5 registers, * preserves BPF_R6-BPF_R9, and stores return value * into BPF_R0. @@ -392,7 +397,7 @@ select_insn: CONT; /* JMP */ - JMP_JA_0: + JMP_JA: insn += insn->off; CONT; JMP_JEQ_X: @@ -479,7 +484,7 @@ select_insn: CONT_JMP; } CONT; - JMP_EXIT_0: + JMP_EXIT: return BPF_R0; /* STX and ST and LDX*/ -- cgit v1.2.3-71-gd317 From b1fcd35cf53553a0a3ef949b05106d921446abc3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 May 2014 18:43:58 +0200 Subject: net: filter: let unattached filters use sock_fprog_kern The sk_unattached_filter_create() API is used by BPF filters that are not directly attached or related to sockets, and are used in team, ptp, xt_bpf, cls_bpf, etc. 
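(For illustration - not from the patch - this is roughly what such an in-kernel user looks like after the change. The trivial accept-all program mirrors the classic-BPF return instruction used in ptp_classifier.c; the wrapper function itself is hypothetical.)

static struct sk_filter *example_unattached_filter(void)
{
	static struct sock_filter code[] = {
		{ 0x06, 0, 0, 0x0000ffff },	/* BPF_RET|BPF_K: accept packet */
	};
	struct sock_fprog_kern fprog = {
		.len	= ARRAY_SIZE(code),
		.filter	= code,	/* plain kernel pointer - no __user cast */
	};
	struct sk_filter *fp;

	if (sk_unattached_filter_create(&fp, &fprog))
		return NULL;	/* parse error */
	return fp;
}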
As such, all users do their own internal management of obtaining filter blocks and thus already have them in kernel memory and set up before calling into sk_unattached_filter_create(). As a result, due to __user annotation in sock_fprog, sparse triggers false positives (incorrect type in assignment [different address space]) when filters are set up before passing them to sk_unattached_filter_create(). Therefore, let the sk_unattached_filter_create() API use sock_fprog_kern to overcome this issue. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/isdn/i4l/isdn_ppp.c | 4 ++-- drivers/net/ppp/ppp_generic.c | 4 ++-- drivers/net/team/team_mode_loadbalance.c | 10 +++++----- include/linux/filter.h | 2 +- lib/test_bpf.c | 2 +- net/core/filter.c | 2 +- net/core/ptp_classifier.c | 2 +- net/netfilter/xt_bpf.c | 5 +++-- net/sched/cls_bpf.c | 4 ++-- 9 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index a5da511e3c9a..61ac63237446 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -634,7 +634,7 @@ isdn_ppp_ioctl(int min, struct file *file, unsigned int cmd, unsigned long arg) #ifdef CONFIG_IPPP_FILTER case PPPIOCSPASS: { - struct sock_fprog fprog; + struct sock_fprog_kern fprog; struct sock_filter *code; int err, len = get_filter(argp, &code); @@ -653,7 +653,7 @@ isdn_ppp_ioctl(int min, struct file *file, unsigned int cmd, unsigned long arg) } case PPPIOCSACTIVE: { - struct sock_fprog fprog; + struct sock_fprog_kern fprog; struct sock_filter *code; int err, len = get_filter(argp, &code); diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index e3923ebb693f..91d6c1272fcf 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -757,7 +757,7 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) err = get_filter(argp, &code); if (err >= 0) { - struct sock_fprog fprog = { + struct sock_fprog_kern fprog = { .len = err, .filter = code, }; @@ -778,7 +778,7 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) err = get_filter(argp, &code); if (err >= 0) { - struct sock_fprog fprog = { + struct sock_fprog_kern fprog = { .len = err, .filter = code, }; diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index dbde3412ee5e..0a6ee07bf0af 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -49,7 +49,7 @@ struct lb_port_mapping { struct lb_priv_ex { struct team *team; struct lb_port_mapping tx_hash_to_port_mapping[LB_TX_HASHTABLE_SIZE]; - struct sock_fprog *orig_fprog; + struct sock_fprog_kern *orig_fprog; struct { unsigned int refresh_interval; /* in tenths of second */ struct delayed_work refresh_dw; @@ -241,10 +241,10 @@ static int lb_bpf_func_get(struct team *team, struct team_gsetter_ctx *ctx) return 0; } -static int __fprog_create(struct sock_fprog **pfprog, u32 data_len, +static int __fprog_create(struct sock_fprog_kern **pfprog, u32 data_len, const void *data) { - struct sock_fprog *fprog; + struct sock_fprog_kern *fprog; struct sock_filter *filter = (struct sock_filter *) data; if (data_len % sizeof(struct sock_filter)) @@ -262,7 +262,7 @@ static int __fprog_create(struct sock_fprog **pfprog, u32 data_len, return 0; } -static void __fprog_destroy(struct sock_fprog *fprog) +static void __fprog_destroy(struct sock_fprog_kern *fprog) {
kfree(fprog->filter); kfree(fprog); @@ -273,7 +273,7 @@ static int lb_bpf_func_set(struct team *team, struct team_gsetter_ctx *ctx) struct lb_priv *lb_priv = get_lb_priv(team); struct sk_filter *fp = NULL; struct sk_filter *orig_fp; - struct sock_fprog *fprog = NULL; + struct sock_fprog_kern *fprog = NULL; int err; if (ctx->data.bin_val.len) { diff --git a/include/linux/filter.h b/include/linux/filter.h index 2b0056afd1f7..625f4de9bdf2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -188,7 +188,7 @@ int sk_convert_filter(struct sock_filter *prog, int len, struct sock_filter_int *new_prog, int *new_len); int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog *fprog); + struct sock_fprog_kern *fprog); void sk_unattached_filter_destroy(struct sk_filter *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 3d80adbdb559..e03991ea8cc2 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1472,7 +1472,7 @@ static int run_one(struct sk_filter *fp, struct bpf_test *t) static __init int test_bpf(void) { struct sk_filter *fp, *fp_ext = NULL; - struct sock_fprog fprog; + struct sock_fprog_kern fprog; int err, i, err_cnt = 0; for (i = 0; i < ARRAY_SIZE(tests); i++) { diff --git a/net/core/filter.c b/net/core/filter.c index b3b0889fe089..2c2d35d9d101 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1585,7 +1585,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, * a negative errno code is returned. On success the return is zero. */ int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog *fprog) + struct sock_fprog_kern *fprog) { unsigned int fsize = sk_filter_proglen(fprog); struct sk_filter *fp; diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 37d86157b76e..d3027a73fd4b 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -133,7 +133,7 @@ void __init ptp_classifier_init(void) { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, }; - struct sock_fprog ptp_prog = { + struct sock_fprog_kern ptp_prog = { .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, }; diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 12d4da8e6c77..bbffdbdaf603 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -23,10 +23,11 @@ MODULE_ALIAS("ip6t_bpf"); static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; - struct sock_fprog program; + struct sock_fprog_kern program; program.len = info->bpf_program_num_elem; - program.filter = (struct sock_filter __user *) info->bpf_program; + program.filter = info->bpf_program; + if (sk_unattached_filter_create(&info->filter, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 16186965af97..13f64df2c710 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -160,7 +160,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, { struct sock_filter *bpf_ops, *bpf_old; struct tcf_exts exts; - struct sock_fprog tmp; + struct sock_fprog_kern tmp; struct sk_filter *fp, *fp_old; u16 bpf_size, bpf_len; u32 classid; @@ -191,7 +191,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); tmp.len = bpf_len; - tmp.filter = (struct sock_filter __user *) bpf_ops; + tmp.filter = bpf_ops; ret = sk_unattached_filter_create(&fp, &tmp); if (ret) -- 
cgit v1.2.3-71-gd317 From 61dc1abae64854c7cef543598b9e6f04886c4ebd Mon Sep 17 00:00:00 2001 From: Nick Dyer Date: Sun, 18 May 2014 23:16:49 -0700 Subject: Input: atmel_mxt_ts - read screen config from chip By reading the touchscreen configuration from the settings that the maXTouch chip is actually using, we can remove some platform data. The matrix size is not used for anything, and because it may change when a configuration is downloaded, re-reading it results in some rather confusing code, so don't print it out. Signed-off-by: Nick Dyer Acked-by: Benson Leung Acked-by: Yufeng Shen Signed-off-by: Dmitry Torokhov --- arch/arm/mach-s5pv210/mach-goni.c | 3 - drivers/input/touchscreen/atmel_mxt_ts.c | 136 ++++++++++++++---------------- drivers/platform/chrome/chromeos_laptop.c | 6 -- include/linux/i2c/atmel_mxt_ts.h | 15 ---- 4 files changed, 65 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-s5pv210/mach-goni.c b/arch/arm/mach-s5pv210/mach-goni.c index e549ecf0e5dc..bb9354f45e27 100644 --- a/arch/arm/mach-s5pv210/mach-goni.c +++ b/arch/arm/mach-s5pv210/mach-goni.c @@ -239,9 +239,6 @@ static void __init goni_radio_init(void) /* TSP */ static struct mxt_platform_data qt602240_platform_data = { - .x_size = 800, - .y_size = 480, - .orient = MXT_DIAGONAL, .irqflags = IRQF_TRIGGER_FALLING, }; diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 7a9197a19f67..75493ca8e784 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -100,33 +100,16 @@ /* MXT_TOUCH_MULTI_T9 field */ #define MXT_TOUCH_CTRL 0 -#define MXT_TOUCH_XORIGIN 1 -#define MXT_TOUCH_YORIGIN 2 -#define MXT_TOUCH_XSIZE 3 -#define MXT_TOUCH_YSIZE 4 -#define MXT_TOUCH_BLEN 6 -#define MXT_TOUCH_TCHTHR 7 -#define MXT_TOUCH_TCHDI 8 -#define MXT_TOUCH_ORIENT 9 -#define MXT_TOUCH_MOVHYSTI 11 -#define MXT_TOUCH_MOVHYSTN 12 -#define MXT_TOUCH_NUMTOUCH 14 -#define MXT_TOUCH_MRGHYST 15 -#define MXT_TOUCH_MRGTHR 16 -#define MXT_TOUCH_AMPHYST 17 -#define MXT_TOUCH_XRANGE_LSB 18 -#define MXT_TOUCH_XRANGE_MSB 19 -#define MXT_TOUCH_YRANGE_LSB 20 -#define MXT_TOUCH_YRANGE_MSB 21 -#define MXT_TOUCH_XLOCLIP 22 -#define MXT_TOUCH_XHICLIP 23 -#define MXT_TOUCH_YLOCLIP 24 -#define MXT_TOUCH_YHICLIP 25 -#define MXT_TOUCH_XEDGECTRL 26 -#define MXT_TOUCH_XEDGEDIST 27 -#define MXT_TOUCH_YEDGECTRL 28 -#define MXT_TOUCH_YEDGEDIST 29 -#define MXT_TOUCH_JUMPLIMIT 30 +#define MXT_T9_ORIENT 9 +#define MXT_T9_RANGE 18 + +struct t9_range { + u16 x; + u16 y; +} __packed; + +/* Touch orient bits */ +#define MXT_XY_SWITCH (1 << 0) /* MXT_PROCI_GRIPFACE_T20 field */ #define MXT_GRIPFACE_CTRL 0 @@ -211,11 +194,6 @@ #define MXT_PRESS (1 << 6) #define MXT_DETECT (1 << 7) -/* Touch orient bits */ -#define MXT_XY_SWITCH (1 << 0) -#define MXT_X_INVERT (1 << 1) -#define MXT_Y_INVERT (1 << 2) - /* Touchscreen absolute values */ #define MXT_MAX_AREA 0xff @@ -580,11 +558,6 @@ static int __mxt_read_reg(struct i2c_client *client, return ret; } -static int mxt_read_reg(struct i2c_client *client, u16 reg, u8 *val) -{ - return __mxt_read_reg(client, reg, 1, val); -} - static int __mxt_write_reg(struct i2c_client *client, u16 reg, u16 len, const void *val) { @@ -1029,12 +1002,59 @@ static void mxt_free_object_table(struct mxt_data *data) data->T19_reportid = 0; } +static int mxt_read_t9_resolution(struct mxt_data *data) +{ + struct i2c_client *client = data->client; + int error; + struct t9_range range; + unsigned char orient; + struct mxt_object *object;
+ + object = mxt_get_object(data, MXT_TOUCH_MULTI_T9); + if (!object) + return -EINVAL; + + error = __mxt_read_reg(client, + object->start_address + MXT_T9_RANGE, + sizeof(range), &range); + if (error) + return error; + + le16_to_cpus(&range.x); + le16_to_cpus(&range.y); + + error = __mxt_read_reg(client, + object->start_address + MXT_T9_ORIENT, + 1, &orient); + if (error) + return error; + + /* Handle default values */ + if (range.x == 0) + range.x = 1023; + + if (range.y == 0) + range.y = 1023; + + if (orient & MXT_XY_SWITCH) { + data->max_x = range.y; + data->max_y = range.x; + } else { + data->max_x = range.x; + data->max_y = range.y; + } + + dev_dbg(&client->dev, + "Touchscreen size X%uY%u\n", data->max_x, data->max_y); + + return 0; +} + static int mxt_initialize(struct mxt_data *data) { struct i2c_client *client = data->client; struct mxt_info *info = &data->info; int error; - u8 val; error = mxt_get_info(data); if (error) @@ -1063,26 +1083,16 @@ static int mxt_initialize(struct mxt_data *data) goto err_free_object_table; } - /* Update matrix size at info struct */ - error = mxt_read_reg(client, MXT_MATRIX_X_SIZE, &val); - if (error) - goto err_free_object_table; - info->matrix_xsize = val; - - error = mxt_read_reg(client, MXT_MATRIX_Y_SIZE, &val); - if (error) + error = mxt_read_t9_resolution(data); + if (error) { + dev_err(&client->dev, "Failed to initialize T9 resolution\n"); goto err_free_object_table; - info->matrix_ysize = val; - - dev_info(&client->dev, - "Family: %u Variant: %u Firmware V%u.%u.%02X\n", - info->family_id, info->variant_id, info->version >> 4, - info->version & 0xf, info->build); + } dev_info(&client->dev, - "Matrix X Size: %u Matrix Y Size: %u Objects: %u\n", - info->matrix_xsize, info->matrix_ysize, - info->object_num); + "Family: %u Variant: %u Firmware V%u.%u.%02X Objects: %u\n", + info->family_id, info->variant_id, info->version >> 4, + info->version & 0xf, info->build, info->object_num); return 0; @@ -1091,20 +1101,6 @@ err_free_object_table: return error; } -static void mxt_calc_resolution(struct mxt_data *data) -{ - unsigned int max_x = data->pdata->x_size - 1; - unsigned int max_y = data->pdata->y_size - 1; - - if (data->pdata->orient & MXT_XY_SWITCH) { - data->max_x = max_y; - data->max_y = max_x; - } else { - data->max_x = max_x; - data->max_y = max_y; - } -} - /* Firmware Version is returned as Major.Minor.Build */ static ssize_t mxt_fw_version_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1430,8 +1426,6 @@ static int mxt_probe(struct i2c_client *client, init_completion(&data->reset_completion); init_completion(&data->crc_completion); - mxt_calc_resolution(data); - error = mxt_initialize(data); if (error) goto err_free_mem; diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c index 8b7523ab62e5..7f1a2e2711bd 100644 --- a/drivers/platform/chrome/chromeos_laptop.c +++ b/drivers/platform/chrome/chromeos_laptop.c @@ -94,9 +94,6 @@ static int mxt_t19_keys[] = { }; static struct mxt_platform_data atmel_224s_tp_platform_data = { - .x_size = 102*20, - .y_size = 68*20, - .orient = MXT_VERTICAL_FLIP, .irqflags = IRQF_TRIGGER_FALLING, .t19_num_keys = ARRAY_SIZE(mxt_t19_keys), .t19_keymap = mxt_t19_keys, @@ -111,9 +108,6 @@ static struct i2c_board_info atmel_224s_tp_device = { }; static struct mxt_platform_data atmel_1664s_platform_data = { - .x_size = 1700, - .y_size = 2560, - .orient = MXT_ROTATED_90_COUNTER, .irqflags = IRQF_TRIGGER_FALLING, .config = NULL, .config_length = 0, diff --git 
a/include/linux/i2c/atmel_mxt_ts.h b/include/linux/i2c/atmel_mxt_ts.h index 9f92135b6620..3891dc1de21c 100644 --- a/include/linux/i2c/atmel_mxt_ts.h +++ b/include/linux/i2c/atmel_mxt_ts.h @@ -15,26 +15,11 @@ #include -/* Orient */ -#define MXT_NORMAL 0x0 -#define MXT_DIAGONAL 0x1 -#define MXT_HORIZONTAL_FLIP 0x2 -#define MXT_ROTATED_90_COUNTER 0x3 -#define MXT_VERTICAL_FLIP 0x4 -#define MXT_ROTATED_90 0x5 -#define MXT_ROTATED_180 0x6 -#define MXT_DIAGONAL_COUNTER 0x7 - /* The platform data for the Atmel maXTouch touchscreen driver */ struct mxt_platform_data { const u8 *config; size_t config_length; u32 config_crc; - - unsigned int x_size; - unsigned int y_size; - unsigned char orient; - unsigned long irqflags; u8 t19_num_keys; const unsigned int *t19_keymap; -- cgit v1.2.3-71-gd317 From b475598aec63f2efbc78f0ff1895d917d2370846 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 22 May 2014 14:50:10 +0300 Subject: mlx5_core: Store MR attributes in mlx5_mr_core during creation and after UMR The patch stores iova, pd and size during mr creation and after UMRs that modify them. It removes the unused access flags field. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/mr.c | 4 ++++ drivers/net/ethernet/mellanox/mlx5/core/mr.c | 4 ++++ include/linux/mlx5/driver.h | 1 - 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9d932a2aa9f4..f472ab246d94 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -794,6 +794,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, err = -EFAULT; } + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + unmap_dma: up(&umrc->sem); dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 4cc927649404..ac52a0fe2d3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -82,7 +82,11 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, return mlx5_cmd_status_to_err(&lout.hdr); } + mr->iova = be64_to_cpu(in->seg.start_addr); + mr->size = be64_to_cpu(in->seg.len); mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; + mr->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; + mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", be32_to_cpu(lout.mkey), key, mr->key); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 93cef6313e72..2bce4aad2570 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -427,7 +427,6 @@ struct mlx5_core_mr { u64 size; u32 key; u32 pd; - u32 access; }; struct mlx5_core_srq { -- cgit v1.2.3-71-gd317 From 29fe5732bef2ba3c785fcb4aa5ba7160ad8faba5 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 6 Mar 2014 09:44:18 -0800 Subject: mtd: pfow: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes warning: In file included from drivers/mtd/lpddr/qinfo_probe.c:31:0: include/linux/mtd/pfow.h: In function ‘send_pfow_command’: include/linux/mtd/pfow.h:104:6: warning: variable ‘chipnum’ set but not used [-Wunused-but-set-variable] int chipnum; ^ Signed-off-by: Brian Norris --- include/linux/mtd/pfow.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/pfow.h 
b/include/linux/mtd/pfow.h index b730d4f84655..42ff7ff09bf5 100644 --- a/include/linux/mtd/pfow.h +++ b/include/linux/mtd/pfow.h @@ -101,9 +101,6 @@ static inline void send_pfow_command(struct map_info *map, unsigned long len, map_word *datum) { int bits_per_chip = map_bankwidth(map) * 8; - int chipnum; - struct lpddr_private *lpddr = map->fldrv_priv; - chipnum = adr >> lpddr->chipshift; map_write(map, CMD(cmd_code), map->pfow_base + PFOW_COMMAND_CODE); map_write(map, CMD(adr & ((1<<bits_per_chip) - 1)), -- cgit v1.2.3-71-gd317 From: Tony Lindgren Date: Tue, 20 May 2014 11:17:54 -0700 Subject: mfd: twl4030-power: Add generic reset configuration The twl4030 PMIC needs to be configured properly for things like warm reset and deeper idle states so the PMIC manages the regulators properly based on the hardware triggers from the SoC. Earlier we configured twl4030 using platform data, but we want to do it for device tree based booting also. In some cases configuring twl4030 is needed for things to work. For example, when rebooting an OMAP3530 at 125 MHz, it hangs. With this patch, TWL4030 will be reset when a warm reset occurs, and OMAP3530 does not hang on reboot. Let's add device tree support and configure things for warm reset as the default when compatible = "ti,twl4030-power". More complicated configurations can be added to the driver based on other compatible flags. Note we now also make the pdata const like it should be. This allows using it for match->data with the device tree related functions. Based on an earlier patch by Matthias Brugger and Lesly A M. Signed-off-by: Tony Lindgren Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/twl4030-power.txt | 7 +- drivers/mfd/twl4030-power.c | 109 ++++++++++++++++++--- include/linux/i2c/twl.h | 3 + 3 files changed, 105 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/mfd/twl4030-power.txt b/Documentation/devicetree/bindings/mfd/twl4030-power.txt index 8e15ec35ac99..b90611650cd9 100644 --- a/Documentation/devicetree/bindings/mfd/twl4030-power.txt +++ b/Documentation/devicetree/bindings/mfd/twl4030-power.txt @@ -5,7 +5,12 @@ to control the power resources, including power scripts. For now, the binding only supports the complete shutdown of the system after poweroff. Required properties: -- compatible : must be "ti,twl4030-power" +- compatible : must be one of the following + "ti,twl4030-power" + "ti,twl4030-power-reset" + +The use of ti,twl4030-power-reset is recommended at least on +3530 that needs a special configuration for warm reset to work. Optional properties: - ti,use_poweroff: With this flag, the chip will initiates an ACTIVE-to-OFF or diff --git a/drivers/mfd/twl4030-power.c b/drivers/mfd/twl4030-power.c index 0b037dca46a8..cb5b0cb8f933 100644 --- a/drivers/mfd/twl4030-power.c +++ b/drivers/mfd/twl4030-power.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -128,6 +129,40 @@ static u8 res_config_addrs[] = { [RES_MAIN_REF] = 0x94, }; +/* + * Usable values for .remap_sleep and .remap_off + * Based on table "5.3.3 Resource Operating modes" + */ +enum { + TWL_REMAP_OFF = 0, + TWL_REMAP_SLEEP = 8, + TWL_REMAP_ACTIVE = 9, +}; + +/* + * Macros to configure the PM register states for various resources. + * Note that we can make MSG_SINGULAR etc private to this driver once + * omap3 has been made DT only.
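+ * For example, TWL_RESOURCE_RESET(RES_MAIN_REF) in the warm-reset script
+ * below expands to the twl4030_ins entry
+ * { MSG_SINGULAR(DEV_GRP_NULL, RES_MAIN_REF, RES_STATE_WRST), 2 },
+ * i.e. a singular power-bus message that puts the MAIN_REF resource into
+ * its warm-reset state, with the default two-cycle delay.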
+ */ +#define TWL_DFLT_DELAY 2 /* typically 2 32 KiHz cycles */ +#define TWL_RESOURCE_SET(res, state) \ + { MSG_SINGULAR(DEV_GRP_NULL, (res), (state)), TWL_DFLT_DELAY } +#define TWL_RESOURCE_ON(res) TWL_RESOURCE_SET(res, RES_STATE_ACTIVE) +#define TWL_RESOURCE_OFF(res) TWL_RESOURCE_SET(res, RES_STATE_OFF) +#define TWL_RESOURCE_RESET(res) TWL_RESOURCE_SET(res, RES_STATE_WRST) +/* + * It seems that type1 and type2 is just the resource init order + * number for the type1 and type2 group. + */ +#define TWL_RESOURCE_GROUP_RESET(group, type1, type2) \ + { MSG_BROADCAST(DEV_GRP_NULL, (group), (type1), (type2), \ + RES_STATE_WRST), TWL_DFLT_DELAY } +#define TWL_REMAP_SLEEP(res, devgrp, typ, typ2) \ + { .resource = (res), .devgroup = (devgrp), \ + .type = (typ), .type2 = (typ2), \ + .remap_off = TWL_REMAP_OFF, \ + .remap_sleep = TWL_REMAP_SLEEP, } + static int twl4030_write_script_byte(u8 address, u8 byte) { int err; @@ -502,7 +537,8 @@ int twl4030_remove_script(u8 flags) return err; } -static int twl4030_power_configure_scripts(struct twl4030_power_data *pdata) +static int +twl4030_power_configure_scripts(const struct twl4030_power_data *pdata) { int err; int i; @@ -518,7 +554,8 @@ static int twl4030_power_configure_scripts(struct twl4030_power_data *pdata) return 0; } -static int twl4030_power_configure_resources(struct twl4030_power_data *pdata) +static int +twl4030_power_configure_resources(const struct twl4030_power_data *pdata) { struct twl4030_resconfig *resconfig = pdata->resource_config; int err; @@ -550,7 +587,7 @@ void twl4030_power_off(void) pr_err("TWL4030 Unable to power off\n"); } -static bool twl4030_power_use_poweroff(struct twl4030_power_data *pdata, +static bool twl4030_power_use_poweroff(const struct twl4030_power_data *pdata, struct device_node *node) { if (pdata && pdata->use_poweroff) @@ -562,10 +599,60 @@ static bool twl4030_power_use_poweroff(struct twl4030_power_data *pdata, return false; } +#ifdef CONFIG_OF + +/* Generic warm reset configuration for omap3 */ + +static struct twl4030_ins omap3_wrst_seq[] = { + TWL_RESOURCE_OFF(RES_NRES_PWRON), + TWL_RESOURCE_OFF(RES_RESET), + TWL_RESOURCE_RESET(RES_MAIN_REF), + TWL_RESOURCE_GROUP_RESET(RES_GRP_ALL, RES_TYPE_R0, RES_TYPE2_R2), + TWL_RESOURCE_RESET(RES_VUSB_3V1), + TWL_RESOURCE_GROUP_RESET(RES_GRP_ALL, RES_TYPE_R0, RES_TYPE2_R1), + TWL_RESOURCE_GROUP_RESET(RES_GRP_RC, RES_TYPE_ALL, RES_TYPE2_R0), + TWL_RESOURCE_ON(RES_RESET), + TWL_RESOURCE_ON(RES_NRES_PWRON), +}; + +static struct twl4030_script omap3_wrst_script = { + .script = omap3_wrst_seq, + .size = ARRAY_SIZE(omap3_wrst_seq), + .flags = TWL4030_WRST_SCRIPT, +}; + +static struct twl4030_script *omap3_reset_scripts[] = { + &omap3_wrst_script, +}; + +static struct twl4030_resconfig omap3_rconfig[] = { + TWL_REMAP_SLEEP(RES_HFCLKOUT, DEV_GRP_P3, -1, -1), + TWL_REMAP_SLEEP(RES_VDD1, DEV_GRP_P1, -1, -1), + TWL_REMAP_SLEEP(RES_VDD2, DEV_GRP_P1, -1, -1), + { 0, 0 }, +}; + +static struct twl4030_power_data omap3_reset = { + .scripts = omap3_reset_scripts, + .num = ARRAY_SIZE(omap3_reset_scripts), + .resource_config = omap3_rconfig, +}; + +static struct of_device_id twl4030_power_of_match[] = { + { + .compatible = "ti,twl4030-power-reset", + .data = &omap3_reset, + }, + { }, +}; +MODULE_DEVICE_TABLE(of, twl4030_power_of_match); +#endif /* CONFIG_OF */ + static int twl4030_power_probe(struct platform_device *pdev) { - struct twl4030_power_data *pdata = dev_get_platdata(&pdev->dev); + const struct twl4030_power_data *pdata = dev_get_platdata(&pdev->dev); struct device_node *node = 
pdev->dev.of_node; + const struct of_device_id *match; int err = 0; int err2 = 0; u8 val; @@ -586,8 +673,12 @@ static int twl4030_power_probe(struct platform_device *pdev) return err; } + match = of_match_device(of_match_ptr(twl4030_power_of_match), + &pdev->dev); + if (match && match->data) + pdata = match->data; + if (pdata) { - /* TODO: convert to device tree */ err = twl4030_power_configure_scripts(pdata); if (err) { pr_err("TWL4030 failed to load scripts\n"); @@ -637,14 +728,6 @@ static int twl4030_power_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_OF -static const struct of_device_id twl4030_power_of_match[] = { - {.compatible = "ti,twl4030-power", }, - { }, -}; -MODULE_DEVICE_TABLE(of, twl4030_power_of_match); -#endif - static struct platform_driver twl4030_power_driver = { .driver = { .name = "twl4030_power", diff --git a/include/linux/i2c/twl.h b/include/linux/i2c/twl.h index ade1c06d4ceb..5fe031375ed4 100644 --- a/include/linux/i2c/twl.h +++ b/include/linux/i2c/twl.h @@ -486,7 +486,10 @@ static inline int twl6030_mmc_card_detect(struct device *dev, int slot) #define RES_GRP_ALL 0x7 /* All resource groups */ #define RES_TYPE2_R0 0x0 +#define RES_TYPE2_R1 0x1 +#define RES_TYPE2_R2 0x2 +#define RES_TYPE_R0 0x0 #define RES_TYPE_ALL 0x7 /* Resource states */ -- cgit v1.2.3-71-gd317 From 482e7db160df713a2d1d4c7ee9fffad92008283f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 20 May 2014 11:17:54 -0700 Subject: mfd: twl4030-power: Add support for board specific configuration With the recommended twl4030 configuration added, we can now add board specific changes as modifications to the recommended configuration. Note that the data is private to this driver, and the data must always have a NULL resource in the sentinel. Signed-off-by: Tony Lindgren Signed-off-by: Lee Jones --- drivers/mfd/twl4030-power.c | 21 +++++++++++++++++++++ include/linux/i2c/twl.h | 1 + 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/twl4030-power.c b/drivers/mfd/twl4030-power.c index 2bfbb40ca9d2..4846c7b48ebb 100644 --- a/drivers/mfd/twl4030-power.c +++ b/drivers/mfd/twl4030-power.c @@ -567,13 +567,34 @@ twl4030_power_configure_scripts(const struct twl4030_power_data *pdata) return 0; } +static void twl4030_patch_rconfig(struct twl4030_resconfig *common, + struct twl4030_resconfig *board) +{ + while (common->resource) { + struct twl4030_resconfig *b = board; + + while (b->resource) { + if (b->resource == common->resource) { + *common = *b; + break; + } + b++; + } + common++; + } +} + static int twl4030_power_configure_resources(const struct twl4030_power_data *pdata) { struct twl4030_resconfig *resconfig = pdata->resource_config; + struct twl4030_resconfig *boardconf = pdata->board_config; int err; if (resconfig) { + if (boardconf) + twl4030_patch_rconfig(resconfig, boardconf); + while (resconfig->resource) { err = twl4030_configure_resource(resconfig); if (err) diff --git a/include/linux/i2c/twl.h b/include/linux/i2c/twl.h index 5fe031375ed4..57fe782bf031 100644 --- a/include/linux/i2c/twl.h +++ b/include/linux/i2c/twl.h @@ -662,6 +662,7 @@ struct twl4030_power_data { struct twl4030_script **scripts; unsigned num; struct twl4030_resconfig *resource_config; + struct twl4030_resconfig *board_config; #define TWL4030_RESCONFIG_UNDEF ((u8)-1) bool use_poweroff; /* Board is wired for TWL poweroff */ }; -- cgit v1.2.3-71-gd317 From aa76fcf473f6bfa839f37f77b6fdb71f0fb88d8f Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 21 Feb 2014 17:36:21 +0200 
Subject: CLK: TI: DPLL: add support for omap2 core dpll OMAP2 has a slightly different DPLL compared to later OMAP generations. This patch adds support for the ti,omap2-dpll-core-clock compatible and adds the corresponding binding documentation. Signed-off-by: Tero Kristo --- .../devicetree/bindings/clock/ti/dpll.txt | 9 +++ arch/arm/mach-omap2/clock.h | 1 - arch/arm/mach-omap2/clock2xxx.h | 4 -- drivers/clk/ti/dpll.c | 78 +++++++++++++++++++--- include/linux/clk/ti.h | 6 ++ 5 files changed, 82 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/clock/ti/dpll.txt b/Documentation/devicetree/bindings/clock/ti/dpll.txt index 30bfdb7c9f18..50a1a427608f 100644 --- a/Documentation/devicetree/bindings/clock/ti/dpll.txt +++ b/Documentation/devicetree/bindings/clock/ti/dpll.txt @@ -30,6 +30,7 @@ Required properties: "ti,am3-dpll-clock", "ti,am3-dpll-core-clock", "ti,am3-dpll-x2-clock", + "ti,omap2-dpll-core-clock", - #clock-cells : from common clock binding; shall be set to 0. - clocks : link phandles of parent clocks, first entry lists reference clock @@ -41,6 +42,7 @@ Required properties: "mult-div1" - contains the multiplier / divider register base address "autoidle" - contains the autoidle register base address (optional) ti,am3-* dpll types do not have autoidle register + ti,omap2-* dpll type does not support idlest / autoidle registers Optional properties: - DPLL mode setting - defining any one or more of the following overrides @@ -73,3 +75,10 @@ Examples: clocks = <&sys_clkin_ck>, <&sys_clkin_ck>; reg = <0x90>, <0x5c>, <0x68>; }; + + dpll_ck: dpll_ck { + #clock-cells = <0>; + compatible = "ti,omap2-dpll-core-clock"; + clocks = <&sys_ck>, <&sys_ck>; + reg = <0x0500>, <0x0540>; + }; diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h index bda767a9dea8..f6e9904d7a75 100644 --- a/arch/arm/mach-omap2/clock.h +++ b/arch/arm/mach-omap2/clock.h @@ -279,7 +279,6 @@ extern const struct clk_hw_omap_ops clkhwops_omap3430es2_hsotgusb_wait; extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait; extern const struct clk_hw_omap_ops clkhwops_apll54; extern const struct clk_hw_omap_ops clkhwops_apll96; -extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll; extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait; /* clksel_rate blocks shared between OMAP44xx and AM33xx */ diff --git a/arch/arm/mach-omap2/clock2xxx.h b/arch/arm/mach-omap2/clock2xxx.h index 539dc08afbba..45f41a411603 100644 --- a/arch/arm/mach-omap2/clock2xxx.h +++ b/arch/arm/mach-omap2/clock2xxx.h @@ -21,10 +21,6 @@ unsigned long omap2xxx_sys_clk_recalc(struct clk_hw *clk, unsigned long parent_rate); unsigned long omap2_osc_clk_recalc(struct clk_hw *clk, unsigned long parent_rate); -unsigned long omap2_dpllcore_recalc(struct clk_hw *hw, - unsigned long parent_rate); -int omap2_reprogram_dpllcore(struct clk_hw *clk, unsigned long rate, - unsigned long parent_rate); void omap2xxx_clkt_dpllcore_init(struct clk_hw *hw); unsigned long omap2_clk_apll54_recalc(struct clk_hw *hw, unsigned long parent_rate); diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index dda262db42ea..34e233990212 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -35,21 +35,18 @@ static const struct clk_ops dpll_m4xen_ck_ops = { .set_rate = &omap3_noncore_dpll_set_rate, .get_parent = &omap2_init_dpll_parent, }; +#else +static const struct clk_ops dpll_m4xen_ck_ops = {}; #endif +#if defined(CONFIG_ARCH_OMAP3) || defined(CONFIG_ARCH_OMAP4) || \ + defined(CONFIG_SOC_OMAP5) 
|| defined(CONFIG_SOC_DRA7XX) || \ + defined(CONFIG_SOC_AM33XX) || defined(CONFIG_SOC_AM43XX) static const struct clk_ops dpll_core_ck_ops = { .recalc_rate = &omap3_dpll_recalc, .get_parent = &omap2_init_dpll_parent, }; -#ifdef CONFIG_ARCH_OMAP3 -static const struct clk_ops omap3_dpll_core_ck_ops = { - .get_parent = &omap2_init_dpll_parent, - .recalc_rate = &omap3_dpll_recalc, - .round_rate = &omap2_dpll_round_rate, -}; -#endif - static const struct clk_ops dpll_ck_ops = { .enable = &omap3_noncore_dpll_enable, .disable = &omap3_noncore_dpll_disable, @@ -65,6 +62,33 @@ static const struct clk_ops dpll_no_gate_ck_ops = { .round_rate = &omap2_dpll_round_rate, .set_rate = &omap3_noncore_dpll_set_rate, }; +#else +static const struct clk_ops dpll_core_ck_ops = {}; +static const struct clk_ops dpll_ck_ops = {}; +static const struct clk_ops dpll_no_gate_ck_ops = {}; +const struct clk_hw_omap_ops clkhwops_omap3_dpll = {}; +#endif + +#ifdef CONFIG_ARCH_OMAP2 +static const struct clk_ops omap2_dpll_core_ck_ops = { + .get_parent = &omap2_init_dpll_parent, + .recalc_rate = &omap2_dpllcore_recalc, + .round_rate = &omap2_dpll_round_rate, + .set_rate = &omap2_reprogram_dpllcore, +}; +#else +static const struct clk_ops omap2_dpll_core_ck_ops = {}; +#endif + +#ifdef CONFIG_ARCH_OMAP3 +static const struct clk_ops omap3_dpll_core_ck_ops = { + .get_parent = &omap2_init_dpll_parent, + .recalc_rate = &omap3_dpll_recalc, + .round_rate = &omap2_dpll_round_rate, +}; +#else +static const struct clk_ops omap3_dpll_core_ck_ops = {}; +#endif #ifdef CONFIG_ARCH_OMAP3 static const struct clk_ops omap3_dpll_ck_ops = { @@ -237,10 +261,27 @@ static void __init of_ti_dpll_setup(struct device_node *node, init->parent_names = parent_names; dd->control_reg = ti_clk_get_reg_addr(node, 0); - dd->idlest_reg = ti_clk_get_reg_addr(node, 1); - dd->mult_div1_reg = ti_clk_get_reg_addr(node, 2); - if (!dd->control_reg || !dd->idlest_reg || !dd->mult_div1_reg) + /* + * Special case for OMAP2 DPLL, register order is different due to + * missing idlest_reg, also clkhwops is different. Detected from + * missing idlest_mask. 
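+ * (That is, an OMAP2 core DPLL lists its reg entries as control then
+ * mult-div1, while the other DPLL types list control, idlest, mult-div1
+ * and optionally autoidle, as handled below.)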
+ */ + if (!dd->idlest_mask) { + dd->mult_div1_reg = ti_clk_get_reg_addr(node, 1); +#ifdef CONFIG_ARCH_OMAP2 + clk_hw->ops = &clkhwops_omap2xxx_dpll; + omap2xxx_clkt_dpllcore_init(&clk_hw->hw); +#endif + } else { + dd->idlest_reg = ti_clk_get_reg_addr(node, 1); + if (!dd->idlest_reg) + goto cleanup; + + dd->mult_div1_reg = ti_clk_get_reg_addr(node, 2); + } + + if (!dd->control_reg || !dd->mult_div1_reg) goto cleanup; if (dd->autoidle_mask) { @@ -547,3 +588,18 @@ static void __init of_ti_am3_core_dpll_setup(struct device_node *node) } CLK_OF_DECLARE(ti_am3_core_dpll_clock, "ti,am3-dpll-core-clock", of_ti_am3_core_dpll_setup); + +static void __init of_ti_omap2_core_dpll_setup(struct device_node *node) +{ + const struct dpll_data dd = { + .enable_mask = 0x3, + .mult_mask = 0x3ff << 12, + .div1_mask = 0xf << 8, + .max_divider = 16, + .min_divider = 1, + }; + + of_ti_dpll_setup(node, &omap2_dpll_core_ck_ops, &dd); +} +CLK_OF_DECLARE(ti_omap2_core_dpll_clock, "ti,omap2-dpll-core-clock", + of_ti_omap2_core_dpll_setup); diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 4a21a872dbbd..753878c6fa52 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -259,6 +259,11 @@ int omap2_dflt_clk_enable(struct clk_hw *hw); void omap2_dflt_clk_disable(struct clk_hw *hw); int omap2_dflt_clk_is_enabled(struct clk_hw *hw); void omap3_clk_lock_dpll5(void); +unsigned long omap2_dpllcore_recalc(struct clk_hw *hw, + unsigned long parent_rate); +int omap2_reprogram_dpllcore(struct clk_hw *clk, unsigned long rate, + unsigned long parent_rate); +void omap2xxx_clkt_dpllcore_init(struct clk_hw *hw); void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index); void ti_dt_clocks_register(struct ti_dt_clk *oclks); @@ -287,6 +292,7 @@ static inline void of_ti_clk_allow_autoidle_all(void) { } static inline void of_ti_clk_deny_autoidle_all(void) { } #endif +extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll; extern const struct clk_hw_omap_ops clkhwops_omap3_dpll; extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx; extern const struct clk_hw_omap_ops clkhwops_wait; -- cgit v1.2.3-71-gd317 From 4d008589e271e28eae728eef7f5fb1f658f12b9f Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 24 Feb 2014 16:06:34 +0200 Subject: CLK: TI: APLL: add support for omap2 aplls This patch adds support for omap2 type aplls, which have gating and autoidle functionality. Signed-off-by: Tero Kristo --- .../devicetree/bindings/clock/ti/apll.txt | 24 ++- arch/arm/mach-omap2/clock.h | 11 -- drivers/clk/ti/apll.c | 181 +++++++++++++++++++++ include/linux/clk/ti.h | 21 ++- 4 files changed, 220 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/clock/ti/apll.txt b/Documentation/devicetree/bindings/clock/ti/apll.txt index 7faf5a68b3be..ade4dd4c30f0 100644 --- a/Documentation/devicetree/bindings/clock/ti/apll.txt +++ b/Documentation/devicetree/bindings/clock/ti/apll.txt @@ -14,18 +14,32 @@ a subtype of a DPLL [2], although a simplified one at that. [2] Documentation/devicetree/bindings/clock/ti/dpll.txt Required properties: -- compatible : shall be "ti,dra7-apll-clock" +- compatible : shall be "ti,dra7-apll-clock" or "ti,omap2-apll-clock" - #clock-cells : from common clock binding; shall be set to 0. - clocks : link phandles of parent clocks (clk-ref and clk-bypass) - reg : address and length of the register set for controlling the APLL. 
It contains the information of registers in the following order: - "control" - contains the control register base address - "idlest" - contains the idlest register base address + "control" - contains the control register offset + "idlest" - contains the idlest register offset + "autoidle" - contains the autoidle register offset (OMAP2 only) +- ti,clock-frequency : static clock frequency for the clock (OMAP2 only) +- ti,idlest-shift : bit-shift for the idlest field (OMAP2 only) +- ti,bit-shift : bit-shift for enable and autoidle fields (OMAP2 only) Examples: - apll_pcie_ck: apll_pcie_ck@4a008200 { + apll_pcie_ck: apll_pcie_ck { #clock-cells = <0>; clocks = <&apll_pcie_in_clk_mux>, <&dpll_pcie_ref_ck>; - reg = <0x4a00821c 0x4>, <0x4a008220 0x4>; + reg = <0x021c>, <0x0220>; compatible = "ti,dra7-apll-clock"; }; + + apll96_ck: apll96_ck { + #clock-cells = <0>; + compatible = "ti,omap2-apll-clock"; + clocks = <&sys_ck>; + ti,bit-shift = <2>; + ti,idlest-shift = <8>; + ti,clock-frequency = <96000000>; + reg = <0x0500>, <0x0530>, <0x0520>; + }; diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h index f6e9904d7a75..eb441d137843 100644 --- a/arch/arm/mach-omap2/clock.h +++ b/arch/arm/mach-omap2/clock.h @@ -178,17 +178,6 @@ struct clksel { const struct clksel_rate *rates; }; -struct clk_hw_omap_ops { - void (*find_idlest)(struct clk_hw_omap *oclk, - void __iomem **idlest_reg, - u8 *idlest_bit, u8 *idlest_val); - void (*find_companion)(struct clk_hw_omap *oclk, - void __iomem **other_reg, - u8 *other_bit); - void (*allow_idle)(struct clk_hw_omap *oclk); - void (*deny_idle)(struct clk_hw_omap *oclk); -}; - unsigned long omap_fixed_divisor_recalc(struct clk_hw *hw, unsigned long parent_rate); diff --git a/drivers/clk/ti/apll.c b/drivers/clk/ti/apll.c index b986f61f5a77..5428c9c547cd 100644 --- a/drivers/clk/ti/apll.c +++ b/drivers/clk/ti/apll.c @@ -221,3 +221,184 @@ cleanup: kfree(init); } CLK_OF_DECLARE(dra7_apll_clock, "ti,dra7-apll-clock", of_dra7_apll_setup); + +#define OMAP2_EN_APLL_LOCKED 0x3 +#define OMAP2_EN_APLL_STOPPED 0x0 + +static int omap2_apll_is_enabled(struct clk_hw *hw) +{ + struct clk_hw_omap *clk = to_clk_hw_omap(hw); + struct dpll_data *ad = clk->dpll_data; + u32 v; + + v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v &= ad->enable_mask; + + v >>= __ffs(ad->enable_mask); + + return v == OMAP2_EN_APLL_LOCKED ? 
1 : 0; +} + +static unsigned long omap2_apll_recalc(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct clk_hw_omap *clk = to_clk_hw_omap(hw); + + if (omap2_apll_is_enabled(hw)) + return clk->fixed_rate; + + return 0; +} + +static int omap2_apll_enable(struct clk_hw *hw) +{ + struct clk_hw_omap *clk = to_clk_hw_omap(hw); + struct dpll_data *ad = clk->dpll_data; + u32 v; + int i = 0; + + v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v &= ~ad->enable_mask; + v |= OMAP2_EN_APLL_LOCKED << __ffs(ad->enable_mask); + ti_clk_ll_ops->clk_writel(v, ad->control_reg); + + while (1) { + v = ti_clk_ll_ops->clk_readl(ad->idlest_reg); + if (v & ad->idlest_mask) + break; + if (i > MAX_APLL_WAIT_TRIES) + break; + i++; + udelay(1); + } + + if (i == MAX_APLL_WAIT_TRIES) { + pr_warn("%s failed to transition to locked\n", + __clk_get_name(clk->hw.clk)); + return -EBUSY; + } + + return 0; +} + +static void omap2_apll_disable(struct clk_hw *hw) +{ + struct clk_hw_omap *clk = to_clk_hw_omap(hw); + struct dpll_data *ad = clk->dpll_data; + u32 v; + + v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v &= ~ad->enable_mask; + v |= OMAP2_EN_APLL_STOPPED << __ffs(ad->enable_mask); + ti_clk_ll_ops->clk_writel(v, ad->control_reg); +} + +static struct clk_ops omap2_apll_ops = { + .enable = &omap2_apll_enable, + .disable = &omap2_apll_disable, + .is_enabled = &omap2_apll_is_enabled, + .recalc_rate = &omap2_apll_recalc, +}; + +static void omap2_apll_set_autoidle(struct clk_hw_omap *clk, u32 val) +{ + struct dpll_data *ad = clk->dpll_data; + u32 v; + + v = ti_clk_ll_ops->clk_readl(ad->autoidle_reg); + v &= ~ad->autoidle_mask; + v |= val << __ffs(ad->autoidle_mask); + ti_clk_ll_ops->clk_writel(v, ad->control_reg); +} + +#define OMAP2_APLL_AUTOIDLE_LOW_POWER_STOP 0x3 +#define OMAP2_APLL_AUTOIDLE_DISABLE 0x0 + +static void omap2_apll_allow_idle(struct clk_hw_omap *clk) +{ + omap2_apll_set_autoidle(clk, OMAP2_APLL_AUTOIDLE_LOW_POWER_STOP); +} + +static void omap2_apll_deny_idle(struct clk_hw_omap *clk) +{ + omap2_apll_set_autoidle(clk, OMAP2_APLL_AUTOIDLE_DISABLE); +} + +static struct clk_hw_omap_ops omap2_apll_hwops = { + .allow_idle = &omap2_apll_allow_idle, + .deny_idle = &omap2_apll_deny_idle, +}; + +static void __init of_omap2_apll_setup(struct device_node *node) +{ + struct dpll_data *ad = NULL; + struct clk_hw_omap *clk_hw = NULL; + struct clk_init_data *init = NULL; + struct clk *clk; + const char *parent_name; + u32 val; + + ad = kzalloc(sizeof(*clk_hw), GFP_KERNEL); + clk_hw = kzalloc(sizeof(*clk_hw), GFP_KERNEL); + init = kzalloc(sizeof(*init), GFP_KERNEL); + + if (!ad || !clk_hw || !init) + goto cleanup; + + clk_hw->dpll_data = ad; + clk_hw->hw.init = init; + init->ops = &omap2_apll_ops; + init->name = node->name; + clk_hw->ops = &omap2_apll_hwops; + + init->num_parents = of_clk_get_parent_count(node); + if (init->num_parents != 1) { + pr_err("%s must have one parent\n", node->name); + goto cleanup; + } + + parent_name = of_clk_get_parent_name(node, 0); + init->parent_names = &parent_name; + + if (of_property_read_u32(node, "ti,clock-frequency", &val)) { + pr_err("%s missing clock-frequency\n", node->name); + goto cleanup; + } + clk_hw->fixed_rate = val; + + if (of_property_read_u32(node, "ti,bit-shift", &val)) { + pr_err("%s missing bit-shift\n", node->name); + goto cleanup; + } + + clk_hw->enable_bit = val; + ad->enable_mask = 0x3 << val; + ad->autoidle_mask = 0x3 << val; + + if (of_property_read_u32(node, "ti,idlest-shift", &val)) { + pr_err("%s missing idlest-shift\n", node->name); + goto cleanup; + } + 
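+ /* Unlike the two-bit enable and autoidle fields above, idlest is a single status bit. */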
+ + ad->idlest_mask = 1 << val; + + ad->control_reg = ti_clk_get_reg_addr(node, 0); + ad->autoidle_reg = ti_clk_get_reg_addr(node, 1); + ad->idlest_reg = ti_clk_get_reg_addr(node, 2); + + if (!ad->control_reg || !ad->autoidle_reg || !ad->idlest_reg) + goto cleanup; + + clk = clk_register(NULL, &clk_hw->hw); + if (!IS_ERR(clk)) { + of_clk_add_provider(node, of_clk_src_simple_get, clk); + kfree(init); + return; + } +cleanup: + kfree(ad); + kfree(clk_hw); + kfree(init); +} +CLK_OF_DECLARE(omap2_apll_clock, "ti,omap2-apll-clock", + of_omap2_apll_setup); diff --git a/include/linux/clk/ti.h index 753878c6fa52..44bf84002a34 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -94,7 +94,26 @@ struct dpll_data { u8 flags; }; -struct clk_hw_omap_ops; +struct clk_hw_omap; + +/** + * struct clk_hw_omap_ops - OMAP clk ops + * @find_idlest: find idlest register information for a clock + * @find_companion: find companion clock register information for a clock, + * basically converts CM_ICLKEN* <-> CM_FCLKEN* + * @allow_idle: enables autoidle hardware functionality for a clock + * @deny_idle: prevent autoidle hardware functionality for a clock + */ +struct clk_hw_omap_ops { + void (*find_idlest)(struct clk_hw_omap *oclk, + void __iomem **idlest_reg, + u8 *idlest_bit, u8 *idlest_val); + void (*find_companion)(struct clk_hw_omap *oclk, + void __iomem **other_reg, + u8 *other_bit); + void (*allow_idle)(struct clk_hw_omap *oclk); + void (*deny_idle)(struct clk_hw_omap *oclk); +}; /** * struct clk_hw_omap - OMAP struct clk -- cgit v1.2.3-71-gd317 From de742570745e12b53c70130ace958f2a60044000 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Tue, 25 Feb 2014 19:16:07 +0200 Subject: CLK: TI: interface: add support for omap2430 specific interface clock OMAP2430 I2CHS modules require specific hardware ops to be used, so add a new compatible string for this. 
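For illustration, a consumer node of the new compatible might look roughly like this (the node name, parent clock and register offset are invented for the example, not taken from a real dts):

	i2chs1_ick: i2chs1_ick {
		#clock-cells = <0>;
		compatible = "ti,omap2430-interface-clock";
		clocks = <&core_l3_ck>;
		reg = <0x0214>;
	};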
Signed-off-by: Tero Kristo --- Documentation/devicetree/bindings/clock/ti/interface.txt | 2 ++ arch/arm/mach-omap2/clock.h | 1 - drivers/clk/ti/interface.c | 11 +++++++++++ include/linux/clk/ti.h | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/clock/ti/interface.txt b/Documentation/devicetree/bindings/clock/ti/interface.txt index 064e8caccac3..3111a409fea6 100644 --- a/Documentation/devicetree/bindings/clock/ti/interface.txt +++ b/Documentation/devicetree/bindings/clock/ti/interface.txt @@ -21,6 +21,8 @@ Required properties: "ti,omap3-dss-interface-clock" - interface clock with DSS specific HW handling "ti,omap3-ssi-interface-clock" - interface clock with SSI specific HW handling "ti,am35xx-interface-clock" - interface clock with AM35xx specific HW handling + "ti,omap2430-interface-clock" - interface clock with OMAP2430 specific HW + handling - #clock-cells : from common clock binding; shall be set to 0 - clocks : link to phandle of parent clock - reg : base address for the control register diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h index eb441d137843..12f54d428d7c 100644 --- a/arch/arm/mach-omap2/clock.h +++ b/arch/arm/mach-omap2/clock.h @@ -268,7 +268,6 @@ extern const struct clk_hw_omap_ops clkhwops_omap3430es2_hsotgusb_wait; extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait; extern const struct clk_hw_omap_ops clkhwops_apll54; extern const struct clk_hw_omap_ops clkhwops_apll96; -extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait; /* clksel_rate blocks shared between OMAP44xx and AM33xx */ extern const struct clksel_rate div_1_0_rates[]; diff --git a/drivers/clk/ti/interface.c b/drivers/clk/ti/interface.c index 320a2b168bb2..9c3e8c4aaa40 100644 --- a/drivers/clk/ti/interface.c +++ b/drivers/clk/ti/interface.c @@ -94,6 +94,7 @@ static void __init of_ti_no_wait_interface_clk_setup(struct device_node *node) CLK_OF_DECLARE(ti_no_wait_interface_clk, "ti,omap3-no-wait-interface-clock", of_ti_no_wait_interface_clk_setup); +#ifdef CONFIG_ARCH_OMAP3 static void __init of_ti_hsotgusb_interface_clk_setup(struct device_node *node) { _of_ti_interface_clk_setup(node, @@ -123,3 +124,13 @@ static void __init of_ti_am35xx_interface_clk_setup(struct device_node *node) } CLK_OF_DECLARE(ti_am35xx_interface_clk, "ti,am35xx-interface-clock", of_ti_am35xx_interface_clk_setup); +#endif + +#ifdef CONFIG_SOC_OMAP2430 +static void __init of_ti_omap2430_interface_clk_setup(struct device_node *node) +{ + _of_ti_interface_clk_setup(node, &clkhwops_omap2430_i2chs_wait); +} +CLK_OF_DECLARE(ti_omap2430_interface_clk, "ti,omap2430-interface-clock", + of_ti_omap2430_interface_clk_setup); +#endif diff --git a/include/linux/clk/ti.h index 44bf84002a34..a8390d478528 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -312,6 +312,7 @@ static inline void of_ti_clk_deny_autoidle_all(void) { } #endif extern const struct clk_hw_omap_ops clkhwops_omap2xxx_dpll; +extern const struct clk_hw_omap_ops clkhwops_omap2430_i2chs_wait; extern const struct clk_hw_omap_ops clkhwops_omap3_dpll; extern const struct clk_hw_omap_ops clkhwops_omap4_dpllmx; extern const struct clk_hw_omap_ops clkhwops_wait; -- cgit v1.2.3-71-gd317 From be67c3bf382c591d8267e0ef12d80041854731d9 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 24 Feb 2014 17:52:57 +0200 Subject: CLK: TI: OMAP2: add clock init support Adds support for registering the alias clocks, setting up the boot-time 
clock-enable list and disabling autoidle of clocks. Signed-off-by: Tero Kristo --- drivers/clk/ti/Makefile | 1 + drivers/clk/ti/clk-2xxx.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/clk/ti.h | 2 + 3 files changed, 257 insertions(+) create mode 100644 drivers/clk/ti/clk-2xxx.c (limited to 'include/linux') diff --git a/drivers/clk/ti/Makefile b/drivers/clk/ti/Makefile index 4319d4031aa3..4afeaed9e9ba 100644 --- a/drivers/clk/ti/Makefile +++ b/drivers/clk/ti/Makefile @@ -3,6 +3,7 @@ obj-y += clk.o autoidle.o clockdomain.o clk-common = dpll.o composite.o divider.o gate.o \ fixed-factor.o mux.o apll.o obj-$(CONFIG_SOC_AM33XX) += $(clk-common) clk-33xx.o +obj-$(CONFIG_ARCH_OMAP2) += $(clk-common) interface.o clk-2xxx.o obj-$(CONFIG_ARCH_OMAP3) += $(clk-common) interface.o clk-3xxx.o obj-$(CONFIG_ARCH_OMAP4) += $(clk-common) clk-44xx.o obj-$(CONFIG_SOC_OMAP5) += $(clk-common) clk-54xx.o diff --git a/drivers/clk/ti/clk-2xxx.c b/drivers/clk/ti/clk-2xxx.c new file mode 100644 index 000000000000..f6400fb5ee3e --- /dev/null +++ b/drivers/clk/ti/clk-2xxx.c @@ -0,0 +1,254 @@ +/* + * OMAP2 Clock init + * + * Copyright (C) 2013 Texas Instruments, Inc + * Tero Kristo (t-kristo@ti.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +static struct ti_dt_clk omap2xxx_clks[] = { + DT_CLK(NULL, "func_32k_ck", "func_32k_ck"), + DT_CLK(NULL, "secure_32k_ck", "secure_32k_ck"), + DT_CLK(NULL, "virt_12m_ck", "virt_12m_ck"), + DT_CLK(NULL, "virt_13m_ck", "virt_13m_ck"), + DT_CLK(NULL, "virt_19200000_ck", "virt_19200000_ck"), + DT_CLK(NULL, "virt_26m_ck", "virt_26m_ck"), + DT_CLK(NULL, "aplls_clkin_ck", "aplls_clkin_ck"), + DT_CLK(NULL, "aplls_clkin_x2_ck", "aplls_clkin_x2_ck"), + DT_CLK(NULL, "osc_ck", "osc_ck"), + DT_CLK(NULL, "sys_ck", "sys_ck"), + DT_CLK(NULL, "alt_ck", "alt_ck"), + DT_CLK(NULL, "mcbsp_clks", "mcbsp_clks"), + DT_CLK(NULL, "dpll_ck", "dpll_ck"), + DT_CLK(NULL, "apll96_ck", "apll96_ck"), + DT_CLK(NULL, "apll54_ck", "apll54_ck"), + DT_CLK(NULL, "func_54m_ck", "func_54m_ck"), + DT_CLK(NULL, "core_ck", "core_ck"), + DT_CLK(NULL, "func_96m_ck", "func_96m_ck"), + DT_CLK(NULL, "func_48m_ck", "func_48m_ck"), + DT_CLK(NULL, "func_12m_ck", "func_12m_ck"), + DT_CLK(NULL, "sys_clkout_src", "sys_clkout_src"), + DT_CLK(NULL, "sys_clkout", "sys_clkout"), + DT_CLK(NULL, "emul_ck", "emul_ck"), + DT_CLK(NULL, "mpu_ck", "mpu_ck"), + DT_CLK(NULL, "dsp_fck", "dsp_fck"), + DT_CLK(NULL, "gfx_3d_fck", "gfx_3d_fck"), + DT_CLK(NULL, "gfx_2d_fck", "gfx_2d_fck"), + DT_CLK(NULL, "gfx_ick", "gfx_ick"), + DT_CLK("omapdss_dss", "ick", "dss_ick"), + DT_CLK(NULL, "dss_ick", "dss_ick"), + DT_CLK(NULL, "dss1_fck", "dss1_fck"), + DT_CLK(NULL, "dss2_fck", "dss2_fck"), + DT_CLK(NULL, "dss_54m_fck", "dss_54m_fck"), + DT_CLK(NULL, "core_l3_ck", "core_l3_ck"), + DT_CLK(NULL, "ssi_fck", "ssi_ssr_sst_fck"), + DT_CLK(NULL, "usb_l4_ick", "usb_l4_ick"), + DT_CLK(NULL, "l4_ck", "l4_ck"), + DT_CLK(NULL, "ssi_l4_ick", "ssi_l4_ick"), + DT_CLK(NULL, "gpt1_ick", "gpt1_ick"), + DT_CLK(NULL, "gpt1_fck", "gpt1_fck"), + DT_CLK(NULL, "gpt2_ick", "gpt2_ick"), + DT_CLK(NULL, 
"gpt2_fck", "gpt2_fck"), + DT_CLK(NULL, "gpt3_ick", "gpt3_ick"), + DT_CLK(NULL, "gpt3_fck", "gpt3_fck"), + DT_CLK(NULL, "gpt4_ick", "gpt4_ick"), + DT_CLK(NULL, "gpt4_fck", "gpt4_fck"), + DT_CLK(NULL, "gpt5_ick", "gpt5_ick"), + DT_CLK(NULL, "gpt5_fck", "gpt5_fck"), + DT_CLK(NULL, "gpt6_ick", "gpt6_ick"), + DT_CLK(NULL, "gpt6_fck", "gpt6_fck"), + DT_CLK(NULL, "gpt7_ick", "gpt7_ick"), + DT_CLK(NULL, "gpt7_fck", "gpt7_fck"), + DT_CLK(NULL, "gpt8_ick", "gpt8_ick"), + DT_CLK(NULL, "gpt8_fck", "gpt8_fck"), + DT_CLK(NULL, "gpt9_ick", "gpt9_ick"), + DT_CLK(NULL, "gpt9_fck", "gpt9_fck"), + DT_CLK(NULL, "gpt10_ick", "gpt10_ick"), + DT_CLK(NULL, "gpt10_fck", "gpt10_fck"), + DT_CLK(NULL, "gpt11_ick", "gpt11_ick"), + DT_CLK(NULL, "gpt11_fck", "gpt11_fck"), + DT_CLK(NULL, "gpt12_ick", "gpt12_ick"), + DT_CLK(NULL, "gpt12_fck", "gpt12_fck"), + DT_CLK("omap-mcbsp.1", "ick", "mcbsp1_ick"), + DT_CLK(NULL, "mcbsp1_ick", "mcbsp1_ick"), + DT_CLK(NULL, "mcbsp1_fck", "mcbsp1_fck"), + DT_CLK("omap-mcbsp.2", "ick", "mcbsp2_ick"), + DT_CLK(NULL, "mcbsp2_ick", "mcbsp2_ick"), + DT_CLK(NULL, "mcbsp2_fck", "mcbsp2_fck"), + DT_CLK("omap2_mcspi.1", "ick", "mcspi1_ick"), + DT_CLK(NULL, "mcspi1_ick", "mcspi1_ick"), + DT_CLK(NULL, "mcspi1_fck", "mcspi1_fck"), + DT_CLK("omap2_mcspi.2", "ick", "mcspi2_ick"), + DT_CLK(NULL, "mcspi2_ick", "mcspi2_ick"), + DT_CLK(NULL, "mcspi2_fck", "mcspi2_fck"), + DT_CLK(NULL, "uart1_ick", "uart1_ick"), + DT_CLK(NULL, "uart1_fck", "uart1_fck"), + DT_CLK(NULL, "uart2_ick", "uart2_ick"), + DT_CLK(NULL, "uart2_fck", "uart2_fck"), + DT_CLK(NULL, "uart3_ick", "uart3_ick"), + DT_CLK(NULL, "uart3_fck", "uart3_fck"), + DT_CLK(NULL, "gpios_ick", "gpios_ick"), + DT_CLK(NULL, "gpios_fck", "gpios_fck"), + DT_CLK("omap_wdt", "ick", "mpu_wdt_ick"), + DT_CLK(NULL, "mpu_wdt_ick", "mpu_wdt_ick"), + DT_CLK(NULL, "mpu_wdt_fck", "mpu_wdt_fck"), + DT_CLK(NULL, "sync_32k_ick", "sync_32k_ick"), + DT_CLK(NULL, "wdt1_ick", "wdt1_ick"), + DT_CLK(NULL, "omapctrl_ick", "omapctrl_ick"), + DT_CLK("omap24xxcam", "fck", "cam_fck"), + DT_CLK(NULL, "cam_fck", "cam_fck"), + DT_CLK("omap24xxcam", "ick", "cam_ick"), + DT_CLK(NULL, "cam_ick", "cam_ick"), + DT_CLK(NULL, "mailboxes_ick", "mailboxes_ick"), + DT_CLK(NULL, "wdt4_ick", "wdt4_ick"), + DT_CLK(NULL, "wdt4_fck", "wdt4_fck"), + DT_CLK(NULL, "mspro_ick", "mspro_ick"), + DT_CLK(NULL, "mspro_fck", "mspro_fck"), + DT_CLK(NULL, "fac_ick", "fac_ick"), + DT_CLK(NULL, "fac_fck", "fac_fck"), + DT_CLK("omap_hdq.0", "ick", "hdq_ick"), + DT_CLK(NULL, "hdq_ick", "hdq_ick"), + DT_CLK("omap_hdq.0", "fck", "hdq_fck"), + DT_CLK(NULL, "hdq_fck", "hdq_fck"), + DT_CLK("omap_i2c.1", "ick", "i2c1_ick"), + DT_CLK(NULL, "i2c1_ick", "i2c1_ick"), + DT_CLK("omap_i2c.2", "ick", "i2c2_ick"), + DT_CLK(NULL, "i2c2_ick", "i2c2_ick"), + DT_CLK(NULL, "gpmc_fck", "gpmc_fck"), + DT_CLK(NULL, "sdma_fck", "sdma_fck"), + DT_CLK(NULL, "sdma_ick", "sdma_ick"), + DT_CLK(NULL, "sdrc_ick", "sdrc_ick"), + DT_CLK(NULL, "des_ick", "des_ick"), + DT_CLK("omap-sham", "ick", "sha_ick"), + DT_CLK(NULL, "sha_ick", "sha_ick"), + DT_CLK("omap_rng", "ick", "rng_ick"), + DT_CLK(NULL, "rng_ick", "rng_ick"), + DT_CLK("omap-aes", "ick", "aes_ick"), + DT_CLK(NULL, "aes_ick", "aes_ick"), + DT_CLK(NULL, "pka_ick", "pka_ick"), + DT_CLK(NULL, "usb_fck", "usb_fck"), + DT_CLK(NULL, "timer_32k_ck", "func_32k_ck"), + DT_CLK(NULL, "timer_sys_ck", "sys_ck"), + DT_CLK(NULL, "timer_ext_ck", "alt_ck"), + { .node_name = NULL }, +}; + +static struct ti_dt_clk omap2420_clks[] = { + DT_CLK(NULL, "sys_clkout2_src", "sys_clkout2_src"), + DT_CLK(NULL, 
"sys_clkout2", "sys_clkout2"), + DT_CLK(NULL, "dsp_ick", "dsp_ick"), + DT_CLK(NULL, "iva1_ifck", "iva1_ifck"), + DT_CLK(NULL, "iva1_mpu_int_ifck", "iva1_mpu_int_ifck"), + DT_CLK(NULL, "wdt3_ick", "wdt3_ick"), + DT_CLK(NULL, "wdt3_fck", "wdt3_fck"), + DT_CLK("mmci-omap.0", "ick", "mmc_ick"), + DT_CLK(NULL, "mmc_ick", "mmc_ick"), + DT_CLK("mmci-omap.0", "fck", "mmc_fck"), + DT_CLK(NULL, "mmc_fck", "mmc_fck"), + DT_CLK(NULL, "eac_ick", "eac_ick"), + DT_CLK(NULL, "eac_fck", "eac_fck"), + DT_CLK(NULL, "i2c1_fck", "i2c1_fck"), + DT_CLK(NULL, "i2c2_fck", "i2c2_fck"), + DT_CLK(NULL, "vlynq_ick", "vlynq_ick"), + DT_CLK(NULL, "vlynq_fck", "vlynq_fck"), + DT_CLK("musb-hdrc", "fck", "osc_ck"), + { .node_name = NULL }, +}; + +static struct ti_dt_clk omap2430_clks[] = { + DT_CLK("twl", "fck", "osc_ck"), + DT_CLK(NULL, "iva2_1_ick", "iva2_1_ick"), + DT_CLK(NULL, "mdm_ick", "mdm_ick"), + DT_CLK(NULL, "mdm_osc_ck", "mdm_osc_ck"), + DT_CLK("omap-mcbsp.3", "ick", "mcbsp3_ick"), + DT_CLK(NULL, "mcbsp3_ick", "mcbsp3_ick"), + DT_CLK(NULL, "mcbsp3_fck", "mcbsp3_fck"), + DT_CLK("omap-mcbsp.4", "ick", "mcbsp4_ick"), + DT_CLK(NULL, "mcbsp4_ick", "mcbsp4_ick"), + DT_CLK(NULL, "mcbsp4_fck", "mcbsp4_fck"), + DT_CLK("omap-mcbsp.5", "ick", "mcbsp5_ick"), + DT_CLK(NULL, "mcbsp5_ick", "mcbsp5_ick"), + DT_CLK(NULL, "mcbsp5_fck", "mcbsp5_fck"), + DT_CLK("omap2_mcspi.3", "ick", "mcspi3_ick"), + DT_CLK(NULL, "mcspi3_ick", "mcspi3_ick"), + DT_CLK(NULL, "mcspi3_fck", "mcspi3_fck"), + DT_CLK(NULL, "icr_ick", "icr_ick"), + DT_CLK(NULL, "i2chs1_fck", "i2chs1_fck"), + DT_CLK(NULL, "i2chs2_fck", "i2chs2_fck"), + DT_CLK("musb-omap2430", "ick", "usbhs_ick"), + DT_CLK(NULL, "usbhs_ick", "usbhs_ick"), + DT_CLK("omap_hsmmc.0", "ick", "mmchs1_ick"), + DT_CLK(NULL, "mmchs1_ick", "mmchs1_ick"), + DT_CLK(NULL, "mmchs1_fck", "mmchs1_fck"), + DT_CLK("omap_hsmmc.1", "ick", "mmchs2_ick"), + DT_CLK(NULL, "mmchs2_ick", "mmchs2_ick"), + DT_CLK(NULL, "mmchs2_fck", "mmchs2_fck"), + DT_CLK(NULL, "gpio5_ick", "gpio5_ick"), + DT_CLK(NULL, "gpio5_fck", "gpio5_fck"), + DT_CLK(NULL, "mdm_intc_ick", "mdm_intc_ick"), + DT_CLK("omap_hsmmc.0", "mmchsdb_fck", "mmchsdb1_fck"), + DT_CLK(NULL, "mmchsdb1_fck", "mmchsdb1_fck"), + DT_CLK("omap_hsmmc.1", "mmchsdb_fck", "mmchsdb2_fck"), + DT_CLK(NULL, "mmchsdb2_fck", "mmchsdb2_fck"), + { .node_name = NULL }, +}; + +static const char *enable_init_clks[] = { + "apll96_ck", + "apll54_ck", + "sync_32k_ick", + "omapctrl_ick", + "gpmc_fck", + "sdrc_ick", +}; + +enum { + OMAP2_SOC_OMAP2420, + OMAP2_SOC_OMAP2430, +}; + +static int __init omap2xxx_dt_clk_init(int soc_type) +{ + ti_dt_clocks_register(omap2xxx_clks); + + if (soc_type == OMAP2_SOC_OMAP2420) + ti_dt_clocks_register(omap2420_clks); + else + ti_dt_clocks_register(omap2430_clks); + + omap2_clk_disable_autoidle_all(); + + omap2_clk_enable_init_clocks(enable_init_clks, + ARRAY_SIZE(enable_init_clks)); + + pr_info("Clocking rate (Crystal/DPLL/MPU): %ld.%01ld/%ld/%ld MHz\n", + (clk_get_rate(clk_get_sys(NULL, "sys_ck")) / 1000000), + (clk_get_rate(clk_get_sys(NULL, "sys_ck")) / 100000) % 10, + (clk_get_rate(clk_get_sys(NULL, "dpll_ck")) / 1000000), + (clk_get_rate(clk_get_sys(NULL, "mpu_ck")) / 1000000)); + + return 0; +} + +int __init omap2420_dt_clk_init(void) +{ + return omap2xxx_dt_clk_init(OMAP2_SOC_OMAP2420); +} + +int __init omap2430_dt_clk_init(void) +{ + return omap2xxx_dt_clk_init(OMAP2_SOC_OMAP2430); +} diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index a8390d478528..188f0cbb26c2 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ 
-302,6 +302,8 @@ int omap5xxx_dt_clk_init(void); int dra7xx_dt_clk_init(void); int am33xx_dt_clk_init(void); int am43xx_dt_clk_init(void); +int omap2420_dt_clk_init(void); +int omap2430_dt_clk_init(void); #ifdef CONFIG_OF void of_ti_clk_allow_autoidle_all(void); -- cgit v1.2.3-71-gd317 From 61f25ca76ccc7b63371a7a6b0b8b9a8a46745b79 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 24 Feb 2014 18:49:35 +0200 Subject: ARM: OMAP2: clock: add DT boot support for cpufreq_ck The clock and clkdev for this are added manually. Signed-off-by: Tero Kristo --- arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c | 53 ++++++++++++++++++++++++++++ drivers/clk/ti/clk-2xxx.c | 2 ++ include/linux/clk/ti.h | 1 + 3 files changed, 56 insertions(+) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c b/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c index b935ed2922d8..85e0b0c06718 100644 --- a/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c +++ b/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c @@ -208,3 +208,56 @@ void omap2xxx_clkt_vps_late_init(void) clk_put(c); } } + +#ifdef CONFIG_OF +#include +#include + +static const struct clk_ops virt_prcm_set_ops = { + .recalc_rate = &omap2_table_mpu_recalc, + .set_rate = &omap2_select_table_rate, + .round_rate = &omap2_round_to_table_rate, +}; + +/** + * omap2xxx_clkt_vps_init - initialize virt_prcm_set clock + * + * Does a manual init for the virtual prcm DVFS clock for OMAP2. This + * function is called only from omap2 DT clock init, as the virtual + * node is not modelled in the DT clock data. + */ +void omap2xxx_clkt_vps_init(void) +{ + struct clk_init_data init = { NULL }; + struct clk_hw_omap *hw = NULL; + struct clk *clk; + const char *parent_name = "mpu_ck"; + struct clk_lookup *lookup = NULL; + + omap2xxx_clkt_vps_late_init(); + omap2xxx_clkt_vps_check_bootloader_rates(); + + hw = kzalloc(sizeof(*hw), GFP_KERNEL); + lookup = kzalloc(sizeof(*lookup), GFP_KERNEL); + if (!hw || !lookup) + goto cleanup; + init.name = "virt_prcm_set"; + init.ops = &virt_prcm_set_ops; + init.parent_names = &parent_name; + init.num_parents = 1; + + hw->hw.init = &init; + + clk = clk_register(NULL, &hw->hw); + + lookup->dev_id = NULL; + lookup->con_id = "cpufreq_ck"; + lookup->clk = clk; + + clkdev_add(lookup); + return; +cleanup: + kfree(hw); + kfree(lookup); +} +#endif diff --git a/drivers/clk/ti/clk-2xxx.c b/drivers/clk/ti/clk-2xxx.c index f6400fb5ee3e..c808ab3d2bb2 100644 --- a/drivers/clk/ti/clk-2xxx.c +++ b/drivers/clk/ti/clk-2xxx.c @@ -229,6 +229,8 @@ static int __init omap2xxx_dt_clk_init(int soc_type) else ti_dt_clocks_register(omap2430_clks); + omap2xxx_clkt_vps_init(); + omap2_clk_disable_autoidle_all(); omap2_clk_enable_init_clocks(enable_init_clks, diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 188f0cbb26c2..4231c41bed51 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -283,6 +283,7 @@ unsigned long omap2_dpllcore_recalc(struct clk_hw *hw, int omap2_reprogram_dpllcore(struct clk_hw *clk, unsigned long rate, unsigned long parent_rate); void omap2xxx_clkt_dpllcore_init(struct clk_hw *hw); +void omap2xxx_clkt_vps_init(void); void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index); void ti_dt_clocks_register(struct ti_dt_clk *oclks); -- cgit v1.2.3-71-gd317 From 494b6590043b4cd73ceb3f58e1c012a2c6c98d85 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Mon, 26 May 2014 18:06:34 +0200 Subject: wireless: add missing WLAN_EID_BSS_INTOLERANT_CHL_REPORT Signed-off-by: Jes Sorensen Signed-off-by: Johannes 
Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f194ccb8539c..6bff13f74050 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1711,6 +1711,7 @@ enum ieee80211_eid { WLAN_EID_RRM_ENABLED_CAPABILITIES = 70, WLAN_EID_MULTIPLE_BSSID = 71, WLAN_EID_BSS_COEX_2040 = 72, + WLAN_EID_BSS_INTOLERANT_CHL_REPORT = 73, WLAN_EID_OVERLAP_BSS_SCAN_PARAM = 74, WLAN_EID_RIC_DESCRIPTOR = 75, WLAN_EID_MMIE = 76, -- cgit v1.2.3-71-gd317 From f08dbf8a61462aa122b9b5077849a3f4bd84702a Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 8 Jan 2014 11:23:35 +0000 Subject: cpuidle: declare cpuidle_dev in cpuidle.h Declaring this allows drivers which need to initialise each struct cpuidle_device at initialisation time to make use of the structures already defined in cpuidle.c, rather than having to wastefully define their own. Signed-off-by: Paul Burton --- include/linux/cpuidle.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index b0238cba440b..99cbd7a74e9f 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -84,6 +84,7 @@ struct cpuidle_device { }; DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); +DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); /** * cpuidle_get_last_residency - retrieves the last state's residency time -- cgit v1.2.3-71-gd317 From c25dc82899e67a32fdcfb20dd72a37fc236fde2e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 22 May 2014 17:07:30 -0600 Subject: PCI: Add DMA alias iterator In a mixed PCI/PCI-X/PCIe topology, bridges can take ownership of transactions, replacing the original requester ID with their own. Sometimes we just want to know the resulting device or resulting alias; other times we want each step in the chain. This iterator allows either usage. When an endpoint is connected via an unbroken chain of PCIe switches and root ports, it has no alias and its requester ID is visible to the root bus. When PCI/X get in the way, we pick up aliases for bridges. The reason why we potentially care about each step in the path is because of PCI-X. PCI-X has the concept of a requester ID, but bridges may or may not take ownership of various types of transactions. We therefore leave it to the consumer of this function to prune out what they don't care about rather than attempt to flatten the alias ourselves. Tested-by: George Spelvin Tested-by: Pat Erley Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas --- drivers/pci/search.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 4 +++ 2 files changed, 74 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 4a1b972efe7f..5601cdb8bbb3 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -17,6 +17,76 @@ DECLARE_RWSEM(pci_bus_sem); EXPORT_SYMBOL_GPL(pci_bus_sem); +/* + * pci_for_each_dma_alias - Iterate over DMA aliases for a device + * @pdev: starting downstream device + * @fn: function to call for each alias + * @data: opaque data to pass to @fn + * + * Starting @pdev, walk up the bus calling @fn for each possible alias + * of @pdev at the root bus. 
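+ *
+ * The walk stops as soon as @fn returns a non-zero value, and that value is
+ * returned to the caller; a @fn that always returns 0 visits every alias.
+ * A minimal caller might look like this (illustrative sketch only;
+ * report_alias is an invented name):
+ *
+ *	static int report_alias(struct pci_dev *pdev, u16 alias, void *data)
+ *	{
+ *		dev_info(&pdev->dev, "DMA alias %04x\n", alias);
+ *		return 0;
+ *	}
+ *
+ *	pci_for_each_dma_alias(pdev, report_alias, NULL);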
+ */ +int pci_for_each_dma_alias(struct pci_dev *pdev, + int (*fn)(struct pci_dev *pdev, + u16 alias, void *data), void *data) +{ + struct pci_bus *bus; + int ret; + + ret = fn(pdev, PCI_DEVID(pdev->bus->number, pdev->devfn), data); + if (ret) + return ret; + + for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) { + struct pci_dev *tmp; + + /* Skip virtual buses */ + if (!bus->self) + continue; + + tmp = bus->self; + + /* + * PCIe-to-PCI/X bridges alias transactions from downstream + * devices using the subordinate bus number (PCI Express to + * PCI/PCI-X Bridge Spec, rev 1.0, sec 2.3). For all cases + * where the upstream bus is PCI/X we alias to the bridge + * (there are various conditions in the previous reference + * where the bridge may take ownership of transactions, even + * when the secondary interface is PCI-X). + */ + if (pci_is_pcie(tmp)) { + switch (pci_pcie_type(tmp)) { + case PCI_EXP_TYPE_ROOT_PORT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_DOWNSTREAM: + continue; + case PCI_EXP_TYPE_PCI_BRIDGE: + ret = fn(tmp, + PCI_DEVID(tmp->subordinate->number, + PCI_DEVFN(0, 0)), data); + if (ret) + return ret; + continue; + case PCI_EXP_TYPE_PCIE_BRIDGE: + ret = fn(tmp, + PCI_DEVID(tmp->bus->number, + tmp->devfn), data); + if (ret) + return ret; + continue; + } + } else { + ret = fn(tmp, PCI_DEVID(tmp->bus->number, tmp->devfn), + data); + if (ret) + return ret; + } + } + + return ret; +} + /* * find the upstream PCIe-to-PCI bridge of a PCI device * if the device is PCIE, return NULL diff --git a/include/linux/pci.h b/include/linux/pci.h index aab57b4abe7f..14b074bbc841 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1795,6 +1795,10 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev) } #endif +int pci_for_each_dma_alias(struct pci_dev *pdev, + int (*fn)(struct pci_dev *pdev, + u16 alias, void *data), void *data); + /** * pci_find_upstream_pcie_bridge - find upstream PCIe-to-PCI bridge of a device * @pdev: the PCI device -- cgit v1.2.3-71-gd317 From 6b121592f8a3fd2bd0de128637b76a0d0864d993 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 22 May 2014 17:07:36 -0600 Subject: PCI: Convert pci_dev_flags definitions to bit shifts Convert the pci_dev_flags definitions from decimal constants to bit shifts. We're only a few entries away from where using the decimal value becomes cumbersome. No functional change. Tested-by: George Spelvin Tested-by: Pat Erley Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 14b074bbc841..545903df00dc 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -164,13 +164,13 @@ enum pci_dev_flags { /* INTX_DISABLE in PCI_COMMAND register disables MSI * generation too. 
*/ - PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG = (__force pci_dev_flags_t) 1, + PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG = (__force pci_dev_flags_t) (1 << 0), /* Device configuration is irrevocably lost if disabled into D3 */ - PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2, + PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) (1 << 1), /* Provide indication device is assigned by a Virtual Machine Manager */ - PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4, + PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) (1 << 2), /* Flag for quirk use to store if quirk-specific ACS is enabled */ - PCI_DEV_FLAGS_ACS_ENABLED_QUIRK = (__force pci_dev_flags_t) 8, + PCI_DEV_FLAGS_ACS_ENABLED_QUIRK = (__force pci_dev_flags_t) (1 << 3), }; enum pci_irq_reroute_variant { -- cgit v1.2.3-71-gd317 From 31c2b8153c58f11ddb80dfd392c16f13c2d709c6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 22 May 2014 17:07:43 -0600 Subject: PCI: Add support for DMA alias quirks Some devices are broken and use a requester ID other than their physical devfn. Add a byte, using an existing gap in the pci_dev structure, to store an alternate "alias" devfn. A bit in the dev_flags tells us when this is valid. We then add the alias as one more step in the pci_for_each_dma_alias() iterator. Tested-by: George Spelvin Tested-by: Pat Erley Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas --- drivers/pci/search.c | 11 +++++++++++ include/linux/pci.h | 3 +++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 5601cdb8bbb3..2c19f3f40621 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -37,6 +37,17 @@ int pci_for_each_dma_alias(struct pci_dev *pdev, if (ret) return ret; + /* + * If the device is broken and uses an alias requester ID for + * DMA, iterate over that too. + */ + if (unlikely(pdev->dev_flags & PCI_DEV_FLAGS_DMA_ALIAS_DEVFN)) { + ret = fn(pdev, PCI_DEVID(pdev->bus->number, + pdev->dma_alias_devfn), data); + if (ret) + return ret; + } + for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) { struct pci_dev *tmp; diff --git a/include/linux/pci.h b/include/linux/pci.h index 545903df00dc..9d4035c276f4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -171,6 +171,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) (1 << 2), /* Flag for quirk use to store if quirk-specific ACS is enabled */ PCI_DEV_FLAGS_ACS_ENABLED_QUIRK = (__force pci_dev_flags_t) (1 << 3), + /* Flag to indicate the device uses dma_alias_devfn */ + PCI_DEV_FLAGS_DMA_ALIAS_DEVFN = (__force pci_dev_flags_t) (1 << 4), }; enum pci_irq_reroute_variant { @@ -268,6 +270,7 @@ struct pci_dev { u8 rom_base_reg; /* which config register controls the ROM */ u8 pin; /* which interrupt pin this device uses */ u16 pcie_flags_reg; /* cached PCIe Capabilities Register */ + u8 dma_alias_devfn;/* devfn of DMA alias, if any */ struct pci_driver *driver; /* which driver has allocated this device */ u64 dma_mask; /* Mask of the bits of bus address this -- cgit v1.2.3-71-gd317 From a20c93e3160e37ecccc738d8eef085c8507949ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Apr 2014 15:07:21 +0200 Subject: nfs: remove ->write_pageio_init from rpc ops The write_pageio_init method is just a very convoluted way to grab the right nfs_pageio_ops vector. 
The vector to choose is not a choice of protocol version, but just a pNFS vs MDS I/O choice that can simply be done inside nfs_pageio_init_write based on the presence of a layout driver, and a new force_mds flag to handle the special case of falling back to MDS I/O on a pNFS-capable volume. Signed-off-by: Christoph Hellwig Tested-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 ++-- fs/nfs/internal.h | 2 +- fs/nfs/nfs3proc.c | 1 - fs/nfs/nfs4proc.c | 1 - fs/nfs/pnfs.c | 16 +--------------- fs/nfs/pnfs.h | 8 -------- fs/nfs/proc.c | 1 - fs/nfs/write.c | 21 +++++++++++++-------- include/linux/nfs_xdr.h | 2 -- 9 files changed, 17 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b8797ae6831f..6a31102b0819 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -564,7 +564,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) dreq->count = 0; get_dreq(dreq); - NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE, + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; @@ -874,7 +874,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; unsigned long seg; - NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE, + nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index dd8bfc2e2464..8431083de179 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -422,7 +422,7 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern struct nfs_write_header *nfs_writehdr_alloc(void); extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index db60149c4579..e98488053906 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -951,7 +951,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 397be39c6dc8..8da0c62966b5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8437,7 +8437,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, - .write_pageio_init = pnfs_pageio_init_write, .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index cb53d450ae32..9edac9f01c2a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1447,20 +1447,6 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); } -void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, - int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server =
NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); -} - bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) @@ -1496,7 +1482,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 023793909778..e9ac8fbaee3d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -182,8 +182,6 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, const struct nfs_pgio_completion_ops *); -void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, - int, const struct nfs_pgio_completion_ops *); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); @@ -467,12 +465,6 @@ static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, str nfs_pageio_init_read(pgio, inode, compl_ops); } -static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); -} - static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e55ce9e8b034..f9cc29590a18 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -739,7 +739,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cd7c651f9b84..ee6d46fde76c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -354,10 +354,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, - page->mapping->host, - wb_priority(wbc), - &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) @@ -400,7 +398,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + &nfs_async_write_completion_ops); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); @@ -1282,11 +1281,17 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { }; void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, 
bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, - NFS_SERVER(inode)->wsize, ioflags); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pageio_write_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6fb5b2335b59..78216f859527 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1471,8 +1471,6 @@ struct nfs_rpc_ops { int (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *); int (*read_done) (struct rpc_task *, struct nfs_read_data *); void (*write_setup) (struct nfs_write_data *, struct rpc_message *); - void (*write_pageio_init)(struct nfs_pageio_descriptor *, struct inode *, int, - const struct nfs_pgio_completion_ops *); int (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *); int (*write_done) (struct rpc_task *, struct nfs_write_data *); void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *); -- cgit v1.2.3-71-gd317 From fab5fc25d230edcc8ee72367e505955a2fae0cac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Apr 2014 15:07:22 +0200 Subject: nfs: remove ->read_pageio_init from rpc ops The read_pageio_init method is just a very convoluted way to grab the right nfs_pageio_ops vector. The vector to choose is not a choice of protocol version, but just a pNFS vs MDS I/O choice that can simply be done inside nfs_pageio_init_read based on the presence of a layout driver, and a new force_mds flag to handle the special case of falling back to MDS I/O on a pNFS-capable volume.
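In sketch form, the selection nfs_pageio_init_read now performs is the following (this mirrors the real hunk, including its CONFIG_NFS_V4_1 guard, in fs/nfs/read.c below):

	/* Prefer the layout driver's read ops when one is attached,
	 * unless the caller explicitly forces MDS I/O. */
	const struct nfs_pageio_ops *pg_ops = &nfs_pageio_read_ops;

	if (server->pnfs_curr_ld && !force_mds)
		pg_ops = server->pnfs_curr_ld->pg_read_ops;
	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, server->rsize, 0);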
Signed-off-by: Christoph Hellwig Tested-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- fs/nfs/internal.h | 2 +- fs/nfs/nfs3proc.c | 1 - fs/nfs/nfs4proc.c | 1 - fs/nfs/pnfs.c | 15 +-------------- fs/nfs/pnfs.h | 9 --------- fs/nfs/proc.c | 1 - fs/nfs/read.c | 19 ++++++++++++++----- include/linux/nfs_xdr.h | 2 -- 9 files changed, 17 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6a31102b0819..bbe688e2cc89 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -424,7 +424,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; unsigned long seg; - NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, + nfs_pageio_init_read(&desc, dreq->inode, false, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8431083de179..98fe618db2aa 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -398,7 +398,7 @@ struct nfs_pgio_completion_ops; extern struct nfs_read_header *nfs_readhdr_alloc(void); extern void nfs_readhdr_free(struct nfs_pgio_header *hdr); extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern int nfs_initiate_read(struct rpc_clnt *clnt, struct nfs_read_data *data, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e98488053906..d873241a9b3a 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -947,7 +947,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, .read_setup = nfs3_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8da0c62966b5..21cd1f2ee35a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8433,7 +8433,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, .read_setup = nfs4_proc_read_setup, - .read_pageio_init = pnfs_pageio_init_read, .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 9edac9f01c2a..3d5bc2baafd1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1434,19 +1434,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); -void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_read(pgio, inode, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); -} - bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) @@ -1641,7 +1628,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_read(&pgio, inode, compl_ops); + nfs_pageio_init_read(&pgio, inode, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); diff --git a/fs/nfs/pnfs.h 
b/fs/nfs/pnfs.h index e9ac8fbaee3d..94a9a1834b3f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -180,9 +180,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, - const struct nfs_pgio_completion_ops *); - void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); @@ -459,12 +456,6 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_read(pgio, inode, compl_ops); -} - static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f9cc29590a18..8cc227fcd4d2 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -735,7 +735,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .pathconf = nfs_proc_pathconf, .decode_dirent = nfs2_decode_dirent, .read_setup = nfs_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 411aedda14bb..7f87461be3a9 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -24,6 +24,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -114,11 +115,17 @@ int nfs_return_empty_page(struct page *page) } void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops, - NFS_SERVER(inode)->rsize, 0); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pageio_read_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -147,7 +154,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); NFS_I(inode)->read_io += pgio.pg_bytes_written; @@ -654,7 +662,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 78216f859527..3e8fc1fe585b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1466,8 +1466,6 @@ struct nfs_rpc_ops { int (*set_capabilities)(struct nfs_server *, struct nfs_fh *); int 
(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int); void (*read_setup) (struct nfs_read_data *, struct rpc_message *); - void (*read_pageio_init)(struct nfs_pageio_descriptor *, struct inode *, - const struct nfs_pgio_completion_ops *); int (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *); int (*read_done) (struct rpc_task *, struct nfs_read_data *); void (*write_setup) (struct nfs_write_data *, struct rpc_message *); -- cgit v1.2.3-71-gd317 From c8fe16e3f96a9bb95a10cedb19d2be2d2d580940 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 28 May 2014 14:57:02 -0600 Subject: PCI: Add support for PCIe-to-PCI bridge DMA alias quirks Several PCIe-to-PCI bridges fail to provide a PCIe capability, causing us to handle them as conventional PCI devices when they really use the requester ID of the secondary bus. We need to differentiate these from PCIe-to-PCI bridges that actually use the conventional PCI ID when a PCIe capability is not present, such as those found on the root complex of many Intel chipsets. Add a dev_flag bit to identify devices to be handled as standard PCIe-to-PCI bridges. Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas --- drivers/pci/search.c | 10 ++++++++-- include/linux/pci.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 2c19f3f40621..df38f73f091f 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -88,8 +88,14 @@ int pci_for_each_dma_alias(struct pci_dev *pdev, continue; } } else { - ret = fn(tmp, PCI_DEVID(tmp->bus->number, tmp->devfn), - data); + if (tmp->dev_flags & PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS) + ret = fn(tmp, + PCI_DEVID(tmp->subordinate->number, + PCI_DEVFN(0, 0)), data); + else + ret = fn(tmp, + PCI_DEVID(tmp->bus->number, + tmp->devfn), data); if (ret) return ret; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 9d4035c276f4..85ab35e974a9 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -173,6 +173,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_ACS_ENABLED_QUIRK = (__force pci_dev_flags_t) (1 << 3), /* Flag to indicate the device uses dma_alias_devfn */ PCI_DEV_FLAGS_DMA_ALIAS_DEVFN = (__force pci_dev_flags_t) (1 << 4), + /* Use a PCIe-to-PCI bridge alias even if !pci_is_pcie */ + PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS = (__force pci_dev_flags_t) (1 << 5), }; enum pci_irq_reroute_variant { -- cgit v1.2.3-71-gd317 From 3c6b899c49e5e9c2803b59ee553eddaf69cea7f6 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:24 -0400 Subject: NFS: Create a common argument structure for reads and writes Reads and writes have very similar arguments. This patch combines them together and documents the few fields used only by write.
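One thing the shared type makes possible, sketched here as a hypothetical helper that is not part of this patch, is argument setup written once for both directions:

	/* Hypothetical sketch: fill only the fields READ and WRITE share;
	 * the write-only members (stable, bitmask) are left to the write
	 * path. */
	static void nfs_pgio_args_fill(struct nfs_pgio_args *args,
				       struct nfs_fh *fh, __u64 offset,
				       __u32 count, struct page **pages,
				       unsigned int pgbase)
	{
		args->fh = fh;
		args->offset = offset;
		args->count = count;
		args->pages = pages;
		args->pgbase = pgbase;
	}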
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 8 ++++---- fs/nfs/nfs3xdr.c | 8 ++++---- fs/nfs/nfs4proc.c | 4 ++-- fs/nfs/nfs4xdr.c | 10 ++++++---- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- include/linux/nfs_xdr.h | 47 +++++++++++++++++++---------------------------- 7 files changed, 37 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 62db136339ea..461cd8bd9401 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -613,7 +613,7 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, * }; */ static void encode_readargs(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -629,7 +629,7 @@ static void encode_readargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_readargs(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -649,7 +649,7 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, * }; */ static void encode_writeargs(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -669,7 +669,7 @@ static void encode_writeargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_writeargs(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index fa6d72131c19..02f16c212007 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -953,7 +953,7 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, * }; */ static void encode_read3args(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -966,7 +966,7 @@ static void encode_read3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -992,7 +992,7 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, * }; */ static void encode_write3args(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -1008,7 +1008,7 @@ static void encode_write3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_write3args(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 21cd1f2ee35a..4794ca693367 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4055,7 +4055,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) } static bool nfs4_read_stateid_changed(struct rpc_task *task, - struct nfs_readargs *args) + struct nfs_pgio_args *args) { if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4121,7 +4121,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data } static bool nfs4_write_stateid_changed(struct rpc_task *task, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { if (!nfs4_error_stateid_expired(task->tk_status) || diff --git 
a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 73ce8d4fe2c8..032159c36a57 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1556,7 +1556,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -1701,7 +1702,8 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4 encode_nfs4_verifier(xdr, &arg->confirm); } -static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -2451,7 +2453,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a READ request */ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2513,7 +2515,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a WRITE request */ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 7f87461be3a9..46d555206023 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -470,7 +470,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_readargs *argp = &data->args; + struct nfs_pgio_args *argp = &data->args; struct nfs_readres *resp = &data->res; /* This is a short read! */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ee6d46fde76c..25ba3830ec8b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1388,7 +1388,7 @@ static int nfs_should_remove_suid(const struct inode *inode) */ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { - struct nfs_writeargs *argp = &data->args; + struct nfs_pgio_args *argp = &data->args; struct nfs_writeres *resp = &data->res; struct inode *inode = data->header->inode; int status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3e8fc1fe585b..5875001928f9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -491,18 +491,6 @@ struct nfs4_delegreturnres { /* * Arguments to the read call. */ -struct nfs_readargs { - struct nfs4_sequence_args seq_args; - struct nfs_fh * fh; - struct nfs_open_context *context; - struct nfs_lock_context *lock_context; - nfs4_stateid stateid; - __u64 offset; - __u32 count; - unsigned int pgbase; - struct page ** pages; -}; - struct nfs_readres { struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; @@ -513,20 +501,6 @@ struct nfs_readres { /* * Arguments to the write call. 
*/ -struct nfs_writeargs { - struct nfs4_sequence_args seq_args; - struct nfs_fh * fh; - struct nfs_open_context *context; - struct nfs_lock_context *lock_context; - nfs4_stateid stateid; - __u64 offset; - __u32 count; - enum nfs3_stable_how stable; - unsigned int pgbase; - struct page ** pages; - const u32 * bitmask; -}; - struct nfs_write_verifier { char data[8]; }; @@ -544,6 +518,23 @@ struct nfs_writeres { const struct nfs_server *server; }; +/* + * Arguments shared by the read and write call. + */ +struct nfs_pgio_args { + struct nfs4_sequence_args seq_args; + struct nfs_fh * fh; + struct nfs_open_context *context; + struct nfs_lock_context *lock_context; + nfs4_stateid stateid; + __u64 offset; + __u32 count; + unsigned int pgbase; + struct page ** pages; + const u32 * bitmask; /* used by write */ + enum nfs3_stable_how stable; /* used by write */ +}; + /* * Arguments to the commit call. */ @@ -1269,7 +1260,7 @@ struct nfs_read_data { struct list_head list; struct rpc_task task; struct nfs_fattr fattr; /* fattr storage */ - struct nfs_readargs args; + struct nfs_pgio_args args; struct nfs_readres res; unsigned long timestamp; /* For lease renewal */ int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); @@ -1321,7 +1312,7 @@ struct nfs_write_data { struct rpc_task task; struct nfs_fattr fattr; struct nfs_writeverf verf; - struct nfs_writeargs args; /* argument struct */ + struct nfs_pgio_args args; /* argument struct */ struct nfs_writeres res; /* result struct */ unsigned long timestamp; /* For lease renewal */ int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); -- cgit v1.2.3-71-gd317 From 9137bdf3d241fc2cbeb2a8ced51d1546150aa6a1 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:25 -0400 Subject: NFS: Create a common results structure for reads and writes Reads and writes have very similar results. This patch combines the two structs together with comments to show where the differing fields are used. 
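The same pattern applies on the result side: logic that touches only the shared members no longer needs read and write variants. A hypothetical check, not part of this patch:

	/* Hypothetical sketch: a short-transfer test using only shared
	 * fields; 'eof' remains read-only and 'verf'/'server' remain
	 * write-only, as the comments in the struct note. */
	static bool nfs_pgio_is_short(const struct nfs_pgio_args *args,
				      const struct nfs_pgio_res *res)
	{
		return res->count < args->count;
	}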
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 6 +++--- fs/nfs/nfs3xdr.c | 8 ++++---- fs/nfs/nfs4xdr.c | 9 +++++---- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- include/linux/nfs_xdr.h | 32 ++++++++++++-------------------- 6 files changed, 26 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 461cd8bd9401..5f61b83f4a1c 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -103,7 +103,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) /* * typedef opaque nfsdata<>; */ -static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) { u32 recvd, count; __be32 *p; @@ -857,7 +857,7 @@ out_default: * }; */ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -878,7 +878,7 @@ out_default: } static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 02f16c212007..8f4cbe7f4aa8 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1589,7 +1589,7 @@ out_default: * }; */ static int decode_read3resok(struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { u32 eof, count, ocount, recvd; __be32 *p; @@ -1625,7 +1625,7 @@ out_overflow: } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -1673,7 +1673,7 @@ out_status: * }; */ static int decode_write3resok(struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { __be32 *p; @@ -1697,7 +1697,7 @@ out_eio: } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 032159c36a57..939ae606cfa4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5087,7 +5087,8 @@ static int decode_putrootfh(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_PUTROOTFH); } -static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs_pgio_res *res) { __be32 *p; uint32_t count, eof, recvd; @@ -5341,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); } -static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) { __be32 *p; int status; @@ -6638,7 +6639,7 @@ out: * Decode Read response */ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_readres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; @@ -6663,7 +6664,7 @@ out: * Decode WRITE response */ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_writeres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 46d555206023..473bba35a2cb 100644 --- 
a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -471,7 +471,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_pgio_args *argp = &data->args; - struct nfs_readres *resp = &data->res; + struct nfs_pgio_res *resp = &data->res; /* This is a short read! */ nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 25ba3830ec8b..d392a70092fe 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1389,7 +1389,7 @@ static int nfs_should_remove_suid(const struct inode *inode) void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_pgio_args *argp = &data->args; - struct nfs_writeres *resp = &data->res; + struct nfs_pgio_res *resp = &data->res; struct inode *inode = data->header->inode; int status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5875001928f9..381f832b03c6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -488,16 +488,6 @@ struct nfs4_delegreturnres { const struct nfs_server *server; }; -/* - * Arguments to the read call. - */ -struct nfs_readres { - struct nfs4_sequence_res seq_res; - struct nfs_fattr * fattr; - __u32 count; - int eof; -}; - /* * Arguments to the write call. */ @@ -510,14 +500,6 @@ struct nfs_writeverf { enum nfs3_stable_how committed; }; -struct nfs_writeres { - struct nfs4_sequence_res seq_res; - struct nfs_fattr * fattr; - struct nfs_writeverf * verf; - __u32 count; - const struct nfs_server *server; -}; - /* * Arguments shared by the read and write call. */ @@ -535,6 +517,16 @@ struct nfs_pgio_args { enum nfs3_stable_how stable; /* used by write */ }; +struct nfs_pgio_res { + struct nfs4_sequence_res seq_res; + struct nfs_fattr * fattr; + __u32 count; + int eof; /* used by read */ + struct nfs_writeverf * verf; /* used by write */ + const struct nfs_server *server; /* used by write */ + +}; + /* * Arguments to the commit call. */ @@ -1261,7 +1253,7 @@ struct nfs_read_data { struct rpc_task task; struct nfs_fattr fattr; /* fattr storage */ struct nfs_pgio_args args; - struct nfs_readres res; + struct nfs_pgio_res res; unsigned long timestamp; /* For lease renewal */ int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); __u64 mds_offset; @@ -1313,7 +1305,7 @@ struct nfs_write_data { struct nfs_fattr fattr; struct nfs_writeverf verf; struct nfs_pgio_args args; /* argument struct */ - struct nfs_writeres res; /* result struct */ + struct nfs_pgio_res res; /* result struct */ unsigned long timestamp; /* For lease renewal */ int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); __u64 mds_offset; /* Filelayout dense stripe */ -- cgit v1.2.3-71-gd317 From 9c7e1b3d50b56b8d8f6237ed232350b7c6476cd5 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:26 -0400 Subject: NFS: Create a common read and write data struct At this point, the only difference between nfs_read_data and nfs_write_data is the write verifier. 
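In rough outline, the unified type looks as follows (a sketch assuming only the members visible in the hunks below; the write verifier stays in the struct and is simply unused on the read path):

	/* Sketch only: one I/O data type for both directions, with a
	 * single pgio_done_cb replacing read_done_cb/write_done_cb. */
	struct nfs_pgio_data {
		struct nfs_pgio_header	*header;
		struct rpc_task		task;
		struct nfs_pgio_args	args;
		struct nfs_pgio_res	res;
		struct nfs_writeverf	verf;	/* used by write */
		int (*pgio_done_cb)(struct rpc_task *,
				    struct nfs_pgio_data *);
	};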
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 22 ++++++++++---------- fs/nfs/internal.h | 10 ++++----- fs/nfs/nfs3proc.c | 12 +++++------ fs/nfs/nfs4_fs.h | 4 ++-- fs/nfs/nfs4filelayout.c | 34 +++++++++++++++---------------- fs/nfs/nfs4proc.c | 30 +++++++++++++-------------- fs/nfs/nfs4trace.h | 8 ++++---- fs/nfs/objlayout/objio_osd.c | 6 +++--- fs/nfs/objlayout/objlayout.c | 16 +++++++-------- fs/nfs/objlayout/objlayout.h | 8 ++++---- fs/nfs/pnfs.c | 26 ++++++++++++------------ fs/nfs/pnfs.h | 10 ++++----- fs/nfs/proc.c | 12 +++++------ fs/nfs/read.c | 32 ++++++++++++++--------------- fs/nfs/write.c | 36 ++++++++++++++++---------------- include/linux/nfs_fs.h | 4 ++-- include/linux/nfs_xdr.h | 44 ++++++++++++++-------------------------- 17 files changed, 150 insertions(+), 164 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 65d849bdf77a..206cc68c9694 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -210,7 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err) SetPageUptodate(bvec->bv_page); if (err) { - struct nfs_read_data *rdata = par->data; + struct nfs_pgio_data *rdata = par->data; struct nfs_pgio_header *header = rdata->header; if (!header->pnfs_error) @@ -224,17 +224,17 @@ static void bl_end_io_read(struct bio *bio, int err) static void bl_read_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } static void bl_end_par_io_read(void *data, int unused) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rdata->task.tk_status = rdata->header->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); @@ -242,7 +242,7 @@ bl_end_par_io_read(void *data, int unused) } static enum pnfs_try_status -bl_read_pagelist(struct nfs_read_data *rdata) +bl_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *header = rdata->header; int i, hole; @@ -390,7 +390,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) } if (unlikely(err)) { - struct nfs_write_data *data = par->data; + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!header->pnfs_error) @@ -405,7 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nfs_write_data *data = par->data; + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!uptodate) { @@ -423,10 +423,10 @@ static void bl_end_io_write(struct bio *bio, int err) static void bl_write_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); if (likely(!wdata->header->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), @@ -438,7 +438,7 @@ static void bl_write_cleanup(struct work_struct *work) /* Called when last of bios associated with a 
bl_write_pagelist call finishes */ static void bl_end_par_io_write(void *data, int num_se) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(wdata->header->pnfs_error)) { bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, @@ -673,7 +673,7 @@ check_page: } static enum pnfs_try_status -bl_write_pagelist(struct nfs_write_data *wdata, int sync) +bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) { struct nfs_pgio_header *header = wdata->header; int i, ret, npg_zero, pg_index, last = 0; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 98fe618db2aa..af01b80412dd 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -401,13 +401,13 @@ extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, + struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops, int flags); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_readdata_release(struct nfs_read_data *rdata); +extern void nfs_readdata_release(struct nfs_pgio_data *rdata); /* super.c */ void nfs_clone_super(struct super_block *, struct nfs_mount_info *); @@ -429,10 +429,10 @@ extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_writedata_release(struct nfs_write_data *wdata); +extern void nfs_writedata_release(struct nfs_pgio_data *wdata); extern void nfs_commit_free(struct nfs_commit_data *p); extern int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, + struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops, int how, int flags); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); @@ -492,7 +492,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void __nfs4_read_done_cb(struct nfs_pgio_data *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d873241a9b3a..d235369c3dfb 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -795,7 +795,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -807,18 +807,18 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); 
return 0; } -static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -829,12 +829,12 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } -static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index e1d1badbe53c..f63cb87cd730 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -337,7 +337,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, */ static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) @@ -369,7 +369,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index b9a35c05b60f..e6936147ad95 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -84,7 +84,7 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -static void filelayout_reset_write(struct nfs_write_data *data) +static void filelayout_reset_write(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; @@ -105,7 +105,7 @@ static void filelayout_reset_write(struct nfs_write_data *data) } } -static void filelayout_reset_read(struct nfs_read_data *data) +static void filelayout_reset_read(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; @@ -243,7 +243,7 @@ wait_on_recovery: /* NFS_PROTO call done callback routines */ static int filelayout_read_done_cb(struct rpc_task *task, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -270,7 +270,7 @@ static int filelayout_read_done_cb(struct rpc_task *task, * rfc5661 is not clear about which credential should be used. 
*/ static void -filelayout_set_layoutcommit(struct nfs_write_data *wdata) +filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; @@ -305,7 +305,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) */ static void filelayout_read_prepare(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -317,7 +317,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - rdata->read_done_cb = filelayout_read_done_cb; + rdata->pgio_done_cb = filelayout_read_done_cb; if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, @@ -331,7 +331,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) static void filelayout_read_call_done(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); @@ -347,14 +347,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) static void filelayout_read_count_stats(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); } static void filelayout_read_release(void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -363,7 +363,7 @@ static void filelayout_read_release(void *data) } static int filelayout_write_done_cb(struct rpc_task *task, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -419,7 +419,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, static void filelayout_write_prepare(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -443,7 +443,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) static void filelayout_write_call_done(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && task->tk_status == 0) { @@ -457,14 +457,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) static void filelayout_write_count_stats(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); } static void filelayout_write_release(void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -529,7 +529,7 @@ static const struct rpc_call_ops filelayout_commit_call_ops = { }; static enum pnfs_try_status -filelayout_read_pagelist(struct nfs_read_data *data) +filelayout_read_pagelist(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -575,7 +575,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) /* Perform async writes. 
*/ static enum pnfs_try_status -filelayout_write_pagelist(struct nfs_write_data *data, int sync) +filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -600,7 +600,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); - data->write_done_cb = filelayout_write_done_cb; + data->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = nfs4_fl_select_ds_fh(lseg, j); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4794ca693367..e793aa91454a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4033,12 +4033,12 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_read_data *data) +void __nfs4_read_done_cb(struct nfs_pgio_data *data) { nfs_invalidate_atime(data->header->inode); } -static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4068,7 +4068,7 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { dprintk("--> %s\n", __func__); @@ -4077,19 +4077,19 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return -EAGAIN; if (nfs4_read_stateid_changed(task, &data->args)) return -EAGAIN; - return data->read_done_cb ? data->read_done_cb(task, data) : + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : nfs4_read_done_cb(task, data); } -static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { data->timestamp = jiffies; - data->read_done_cb = nfs4_read_done_cb; + data->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); } -static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, @@ -4104,7 +4104,7 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat return 0; } -static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -4134,18 +4134,18 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; if (nfs4_write_stateid_changed(task, &data->args)) return -EAGAIN; - return data->write_done_cb ? data->write_done_cb(task, data) : + return data->pgio_done_cb ? 
data->pgio_done_cb(task, data) : nfs4_write_done_cb(task, data); } static -bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) +bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) { const struct nfs_pgio_header *hdr = data->header; @@ -4158,7 +4158,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4168,8 +4168,8 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag } else data->args.bitmask = server->cache_consistency_bitmask; - if (!data->write_done_cb) - data->write_done_cb = nfs4_write_done_cb; + if (!data->pgio_done_cb) + data->pgio_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; @@ -4177,7 +4177,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } -static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 849cf146db30..0a744f3a86f6 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -932,7 +932,7 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); DECLARE_EVENT_CLASS(nfs4_read_event, TP_PROTO( - const struct nfs_read_data *data, + const struct nfs_pgio_data *data, int error ), @@ -972,7 +972,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, #define DEFINE_NFS4_READ_EVENT(name) \ DEFINE_EVENT(nfs4_read_event, name, \ TP_PROTO( \ - const struct nfs_read_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) @@ -983,7 +983,7 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); DECLARE_EVENT_CLASS(nfs4_write_event, TP_PROTO( - const struct nfs_write_data *data, + const struct nfs_pgio_data *data, int error ), @@ -1024,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, #define DEFINE_NFS4_WRITE_EVENT(name) \ DEFINE_EVENT(nfs4_write_event, name, \ TP_PROTO( \ - const struct nfs_write_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5457745dd4f1..426b366b0b33 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -439,7 +439,7 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -int objio_read_pagelist(struct nfs_read_data *rdata) +int objio_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct objio_state *objios; @@ -487,7 +487,7 @@ static void _write_done(struct ore_io_state *ios, void *private) static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { struct objio_state *objios = priv; - struct nfs_write_data *wdata = objios->oir.rpcdata; + struct nfs_pgio_data *wdata = objios->oir.rpcdata; struct address_space *mapping = wdata->header->inode->i_mapping; pgoff_t index = offset / PAGE_SIZE; struct page *page; @@ -531,7 +531,7 @@ static const struct _ore_r4w_op _r4w_op = { .put_page 
= &__r4w_put_page, }; -int objio_write_pagelist(struct nfs_write_data *wdata, int how) +int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; struct objio_state *objios; diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index e4f9cbfec67b..2f955f671003 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -229,11 +229,11 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, static void _rpc_read_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } @@ -241,7 +241,7 @@ static void _rpc_read_complete(struct work_struct *work) void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_read_data *rdata = oir->rpcdata; + struct nfs_pgio_data *rdata = oir->rpcdata; oir->status = rdata->task.tk_status = status; if (status >= 0) @@ -266,7 +266,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async reads. */ enum pnfs_try_status -objlayout_read_pagelist(struct nfs_read_data *rdata) +objlayout_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct inode *inode = hdr->inode; @@ -312,11 +312,11 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) static void _rpc_write_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_write_done(wdata); } @@ -324,7 +324,7 @@ static void _rpc_write_complete(struct work_struct *work) void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata = oir->rpcdata; + struct nfs_pgio_data *wdata = oir->rpcdata; oir->status = wdata->task.tk_status = status; if (status >= 0) { @@ -351,7 +351,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async writes. 
*/ enum pnfs_try_status -objlayout_write_pagelist(struct nfs_write_data *wdata, +objlayout_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 87aa1dec6120..01e041029a6c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg); */ extern void objio_free_result(struct objlayout_io_res *oir); -extern int objio_read_pagelist(struct nfs_read_data *rdata); -extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); +extern int objio_read_pagelist(struct nfs_pgio_data *rdata); +extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); /* * callback API @@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg( extern void objlayout_free_lseg(struct pnfs_layout_segment *); extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_read_data *); + struct nfs_pgio_data *); extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_write_data *, + struct nfs_pgio_data *, int how); extern void objlayout_encode_layoutcommit( diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3d5bc2baafd1..e9cea3ab7cf9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1492,7 +1492,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); -static void pnfs_ld_handle_write_error(struct nfs_write_data *data) +static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1511,7 +1511,7 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1527,7 +1527,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_write_done); static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1540,7 +1540,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, } static enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *wdata, +pnfs_try_to_write_data(struct nfs_pgio_data *wdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg, int how) @@ -1564,7 +1564,7 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, static void pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) { - struct nfs_write_data *data; + struct nfs_pgio_data *data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; @@ -1572,7 +1572,7 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he while (!list_empty(head)) { enum pnfs_try_status trypnfs; - data = list_first_entry(head, struct nfs_write_data, list); + data = list_first_entry(head, struct nfs_pgio_data, list); list_del_init(&data->list); trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); @@ -1647,7 +1647,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); -static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1666,7 +1666,7 @@ static void 
pnfs_ld_handle_read_error(struct nfs_read_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1682,7 +1682,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_read_done); static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1698,7 +1698,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, * Call the appropriate parallel I/O subsystem read function. */ static enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *rdata, +pnfs_try_to_read_data(struct nfs_pgio_data *rdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg) { @@ -1722,7 +1722,7 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, static void pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) { - struct nfs_read_data *data; + struct nfs_pgio_data *data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; @@ -1730,7 +1730,7 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea while (!list_empty(head)) { enum pnfs_try_status trypnfs; - data = list_first_entry(head, struct nfs_read_data, list); + data = list_first_entry(head, struct nfs_pgio_data, list); list_del_init(&data->list); trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); @@ -1821,7 +1821,7 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_write_data *wdata) +pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; struct inode *inode = hdr->inode; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94a9a1834b3f..0031267d7492 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -113,8 +113,8 @@ struct pnfs_layoutdriver_type { * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS */ - enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); - enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); + enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); void (*free_deviceid_node) (struct nfs4_deviceid_node *); @@ -212,13 +212,13 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); -void pnfs_ld_write_done(struct nfs_write_data *); -void pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_pgio_data *); +void pnfs_ld_read_done(struct nfs_pgio_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 8cc227fcd4d2..c54829eb2156 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -578,7 +578,7 @@ 
nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return 0; } -static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -594,18 +594,18 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; } -static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -614,14 +614,14 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ data->args.stable = NFS_FILE_SYNC; msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } -static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 473bba35a2cb..9e426cc179ed 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -51,10 +51,10 @@ struct nfs_read_header *nfs_readhdr_alloc(void) } EXPORT_SYMBOL_GPL(nfs_readhdr_alloc); -static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, +static struct nfs_pgio_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, unsigned int pagecount) { - struct nfs_read_data *data, *prealloc; + struct nfs_pgio_data *data, *prealloc; prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data; if (prealloc->header == NULL) @@ -84,7 +84,7 @@ void nfs_readhdr_free(struct nfs_pgio_header *hdr) } EXPORT_SYMBOL_GPL(nfs_readhdr_free); -void nfs_readdata_release(struct nfs_read_data *rdata) +void nfs_readdata_release(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header); @@ -212,7 +212,7 @@ out: } int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, + struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops, int flags) { struct inode *inode = data->header->inode; @@ -255,7 +255,7 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read); /* * Set up the NFS read request struct */ -static void nfs_read_rpcsetup(struct nfs_read_data *data, +static void nfs_read_rpcsetup(struct nfs_pgio_data *data, unsigned int count, unsigned int offset) { struct nfs_page *req = data->header->req; @@ -274,7 +274,7 @@ static void nfs_read_rpcsetup(struct nfs_read_data *data, nfs_fattr_init(&data->fattr); } -static int nfs_do_read(struct nfs_read_data *data, +static int nfs_do_read(struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops) { struct inode *inode = data->header->inode; @@ -286,13 +286,13 @@ static int nfs_do_multiple_reads(struct list_head *head, const struct 
rpc_call_ops *call_ops) { - struct nfs_read_data *data; + struct nfs_pgio_data *data; int ret = 0; while (!list_empty(head)) { int ret2; - data = list_first_entry(head, struct nfs_read_data, list); + data = list_first_entry(head, struct nfs_pgio_data, list); list_del_init(&data->list); ret2 = nfs_do_read(data, call_ops); @@ -324,8 +324,8 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc, { set_bit(NFS_IOHDR_REDO, &hdr->flags); while (!list_empty(&hdr->rpc_list)) { - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_read_data, list); + struct nfs_pgio_data *data = list_first_entry(&hdr->rpc_list, + struct nfs_pgio_data, list); list_del(&data->list); nfs_readdata_release(data); } @@ -350,7 +350,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, { struct nfs_page *req = hdr->req; struct page *page = req->wb_page; - struct nfs_read_data *data; + struct nfs_pgio_data *data; size_t rsize = desc->pg_bsize, nbytes; unsigned int offset; @@ -382,7 +382,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, { struct nfs_page *req; struct page **pages; - struct nfs_read_data *data; + struct nfs_pgio_data *data; struct list_head *head = &desc->pg_list; data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base, @@ -447,7 +447,7 @@ static const struct nfs_pageio_ops nfs_pageio_read_ops = { * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). */ -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +int nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; int status; @@ -468,7 +468,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) { struct nfs_pgio_args *argp = &data->args; struct nfs_pgio_res *resp = &data->res; @@ -490,7 +490,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = calldata; + struct nfs_pgio_data *data = calldata; struct nfs_pgio_header *hdr = data->header; /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */ @@ -520,7 +520,7 @@ static void nfs_readpage_release_common(void *calldata) void nfs_read_prepare(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = calldata; + struct nfs_pgio_data *data = calldata; int err; err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); if (err) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d392a70092fe..3a2fc5c4c79a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -88,10 +88,10 @@ struct nfs_write_header *nfs_writehdr_alloc(void) } EXPORT_SYMBOL_GPL(nfs_writehdr_alloc); -static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, +static struct nfs_pgio_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, unsigned int pagecount) { - struct nfs_write_data *data, *prealloc; + struct nfs_pgio_data *data, *prealloc; prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data; if (prealloc->header == NULL) @@ -120,7 +120,7 @@ void nfs_writehdr_free(struct nfs_pgio_header *hdr) } EXPORT_SYMBOL_GPL(nfs_writehdr_free); -void nfs_writedata_release(struct nfs_write_data *wdata) +void 
nfs_writedata_release(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header); @@ -582,7 +582,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { if (data->verf.committed == NFS_DATA_SYNC) return data->header->lseg == NULL; @@ -613,7 +613,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { return 0; } @@ -990,7 +990,7 @@ static int flush_task_priority(int how) } int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, + struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops, int how, int flags) { @@ -1047,7 +1047,7 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write); /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_write_rpcsetup(struct nfs_write_data *data, +static void nfs_write_rpcsetup(struct nfs_pgio_data *data, unsigned int count, unsigned int offset, int how, struct nfs_commit_info *cinfo) { @@ -1082,7 +1082,7 @@ static void nfs_write_rpcsetup(struct nfs_write_data *data, nfs_fattr_init(&data->fattr); } -static int nfs_do_write(struct nfs_write_data *data, +static int nfs_do_write(struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops, int how) { @@ -1095,13 +1095,13 @@ static int nfs_do_multiple_writes(struct list_head *head, const struct rpc_call_ops *call_ops, int how) { - struct nfs_write_data *data; + struct nfs_pgio_data *data; int ret = 0; while (!list_empty(head)) { int ret2; - data = list_first_entry(head, struct nfs_write_data, list); + data = list_first_entry(head, struct nfs_pgio_data, list); list_del_init(&data->list); ret2 = nfs_do_write(data, call_ops, how); @@ -1144,8 +1144,8 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc, { set_bit(NFS_IOHDR_REDO, &hdr->flags); while (!list_empty(&hdr->rpc_list)) { - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_write_data, list); + struct nfs_pgio_data *data = list_first_entry(&hdr->rpc_list, + struct nfs_pgio_data, list); list_del(&data->list); nfs_writedata_release(data); } @@ -1161,7 +1161,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, { struct nfs_page *req = hdr->req; struct page *page = req->wb_page; - struct nfs_write_data *data; + struct nfs_pgio_data *data; size_t wsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; @@ -1211,7 +1211,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, { struct nfs_page *req; struct page **pages; - struct nfs_write_data *data; + struct nfs_pgio_data *data; struct list_head *head = &desc->pg_list; struct nfs_commit_info cinfo; @@ -1305,7 +1305,7 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); void nfs_write_prepare(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = calldata; + struct nfs_pgio_data *data = calldata; int err; err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); if (err) @@ -1328,14 +1328,14 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) */ static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = calldata; + struct nfs_pgio_data *data = calldata; nfs_writeback_done(task, data); } static void nfs_writeback_release_common(void *calldata) { - 
struct nfs_write_data *data = calldata; + struct nfs_pgio_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; @@ -1386,7 +1386,7 @@ static int nfs_should_remove_suid(const struct inode *inode) /* * This function is called when the WRITE call is complete. */ -void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +void nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct nfs_pgio_args *argp = &data->args; struct nfs_pgio_res *resp = &data->res; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fa6918b0f829..7e0db561d829 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -520,7 +520,7 @@ extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); -extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); +extern void nfs_writeback_done(struct rpc_task *, struct nfs_pgio_data *); /* * Try to write back everything synchronously (but check the @@ -553,7 +553,7 @@ nfs_have_writebacks(struct inode *inode) extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); +extern int nfs_readpage_result(struct rpc_task *, struct nfs_pgio_data *); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 381f832b03c6..64f6f1491c0d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1247,20 +1247,6 @@ struct nfs_page_array { struct page *page_array[NFS_PAGEVEC_SIZE]; }; -struct nfs_read_data { - struct nfs_pgio_header *header; - struct list_head list; - struct rpc_task task; - struct nfs_fattr fattr; /* fattr storage */ - struct nfs_pgio_args args; - struct nfs_pgio_res res; - unsigned long timestamp; /* For lease renewal */ - int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); - __u64 mds_offset; - struct nfs_page_array pages; - struct nfs_client *ds_clp; /* pNFS data server */ -}; - /* used as flag bits in nfs_pgio_header */ enum { NFS_IOHDR_ERROR = 0, @@ -1293,29 +1279,29 @@ struct nfs_pgio_header { unsigned long flags; }; -struct nfs_read_header { - struct nfs_pgio_header header; - struct nfs_read_data rpc_data; -}; - -struct nfs_write_data { +struct nfs_pgio_data { struct nfs_pgio_header *header; struct list_head list; struct rpc_task task; struct nfs_fattr fattr; - struct nfs_writeverf verf; + struct nfs_writeverf verf; /* Used for writes */ struct nfs_pgio_args args; /* argument struct */ struct nfs_pgio_res res; /* result struct */ unsigned long timestamp; /* For lease renewal */ - int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); + int (*pgio_done_cb) (struct rpc_task *task, struct nfs_pgio_data *data); __u64 mds_offset; /* Filelayout dense stripe */ struct nfs_page_array pages; struct nfs_client *ds_clp; /* pNFS data server */ }; +struct nfs_read_header { + struct nfs_pgio_header header; + struct nfs_pgio_data rpc_data; +}; + struct nfs_write_header { struct nfs_pgio_header header; - struct nfs_write_data rpc_data; + struct nfs_pgio_data rpc_data; struct nfs_writeverf verf; }; 
@@ -1448,12 +1434,12 @@ struct nfs_rpc_ops { struct nfs_pathconf *); int (*set_capabilities)(struct nfs_server *, struct nfs_fh *); int (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int); - void (*read_setup) (struct nfs_read_data *, struct rpc_message *); - int (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *); - int (*read_done) (struct rpc_task *, struct nfs_read_data *); - void (*write_setup) (struct nfs_write_data *, struct rpc_message *); - int (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *); - int (*write_done) (struct rpc_task *, struct nfs_write_data *); + void (*read_setup) (struct nfs_pgio_data *, struct rpc_message *); + int (*read_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *); + int (*read_done) (struct rpc_task *, struct nfs_pgio_data *); + void (*write_setup) (struct nfs_pgio_data *, struct rpc_message *); + int (*write_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *); + int (*write_done) (struct rpc_task *, struct nfs_pgio_data *); void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *); void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *); int (*commit_done) (struct rpc_task *, struct nfs_commit_data *); -- cgit v1.2.3-71-gd317 From c0752cdfbbb691cfe98812f7aed8ce1e766823c4 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:27 -0400 Subject: NFS: Create a common read and write header struct The only difference is the write verifier field, but we can keep that for a little bit longer. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 4 ++-- fs/nfs/pnfs.c | 4 ++-- fs/nfs/read.c | 14 +++++++------- fs/nfs/write.c | 14 +++++++------- include/linux/nfs_xdr.h | 7 +------ 5 files changed, 19 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index af01b80412dd..b0e7a41d14a8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -395,7 +395,7 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool struct nfs_pgio_completion_ops; /* read.c */ -extern struct nfs_read_header *nfs_readhdr_alloc(void); +extern struct nfs_rw_header *nfs_readhdr_alloc(void); extern void nfs_readhdr_free(struct nfs_pgio_header *hdr); extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, @@ -424,7 +424,7 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data); extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern struct nfs_write_header *nfs_writehdr_alloc(void); +extern struct nfs_rw_header *nfs_writehdr_alloc(void); extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e9cea3ab7cf9..43cfe11aa1a4 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1592,7 +1592,7 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_write_header *whdr; + struct nfs_rw_header *whdr; struct nfs_pgio_header *hdr; int ret; @@ -1750,7 +1750,7 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_read_header *rhdr; + struct nfs_rw_header *rhdr; struct nfs_pgio_header *hdr; int ret; diff --git a/fs/nfs/read.c b/fs/nfs/read.c 
index 9e426cc179ed..d29ca3673694 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -34,9 +34,9 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; static struct kmem_cache *nfs_rdata_cachep; -struct nfs_read_header *nfs_readhdr_alloc(void) +struct nfs_rw_header *nfs_readhdr_alloc(void) { - struct nfs_read_header *rhdr; + struct nfs_rw_header *rhdr; rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); if (rhdr) { @@ -56,7 +56,7 @@ static struct nfs_pgio_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, { struct nfs_pgio_data *data, *prealloc; - prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data; + prealloc = &container_of(hdr, struct nfs_rw_header, header)->rpc_data; if (prealloc->header == NULL) data = prealloc; else @@ -78,7 +78,7 @@ out: void nfs_readhdr_free(struct nfs_pgio_header *hdr) { - struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header); + struct nfs_rw_header *rhdr = container_of(hdr, struct nfs_rw_header, header); kmem_cache_free(nfs_rdata_cachep, rhdr); } @@ -87,7 +87,7 @@ EXPORT_SYMBOL_GPL(nfs_readhdr_free); void nfs_readdata_release(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; - struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header); + struct nfs_rw_header *read_header = container_of(hdr, struct nfs_rw_header, header); put_nfs_open_context(rdata->args.context); if (rdata->pages.pagevec != rdata->pages.page_array) @@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pagein); static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_read_header *rhdr; + struct nfs_rw_header *rhdr; struct nfs_pgio_header *hdr; int ret; @@ -680,7 +680,7 @@ out: int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", - sizeof(struct nfs_read_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_rdata_cachep == NULL) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3a2fc5c4c79a..37c4c988519c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -70,9 +70,9 @@ void nfs_commit_free(struct nfs_commit_data *p) } EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_write_header *nfs_writehdr_alloc(void) +struct nfs_rw_header *nfs_writehdr_alloc(void) { - struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); if (p) { struct nfs_pgio_header *hdr = &p->header; @@ -93,7 +93,7 @@ static struct nfs_pgio_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, { struct nfs_pgio_data *data, *prealloc; - prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data; + prealloc = &container_of(hdr, struct nfs_rw_header, header)->rpc_data; if (prealloc->header == NULL) data = prealloc; else @@ -115,7 +115,7 @@ out: void nfs_writehdr_free(struct nfs_pgio_header *hdr) { - struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header); + struct nfs_rw_header *whdr = container_of(hdr, struct nfs_rw_header, header); mempool_free(whdr, nfs_wdata_mempool); } EXPORT_SYMBOL_GPL(nfs_writehdr_free); @@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(nfs_writehdr_free); void nfs_writedata_release(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; - struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header); + struct nfs_rw_header *write_header = container_of(hdr, struct nfs_rw_header, header); put_nfs_open_context(wdata->args.context); 
if (wdata->pages.pagevec != wdata->pages.page_array) @@ -1253,7 +1253,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_flush); static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_write_header *whdr; + struct nfs_rw_header *whdr; struct nfs_pgio_header *hdr; int ret; @@ -1910,7 +1910,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", - sizeof(struct nfs_write_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_wdata_cachep == NULL) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 64f6f1491c0d..2d34e0dc1870 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1294,12 +1294,7 @@ struct nfs_pgio_data { struct nfs_client *ds_clp; /* pNFS data server */ }; -struct nfs_read_header { - struct nfs_pgio_header header; - struct nfs_pgio_data rpc_data; -}; - -struct nfs_write_header { +struct nfs_rw_header { struct nfs_pgio_header header; struct nfs_pgio_data rpc_data; struct nfs_writeverf verf; -- cgit v1.2.3-71-gd317 From f79d06f544a797d75cbf5256a5d06c4b3d2759cc Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:28 -0400 Subject: NFS: Move the write verifier into the nfs_pgio_header The header had a pointer to the verifier that was set from the old write data struct. We don't need to keep the pointer around now that we have shared structures. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 ++-- fs/nfs/write.c | 7 +++---- include/linux/nfs_xdr.h | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bbe688e2cc89..164b0167677b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -813,12 +813,12 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, hdr->verf, + memcpy(&dreq->verf, &hdr->verf, sizeof(dreq->verf)); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { + if (memcmp(&dreq->verf, &hdr->verf, sizeof(dreq->verf))) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 37c4c988519c..321a791c72bf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -82,7 +82,6 @@ struct nfs_rw_header *nfs_writehdr_alloc(void) INIT_LIST_HEAD(&hdr->rpc_list); spin_lock_init(&hdr->lock); atomic_set(&hdr->refcnt, 0); - hdr->verf = &p->verf; } return p; } @@ -644,7 +643,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { - memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf)); + memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -1344,8 +1343,8 @@ static void nfs_writeback_release_common(void *calldata) if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) ; /* Do nothing */ else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); - else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) + memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); + else if (memcmp(&hdr->verf, &data->verf, 
sizeof(hdr->verf))) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2d34e0dc1870..965c2aa6b33f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1263,7 +1263,7 @@ struct nfs_pgio_header { struct list_head rpc_list; atomic_t refcnt; struct nfs_page *req; - struct nfs_writeverf *verf; + struct nfs_writeverf verf; /* Used for writes */ struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; @@ -1297,7 +1297,6 @@ struct nfs_pgio_data { struct nfs_rw_header { struct nfs_pgio_header header; struct nfs_pgio_data rpc_data; - struct nfs_writeverf verf; }; struct nfs_mds_commit_info { -- cgit v1.2.3-71-gd317 From 4a0de55c565a36cac8422b76a948c4634a90781e Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:30 -0400 Subject: NFS: Create a common rw_header_alloc and rw_header_free function I create a new struct nfs_rw_ops to handle the differences between reads and writes. This struct will be set when initializing a new nfs_pgio_descriptor, and then passed on to the nfs_rw_header when a new header is allocated. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 ++---- fs/nfs/pagelist.c | 33 +++++++++++++++++++++++++++++++++ fs/nfs/pnfs.c | 8 ++++---- fs/nfs/read.c | 34 +++++++++++++--------------------- fs/nfs/write.c | 28 ++++++++++++---------------- include/linux/nfs_page.h | 7 +++++++ include/linux/nfs_xdr.h | 1 + 7 files changed, 72 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ddc142c5062..9d6a40eae11c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -237,6 +237,8 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_io_counter *c); +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); +void nfs_rw_header_free(struct nfs_pgio_header *); struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *, unsigned int); void nfs_pgio_data_release(struct nfs_pgio_data *); @@ -397,8 +399,6 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool struct nfs_pgio_completion_ops; /* read.c */ -extern struct nfs_rw_header *nfs_readhdr_alloc(void); -extern void nfs_readhdr_free(struct nfs_pgio_header *hdr); extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); @@ -425,8 +425,6 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data); extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern struct nfs_rw_header *nfs_writehdr_alloc(void); -extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a98ccf722d7b..ca356fe0836b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -300,6 +300,37 @@ static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) return container_of(hdr, struct nfs_rw_header, header); } +/** + * nfs_rw_header_alloc - Allocate a header for a read or write + *
@ops: Read or write function vector + */ +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +{ + struct nfs_rw_header *header = ops->rw_alloc_header(); + + if (header) { + struct nfs_pgio_header *hdr = &header->header; + + INIT_LIST_HEAD(&hdr->pages); + INIT_LIST_HEAD(&hdr->rpc_list); + spin_lock_init(&hdr->lock); + atomic_set(&hdr->refcnt, 0); + hdr->rw_ops = ops; + } + return header; +} +EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); + +/* + * nfs_rw_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_rw_header_free(struct nfs_pgio_header *hdr) +{ + hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); +} +EXPORT_SYMBOL_GPL(nfs_rw_header_free); + /** * nfs_pgio_data_alloc - Allocate pageio data * @hdr: The header making a request @@ -367,6 +398,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, size_t bsize, int io_flags) { @@ -380,6 +412,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_completion_ops = compl_ops; + desc->pg_rw_ops = rw_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e192ba69a7d4..54c84c128b2b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1585,7 +1585,7 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_writehdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_writehdr_free); @@ -1596,7 +1596,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) struct nfs_pgio_header *hdr; int ret; - whdr = nfs_writehdr_alloc(); + whdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!whdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); pnfs_put_lseg(desc->pg_lseg); @@ -1743,7 +1743,7 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_readhdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_readhdr_free); @@ -1754,7 +1754,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) struct nfs_pgio_header *hdr; int ret; - rhdr = nfs_readhdr_alloc(); + rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!rhdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); ret = -ENOMEM; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index ab4c1a5b5fbd..4cf3577bd54e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -31,33 +31,19 @@ static const struct nfs_pageio_ops nfs_pageio_read_ops; static const struct rpc_call_ops nfs_read_common_ops; static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; -struct nfs_rw_header *nfs_readhdr_alloc(void) +static struct nfs_rw_header *nfs_readhdr_alloc(void) { - struct nfs_rw_header *rhdr; - - rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); - if (rhdr) { - struct nfs_pgio_header *hdr = &rhdr->header; - - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - } - return rhdr; + return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(nfs_readhdr_alloc); -void nfs_readhdr_free(struct nfs_pgio_header *hdr) +static void 
nfs_readhdr_free(struct nfs_rw_header *rhdr) { - struct nfs_rw_header *rhdr = container_of(hdr, struct nfs_rw_header, header); - kmem_cache_free(nfs_rdata_cachep, rhdr); } -EXPORT_SYMBOL_GPL(nfs_readhdr_free); static int nfs_return_empty_page(struct page *page) @@ -79,7 +65,8 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, if (server->pnfs_curr_ld && !force_mds) pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif - nfs_pageio_init(pgio, inode, pg_ops, compl_ops, server->rsize, 0); + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -375,13 +362,13 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) struct nfs_pgio_header *hdr; int ret; - rhdr = nfs_readhdr_alloc(); + rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!rhdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); return -ENOMEM; } hdr = &rhdr->header; - nfs_pgheader_init(desc, hdr, nfs_readhdr_free); + nfs_pgheader_init(desc, hdr, nfs_rw_header_free); atomic_inc(&hdr->refcnt); ret = nfs_generic_pagein(desc, hdr); if (ret == 0) @@ -647,3 +634,8 @@ void nfs_destroy_readpagecache(void) { kmem_cache_destroy(nfs_rdata_cachep); } + +static const struct nfs_rw_ops nfs_rw_read_ops = { + .rw_alloc_header = nfs_readhdr_alloc, + .rw_free_header = nfs_readhdr_free, +}; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0dc4d6a28bd0..9c5cde38da45 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -46,6 +46,7 @@ static const struct rpc_call_ops nfs_write_common_ops; static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -70,29 +71,19 @@ void nfs_commit_free(struct nfs_commit_data *p) } EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_rw_header *nfs_writehdr_alloc(void) +static struct nfs_rw_header *nfs_writehdr_alloc(void) { struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) { - struct nfs_pgio_header *hdr = &p->header; - + if (p) memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - } return p; } -EXPORT_SYMBOL_GPL(nfs_writehdr_alloc); -void nfs_writehdr_free(struct nfs_pgio_header *hdr) +static void nfs_writehdr_free(struct nfs_rw_header *whdr) { - struct nfs_rw_header *whdr = container_of(hdr, struct nfs_rw_header, header); mempool_free(whdr, nfs_wdata_mempool); } -EXPORT_SYMBOL_GPL(nfs_writehdr_free); static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) { @@ -1210,13 +1201,13 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) struct nfs_pgio_header *hdr; int ret; - whdr = nfs_writehdr_alloc(); + whdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!whdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); return -ENOMEM; } hdr = &whdr->header; - nfs_pgheader_init(desc, hdr, nfs_writehdr_free); + nfs_pgheader_init(desc, hdr, nfs_rw_header_free); atomic_inc(&hdr->refcnt); ret = nfs_generic_flush(desc, hdr); if (ret == 0) @@ -1244,7 +1235,8 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, if (server->pnfs_curr_ld && !force_mds) pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif - nfs_pageio_init(pgio, inode, pg_ops, compl_ops, server->wsize, 
ioflags); + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1925,3 +1917,7 @@ void nfs_destroy_writepagecache(void) kmem_cache_destroy(nfs_wdata_cachep); } +static const struct nfs_rw_ops nfs_rw_write_ops = { + .rw_alloc_header = nfs_writehdr_alloc, + .rw_free_header = nfs_writehdr_free, +}; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 92ce5783b707..594812546c25 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -52,6 +52,11 @@ struct nfs_pageio_ops { int (*pg_doio)(struct nfs_pageio_descriptor *); }; +struct nfs_rw_ops { + struct nfs_rw_header *(*rw_alloc_header)(void); + void (*rw_free_header)(struct nfs_rw_header *); +}; + struct nfs_pageio_descriptor { struct list_head pg_list; unsigned long pg_bytes_written; @@ -63,6 +68,7 @@ struct nfs_pageio_descriptor { struct inode *pg_inode; const struct nfs_pageio_ops *pg_ops; + const struct nfs_rw_ops *pg_rw_ops; int pg_ioflags; int pg_error; const struct rpc_call_ops *pg_rpc_callops; @@ -86,6 +92,7 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, size_t bsize, int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 965c2aa6b33f..a1b91b67145e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1269,6 +1269,7 @@ struct nfs_pgio_header { const struct rpc_call_ops *mds_ops; void (*release) (struct nfs_pgio_header *hdr); const struct nfs_pgio_completion_ops *completion_ops; + const struct nfs_rw_ops *rw_ops; struct nfs_direct_req *dreq; void *layout_private; spinlock_t lock; -- cgit v1.2.3-71-gd317 From a4cdda59111f92000297e0d3edb1e0e08ba3549b Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:31 -0400 Subject: NFS: Create a common pgio_rpc_prepare function The read and write paths do exactly the same thing for the rpc_prepare rpc_op. This patch combines them together into a single function. 
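As a rough, self-contained sketch of the resulting shape (the types and names below are simplified stand-ins for illustration, not the real NFS structures), the per-direction difference collapses into an ops vector that one shared prepare routine consults:

#include <stdio.h>

typedef unsigned int fmode_t;	/* stand-in for the kernel typedef */
#define FMODE_READ	0x1u
#define FMODE_WRITE	0x2u

struct rw_ops {
	fmode_t rw_mode;	/* the only per-direction fact prepare needs */
};

struct pgio_data {
	const struct rw_ops *rw_ops;
};

/* One shared prepare path, standing in for the combined rpc_prepare op */
static int pgio_rpc_prepare(struct pgio_data *data)
{
	printf("preparing I/O, mode %#x\n", data->rw_ops->rw_mode);
	return 0;
}

static const struct rw_ops read_ops  = { .rw_mode = FMODE_READ };
static const struct rw_ops write_ops = { .rw_mode = FMODE_WRITE };

int main(void)
{
	struct pgio_data rd = { .rw_ops = &read_ops };
	struct pgio_data wr = { .rw_ops = &write_ops };

	pgio_rpc_prepare(&rd);	/* both directions run the same code path */
	pgio_rpc_prepare(&wr);
	return 0;
}

The payoff is visible in the NFSv4 hunk below: nfs4_proc_pgio_rpc_prepare() passes data->header->rw_ops->rw_mode to nfs4_set_rw_stateid() instead of hard-coding FMODE_READ or FMODE_WRITE in two near-identical callbacks.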
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 ++ fs/nfs/nfs3proc.c | 11 ++--------- fs/nfs/nfs4proc.c | 22 +++------------------- fs/nfs/pagelist.c | 26 ++++++++++++++++++++++++++ fs/nfs/proc.c | 11 ++--------- fs/nfs/read.c | 19 +++---------------- fs/nfs/write.c | 19 +++++-------------- include/linux/nfs_page.h | 2 ++ include/linux/nfs_xdr.h | 3 +-- 9 files changed, 46 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9d6a40eae11c..1959260f8c57 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -241,6 +241,8 @@ struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); void nfs_rw_header_free(struct nfs_pgio_header *); struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *, unsigned int); void nfs_pgio_data_release(struct nfs_pgio_data *); +void nfs_pgio_prepare(struct rpc_task *, void *); +void nfs_pgio_release(void *); static inline void nfs_iocounter_init(struct nfs_io_counter *c) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d235369c3dfb..e7daa42bbc86 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -812,7 +812,7 @@ static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; @@ -834,12 +834,6 @@ static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } -static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { rpc_call_start(task); @@ -946,11 +940,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .fsinfo = nfs3_proc_fsinfo, .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, + .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare, .read_setup = nfs3_proc_read_setup, - .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, - .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e793aa91454a..44fb93a66d26 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4089,7 +4089,7 @@ static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); } -static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, @@ -4097,7 +4097,7 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_pgio_dat task)) return 0; if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_READ) == -EIO) + data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) return -EIO; @@ -4177,21 +4177,6 @@ static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct 
rpc_message nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } -static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) -{ - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return 0; - if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_WRITE) == -EIO) - return -EIO; - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) - return -EIO; - return 0; -} - static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { nfs4_setup_sequence(NFS_SERVER(data->inode), @@ -8432,11 +8417,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .pathconf = nfs4_proc_pathconf, .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, + .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare, .read_setup = nfs4_proc_read_setup, - .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, - .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ca356fe0836b..0fa211d35e40 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -386,6 +386,32 @@ void nfs_pgio_data_release(struct nfs_pgio_data *data) } EXPORT_SYMBOL_GPL(nfs_pgio_data_release); +/** + * nfs_pgio_prepare - Prepare pageio data to go over the wire + * @task: The current task + * @calldata: pageio data to prepare + */ +void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + int err; + err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); + if (err) + rpc_exit(task, err); +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio data to release + */ +void nfs_pgio_release(void *calldata) +{ + struct nfs_pgio_data *data = calldata; + if (data->header->rw_ops->rw_release) + data->header->rw_ops->rw_release(data); + nfs_pgio_data_release(data); +} + /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c54829eb2156..c171ce1a8a30 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -599,7 +599,7 @@ static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message * msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; @@ -621,12 +621,6 @@ static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } -static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { BUG(); @@ -734,11 +728,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .fsinfo = nfs_proc_fsinfo, .pathconf = nfs_proc_pathconf, .decode_dirent = nfs2_decode_dirent, + .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare, .read_setup = nfs_proc_read_setup, - .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, - .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = 
nfs_write_done, .commit_setup = nfs_proc_commit_setup, .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 4cf3577bd54e..cfa15e828dd6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -454,24 +454,10 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) nfs_readpage_retry(task, data); } -static void nfs_readpage_release_common(void *calldata) -{ - nfs_pgio_data_release(calldata); -} - -void nfs_read_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_pgio_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - static const struct rpc_call_ops nfs_read_common_ops = { - .rpc_call_prepare = nfs_read_prepare, + .rpc_call_prepare = nfs_pgio_prepare, .rpc_call_done = nfs_readpage_result_common, - .rpc_release = nfs_readpage_release_common, + .rpc_release = nfs_pgio_release, }; /* @@ -636,6 +622,7 @@ void nfs_destroy_readpagecache(void) } static const struct nfs_rw_ops nfs_rw_read_ops = { + .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9c5cde38da45..ae799c96ec2b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1248,15 +1248,6 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -void nfs_write_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_pgio_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - void nfs_commit_prepare(struct rpc_task *task, void *calldata) { struct nfs_commit_data *data = calldata; @@ -1278,9 +1269,8 @@ static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) nfs_writeback_done(task, data); } -static void nfs_writeback_release_common(void *calldata) +static void nfs_writeback_release_common(struct nfs_pgio_data *data) { - struct nfs_pgio_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; @@ -1294,13 +1284,12 @@ static void nfs_writeback_release_common(void *calldata) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } - nfs_pgio_data_release(data); } static const struct rpc_call_ops nfs_write_common_ops = { - .rpc_call_prepare = nfs_write_prepare, + .rpc_call_prepare = nfs_pgio_prepare, .rpc_call_done = nfs_writeback_done_common, - .rpc_release = nfs_writeback_release_common, + .rpc_release = nfs_pgio_release, }; /* @@ -1918,6 +1907,8 @@ void nfs_destroy_writepagecache(void) } static const struct nfs_rw_ops nfs_rw_write_ops = { + .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, + .rw_release = nfs_writeback_release_common, }; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 594812546c25..da00a4d6f470 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -53,8 +53,10 @@ struct nfs_pageio_ops { }; struct nfs_rw_ops { + const fmode_t rw_mode; struct nfs_rw_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_rw_header *); + void (*rw_release)(struct nfs_pgio_data *); }; struct nfs_pageio_descriptor { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a1b91b67145e..adef7bd2d06d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1429,11 +1429,10 @@ struct nfs_rpc_ops { struct nfs_pathconf *); int 
(*set_capabilities)(struct nfs_server *, struct nfs_fh *); int (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int); + int (*pgio_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *); void (*read_setup) (struct nfs_pgio_data *, struct rpc_message *); - int (*read_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *); int (*read_done) (struct rpc_task *, struct nfs_pgio_data *); void (*write_setup) (struct nfs_pgio_data *, struct rpc_message *); - int (*write_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *); int (*write_done) (struct rpc_task *, struct nfs_pgio_data *); void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *); void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *); -- cgit v1.2.3-71-gd317 From 0eecb2145c1ce18e36617008424a93836ad0a3bd Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:32 -0400 Subject: NFS: Create a common nfs_pgio_result_common function Combining these functions will let me make a single nfs_rw_common_ops struct (see the next patch). Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 1 + fs/nfs/pagelist.c | 23 ++++++++++++++++++++++ fs/nfs/read.c | 25 ++++++++---------------- fs/nfs/write.c | 51 +++++++++++++++++++----------------------------- include/linux/nfs_fs.h | 2 -- include/linux/nfs_page.h | 2 ++ 6 files changed, 54 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1959260f8c57..7c0ae364bdad 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -243,6 +243,7 @@ struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *, unsigned int void nfs_pgio_data_release(struct nfs_pgio_data *); void nfs_pgio_prepare(struct rpc_task *, void *); void nfs_pgio_release(void *); +void nfs_pgio_result(struct rpc_task *, void *); static inline void nfs_iocounter_init(struct nfs_io_counter *c) { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 0fa211d35e40..f74df87058b6 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -24,6 +24,8 @@ #include "internal.h" #include "pnfs.h" +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + static struct kmem_cache *nfs_page_cachep; static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) @@ -447,6 +449,27 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, } EXPORT_SYMBOL_GPL(nfs_pageio_init); +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio data to check + */ +void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + struct inode *inode = data->header->inode; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, + task->tk_pid, task->tk_status); + + if (data->header->rw_ops->rw_done(task, data, inode) != 0) + return; + if (task->tk_status < 0) + nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); + else + data->header->rw_ops->rw_result(task, data); +} + static bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index cfa15e828dd6..bc78bd248eb8 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -388,15 +388,10 @@ static const struct nfs_pageio_ops nfs_pageio_read_ops = { * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). 
*/ -int nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct inode *inode = data->header->inode; - int status; - - dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, - task->tk_status); - - status = NFS_PROTO(inode)->read_done(task, data); + int status = NFS_PROTO(inode)->read_done(task, data); if (status != 0) return status; @@ -429,17 +424,11 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data rpc_restart_call_prepare(task); } -static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) +static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_pgio_data *data = calldata; struct nfs_pgio_header *hdr = data->header; - /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */ - if (nfs_readpage_result(task, data) != 0) - return; - if (task->tk_status < 0) - nfs_set_pgio_error(hdr, task->tk_status, data->args.offset); - else if (data->res.eof) { + if (data->res.eof) { loff_t bound; bound = data->args.offset + data->res.count; @@ -456,7 +445,7 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) static const struct rpc_call_ops nfs_read_common_ops = { .rpc_call_prepare = nfs_pgio_prepare, - .rpc_call_done = nfs_readpage_result_common, + .rpc_call_done = nfs_pgio_result, .rpc_release = nfs_pgio_release, }; @@ -625,4 +614,6 @@ static const struct nfs_rw_ops nfs_rw_read_ops = { .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, + .rw_done = nfs_readpage_done, + .rw_result = nfs_readpage_result, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ae799c96ec2b..1d3e1d75c8c5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1255,20 +1255,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -/* - * Handle a write reply that flushes a whole page. - * - * FIXME: There is an inherent race with invalidate_inode_pages and - * writebacks since the page->count is kept > 1 for as long - * as the page has a write request pending. - */ -static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) -{ - struct nfs_pgio_data *data = calldata; - - nfs_writeback_done(task, data); -} - static void nfs_writeback_release_common(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1288,7 +1274,7 @@ static void nfs_writeback_release_common(struct nfs_pgio_data *data) static const struct rpc_call_ops nfs_write_common_ops = { .rpc_call_prepare = nfs_pgio_prepare, - .rpc_call_done = nfs_writeback_done_common, + .rpc_call_done = nfs_pgio_result, .rpc_release = nfs_pgio_release, }; @@ -1320,16 +1306,11 @@ static int nfs_should_remove_suid(const struct inode *inode) /* * This function is called when the WRITE call is complete. */ -void nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data) +static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct nfs_pgio_args *argp = &data->args; - struct nfs_pgio_res *resp = &data->res; - struct inode *inode = data->header->inode; int status; - dprintk("NFS: %5u nfs_writeback_done (status %d)\n", - task->tk_pid, task->tk_status); - /* * ->write_done will attempt to use post-op attributes to detect * conflicting writes by other clients. 
A strict interpretation @@ -1339,11 +1320,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data) */ status = NFS_PROTO(inode)->write_done(task, data); if (status != 0) - return; - nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) - if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { /* We tried a write call, but the server did not * commit data to stable storage even though we * requested it. @@ -1359,25 +1340,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data) dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(inode)->nfs_client->cl_hostname, - resp->verf->committed, argp->stable); + data->res.verf->committed, data->args.stable); complain = jiffies + 300 * HZ; } } #endif - if (task->tk_status < 0) { - nfs_set_pgio_error(data->header, task->tk_status, argp->offset); - return; - } /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) nfs_mark_for_revalidate(inode); + return 0; +} + +/* + * This function is called when the WRITE call is complete. + */ +static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; if (resp->count < argp->count) { static unsigned long complain; /* This a short write! */ - nfs_inc_stats(inode, NFSIOS_SHORTWRITE); + nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ if (resp->count == 0) { @@ -1911,4 +1898,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = { .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, .rw_release = nfs_writeback_release_common, + .rw_done = nfs_writeback_done, + .rw_result = nfs_writeback_result, }; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 7e0db561d829..919576b8e2cf 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -520,7 +520,6 @@ extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); -extern void nfs_writeback_done(struct rpc_task *, struct nfs_pgio_data *); /* * Try to write back everything synchronously (but check the @@ -553,7 +552,6 @@ nfs_have_writebacks(struct inode *inode) extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern int nfs_readpage_result(struct rpc_task *, struct nfs_pgio_data *); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index da00a4d6f470..01aa29c5ec42 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -57,6 +57,8 @@ struct nfs_rw_ops { struct nfs_rw_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_rw_header *); void (*rw_release)(struct nfs_pgio_data *); + int (*rw_done)(struct rpc_task *, struct nfs_pgio_data *, struct inode *); + void (*rw_result)(struct rpc_task *, struct nfs_pgio_data *); }; struct 
nfs_pageio_descriptor { -- cgit v1.2.3-71-gd317 From b98abe52fa8e2a3797d3cc2db3d0e109f4549c03 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 28 May 2014 23:51:53 -0700 Subject: Input: add common DT binding for touchscreens Add common DT binding documentation for touchscreen devices and implement touchscreen_parse_of_params, which parses the common properties and configures the input device accordingly. The function currently does not interpret the axis inversion properties, since there is no matching flag in the generic Linux input device. Reviewed-by: Pavel Machek Acked-by: Aaro Koskinen Signed-off-by: Sebastian Reichel Signed-off-by: Dmitry Torokhov --- .../bindings/input/touchscreen/touchscreen.txt | 27 +++++++++++++ drivers/input/touchscreen/Kconfig | 4 ++ drivers/input/touchscreen/Makefile | 1 + drivers/input/touchscreen/of_touchscreen.c | 45 ++++++++++++++++++++++ include/linux/input/touchscreen.h | 22 +++++++++++ 5 files changed, 99 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt create mode 100644 drivers/input/touchscreen/of_touchscreen.c create mode 100644 include/linux/input/touchscreen.h (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt b/Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt new file mode 100644 index 000000000000..d8e06163c54e --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt @@ -0,0 +1,27 @@ +General Touchscreen Properties: + +Optional properties for Touchscreens: + - touchscreen-size-x : horizontal resolution of touchscreen + (in pixels) + - touchscreen-size-y : vertical resolution of touchscreen + (in pixels) + - touchscreen-max-pressure : maximum reported pressure (arbitrary range + dependent on the controller) + - touchscreen-fuzz-x : horizontal noise value of the absolute input + device (in pixels) + - touchscreen-fuzz-y : vertical noise value of the absolute input + device (in pixels) + - touchscreen-fuzz-pressure : pressure noise value of the absolute input + device (arbitrary range dependent on the + controller) + - touchscreen-inverted-x : X axis is inverted (boolean) + - touchscreen-inverted-y : Y axis is inverted (boolean) + +Deprecated properties for Touchscreens: + - x-size : deprecated name for touchscreen-size-x + - y-size : deprecated name for touchscreen-size-y + - moving-threshold : deprecated name for a combination of + touchscreen-fuzz-x and touchscreen-fuzz-y + - contact-threshold : deprecated name for touchscreen-fuzz-pressure + - x-invert : deprecated name for touchscreen-inverted-x + - y-invert : deprecated name for touchscreen-inverted-y diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 43389c097b02..8e07fe8505fd 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -11,6 +11,10 @@ menuconfig INPUT_TOUCHSCREEN if INPUT_TOUCHSCREEN +config OF_TOUCHSCREEN + def_tristate INPUT + depends on INPUT && OF + config TOUCHSCREEN_88PM860X tristate "Marvell 88PM860x touchscreen" depends on MFD_88PM860X diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index 71a97559ce68..4d479fb0a768 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -6,6 +6,7 @@ wm97xx-ts-y := wm97xx-core.o +obj-$(CONFIG_OF_TOUCHSCREEN) += of_touchscreen.o obj-$(CONFIG_TOUCHSCREEN_88PM860X) += 88pm860x-ts.o obj-$(CONFIG_TOUCHSCREEN_AD7877) += ad7877.o
obj-$(CONFIG_TOUCHSCREEN_AD7879) += ad7879.o diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen/of_touchscreen.c new file mode 100644 index 000000000000..f8f9b84230b1 --- /dev/null +++ b/drivers/input/touchscreen/of_touchscreen.c @@ -0,0 +1,45 @@ +/* + * Generic DT helper functions for touchscreen devices + * + * Copyright (c) 2014 Sebastian Reichel + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/of.h> +#include <linux/input.h> +#include <linux/input/touchscreen.h> + +/** + * touchscreen_parse_of_params - parse common touchscreen DT properties + * @dev: device that should be parsed + * + * This function parses common DT properties for touchscreens and sets up the + * input device accordingly. The function keeps previously set default + * values if no value is specified via DT. + */ +void touchscreen_parse_of_params(struct input_dev *dev) +{ + struct device_node *np = dev->dev.parent->of_node; + struct input_absinfo *absinfo; + + input_alloc_absinfo(dev); + if (!dev->absinfo) + return; + + absinfo = &dev->absinfo[ABS_X]; + of_property_read_u32(np, "touchscreen-size-x", &absinfo->maximum); + of_property_read_u32(np, "touchscreen-fuzz-x", &absinfo->fuzz); + + absinfo = &dev->absinfo[ABS_Y]; + of_property_read_u32(np, "touchscreen-size-y", &absinfo->maximum); + of_property_read_u32(np, "touchscreen-fuzz-y", &absinfo->fuzz); + + absinfo = &dev->absinfo[ABS_PRESSURE]; + of_property_read_u32(np, "touchscreen-max-pressure", &absinfo->maximum); + of_property_read_u32(np, "touchscreen-fuzz-pressure", &absinfo->fuzz); +} +EXPORT_SYMBOL(touchscreen_parse_of_params); diff --git a/include/linux/input/touchscreen.h b/include/linux/input/touchscreen.h new file mode 100644 index 000000000000..08a5ef6e8f25 --- /dev/null +++ b/include/linux/input/touchscreen.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2014 Sebastian Reichel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#ifndef _TOUCHSCREEN_H +#define _TOUCHSCREEN_H + +#include <linux/input.h> + +#ifdef CONFIG_OF +void touchscreen_parse_of_params(struct input_dev *dev); +#else +static inline void touchscreen_parse_of_params(struct input_dev *dev) +{ +} +#endif + +#endif -- cgit v1.2.3-71-gd317 From 61721c88b8d85c9dc13bfeedf75dfc245f397c3c Mon Sep 17 00:00:00 2001 From: Joachim Eastwood Date: Thu, 29 May 2014 00:30:02 -0700 Subject: Input: omap-keypad - remove platform data support This is unused since all users (OMAP4/5) are DT only. Signed-off-by: Joachim Eastwood Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 2 +- drivers/input/keyboard/omap4-keypad.c | 32 ++++++------------------------ include/linux/platform_data/omap4-keypad.h | 13 ------------ 3 files changed, 7 insertions(+), 40 deletions(-) delete mode 100644 include/linux/platform_data/omap4-keypad.h (limited to 'include/linux') diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 948a30304870..0f84f2346fe4 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -589,7 +589,7 @@ config KEYBOARD_OMAP config KEYBOARD_OMAP4 tristate "TI OMAP4+ keypad support" - depends on ARCH_OMAP2PLUS + depends on OF || ARCH_OMAP2PLUS select INPUT_MATRIXKMAP help Say Y here if you want to use the OMAP4+ keypad.
diff --git a/drivers/input/keyboard/omap4-keypad.c b/drivers/input/keyboard/omap4-keypad.c index 0400b3f2b4b9..024b7bdffe5b 100644 --- a/drivers/input/keyboard/omap4-keypad.c +++ b/drivers/input/keyboard/omap4-keypad.c @@ -28,11 +28,10 @@ #include #include #include +#include #include #include -#include - /* OMAP4 registers */ #define OMAP4_KBD_REVISION 0x00 #define OMAP4_KBD_SYSCONFIG 0x10 @@ -218,7 +217,6 @@ static void omap4_keypad_close(struct input_dev *input) pm_runtime_put_sync(input->dev.parent); } -#ifdef CONFIG_OF static int omap4_keypad_parse_dt(struct device *dev, struct omap4_keypad *keypad_data) { @@ -235,20 +233,9 @@ static int omap4_keypad_parse_dt(struct device *dev, return 0; } -#else -static inline int omap4_keypad_parse_dt(struct device *dev, - struct omap4_keypad *keypad_data) -{ - return -ENOSYS; -} -#endif static int omap4_keypad_probe(struct platform_device *pdev) { - const struct omap4_keypad_platform_data *pdata = - dev_get_platdata(&pdev->dev); - const struct matrix_keymap_data *keymap_data = - pdata ? pdata->keymap_data : NULL; struct omap4_keypad *keypad_data; struct input_dev *input_dev; struct resource *res; @@ -277,14 +264,9 @@ static int omap4_keypad_probe(struct platform_device *pdev) keypad_data->irq = irq; - if (pdata) { - keypad_data->rows = pdata->rows; - keypad_data->cols = pdata->cols; - } else { - error = omap4_keypad_parse_dt(&pdev->dev, keypad_data); - if (error) - return error; - } + error = omap4_keypad_parse_dt(&pdev->dev, keypad_data); + if (error) + return error; res = request_mem_region(res->start, resource_size(res), pdev->name); if (!res) { @@ -363,7 +345,7 @@ static int omap4_keypad_probe(struct platform_device *pdev) goto err_free_input; } - error = matrix_keypad_build_keymap(keymap_data, NULL, + error = matrix_keypad_build_keymap(NULL, NULL, keypad_data->rows, keypad_data->cols, keypad_data->keymap, input_dev); if (error) { @@ -434,13 +416,11 @@ static int omap4_keypad_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_OF static const struct of_device_id omap_keypad_dt_match[] = { { .compatible = "ti,omap4-keypad" }, {}, }; MODULE_DEVICE_TABLE(of, omap_keypad_dt_match); -#endif #ifdef CONFIG_PM_SLEEP static int omap4_keypad_suspend(struct device *dev) @@ -482,7 +462,7 @@ static struct platform_driver omap4_keypad_driver = { .name = "omap4-keypad", .owner = THIS_MODULE, .pm = &omap4_keypad_pm_ops, - .of_match_table = of_match_ptr(omap_keypad_dt_match), + .of_match_table = omap_keypad_dt_match, }, }; module_platform_driver(omap4_keypad_driver); diff --git a/include/linux/platform_data/omap4-keypad.h b/include/linux/platform_data/omap4-keypad.h deleted file mode 100644 index 4eef5fb05a17..000000000000 --- a/include/linux/platform_data/omap4-keypad.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __LINUX_INPUT_OMAP4_KEYPAD_H -#define __LINUX_INPUT_OMAP4_KEYPAD_H - -#include - -struct omap4_keypad_platform_data { - const struct matrix_keymap_data *keymap_data; - - u8 rows; - u8 cols; -}; - -#endif /* __LINUX_INPUT_OMAP4_KEYPAD_H */ -- cgit v1.2.3-71-gd317 From 1ed26f33008e954a8e91d26f97d4380dea8145db Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 6 May 2014 09:12:37 -0400 Subject: NFS: Create a common initiate_pgio() function Most of this code is the same for both the read and write paths, so combine everything and use the rw_ops when necessary. 
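To make the shape of this consolidation concrete, here is a stand-alone C sketch of the same pattern: one generic initiator that defers direction-specific setup to a per-direction callback table. The io_data and rw_ops types below are simplified stand-ins invented for illustration, not the kernel's nfs_pgio_data, nfs_rw_ops or rpc_task_setup structures.

/* Toy model of the patch's structure: the common launch path lives in
 * one function and asks a per-direction ops table for the few steps
 * that really differ between reads and writes. */
#include <stdio.h>

struct io_data { const char *path; unsigned count; int how; };

struct rw_ops {
	const char *name;
	/* direction-specific tweaks applied before the common launch */
	void (*initiate)(struct io_data *data, int how);
};

static void read_initiate(struct io_data *data, int how)
{
	(void)how; /* reads ignore the sync flags */
	printf("setting up read of %s\n", data->path);
}

static void write_initiate(struct io_data *data, int how)
{
	data->how = how; /* writes honor the sync/priority flags */
	printf("setting up write of %s\n", data->path);
}

/* Everything that used to be duplicated between the read and write
 * initiators lives here exactly once. */
static int initiate_pgio(const struct rw_ops *ops, struct io_data *data, int how)
{
	ops->initiate(data, how);
	printf("launching %s task: %u bytes\n", ops->name, data->count);
	return 0;
}

int main(void)
{
	struct io_data d = { "file", 4096, 0 };
	const struct rw_ops read_ops = { "read", read_initiate };
	const struct rw_ops write_ops = { "write", write_initiate };

	initiate_pgio(&read_ops, &d, 0);
	initiate_pgio(&write_ops, &d, 1 /* stand-in for FLUSH_SYNC */);
	return 0;
}

The design choice mirrors the patch: the shared initiator owns task construction, and each direction only adjusts what it must (swap flags for reads, sync priority for writes).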
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 9 ++------ fs/nfs/nfs4filelayout.c | 6 +++--- fs/nfs/pagelist.c | 46 ++++++++++++++++++++++++++++++++++++++++ fs/nfs/read.c | 42 ++++++------------------------------ fs/nfs/write.c | 55 ++++++------------------------------------------ include/linux/nfs_page.h | 2 ++ 6 files changed, 66 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 365cdb11d0de..be4f2a7e9178 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -241,6 +241,8 @@ struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); void nfs_rw_header_free(struct nfs_pgio_header *); void nfs_pgio_data_release(struct nfs_pgio_data *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, + const struct rpc_call_ops *, int, int); static inline void nfs_iocounter_init(struct nfs_io_counter *c) { @@ -402,9 +404,6 @@ struct nfs_pgio_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_pgio_data *data, - const struct rpc_call_ops *call_ops, int flags); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); @@ -425,10 +424,6 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); -extern int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_pgio_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index e6936147ad95..7954e16a6d83 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -568,8 +568,8 @@ filelayout_read_pagelist(struct nfs_pgio_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_read(ds_clnt, data, - &filelayout_read_call_ops, RPC_TASK_SOFTCONN); + nfs_initiate_pgio(ds_clnt, data, + &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } @@ -613,7 +613,7 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_write(ds_clnt, data, + nfs_initiate_pgio(ds_clnt, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d8d25a4deb88..ab5b1850ca4f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -447,6 +447,52 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) rpc_exit(task, err); } +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, + const struct rpc_call_ops *call_ops, int how, int flags) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->header->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + 
.task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, + }; + int ret = 0; + + data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + + dprintk("NFS: %5u initiated pgio call " + "(req %s/%llu, %u bytes @ offset %llu)\n", + data->task.tk_pid, + data->header->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(data->header->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + /** * nfs_pgio_error - Clean up from a pageio error * @desc: IO descriptor diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 4fcef82d78b4..0359b0d76ef6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -151,53 +151,22 @@ out: hdr->release(hdr); } -int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_pgio_data *data, - const struct rpc_call_ops *call_ops, int flags) +static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - .rpc_client = clnt, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | swap_flags | flags, - }; - /* Set up the initial task struct. 
*/ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ " - "offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; + task_setup_data->flags |= swap_flags; + NFS_PROTO(inode)->read_setup(data, msg); } -EXPORT_SYMBOL_GPL(nfs_initiate_read); static int nfs_do_read(struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops) { struct inode *inode = data->header->inode; - return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0); + return nfs_initiate_pgio(NFS_CLIENT(inode), data, call_ops, 0, 0); } static int @@ -491,4 +460,5 @@ static const struct nfs_rw_ops nfs_rw_read_ops = { .rw_free_header = nfs_readhdr_free, .rw_done = nfs_readpage_done, .rw_result = nfs_readpage_result, + .rw_initiate = nfs_initiate_read, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0e34c7024195..e46a1fc6c1fe 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -932,60 +932,18 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_pgio_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags) +static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int priority = flush_task_priority(how); - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = clnt, - .task = &data->task, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, - .priority = priority, - }; - int ret = 0; - /* Set up the initial task struct. 
*/ - NFS_PROTO(inode)->write_setup(data, &msg); - - dprintk("NFS: %5u initiated write call " - "(req %s/%llu, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); + task_setup_data->priority = priority; + NFS_PROTO(inode)->write_setup(data, msg); nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, - &task_setup_data.rpc_client, &msg, data); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; + &task_setup_data->rpc_client, msg, data); } -EXPORT_SYMBOL_GPL(nfs_initiate_write); static int nfs_do_write(struct nfs_pgio_data *data, const struct rpc_call_ops *call_ops, @@ -993,7 +951,7 @@ static int nfs_do_write(struct nfs_pgio_data *data, { struct inode *inode = data->header->inode; - return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0); + return nfs_initiate_pgio(NFS_CLIENT(inode), data, call_ops, how, 0); } static int nfs_do_multiple_writes(struct list_head *head, @@ -1743,4 +1701,5 @@ static const struct nfs_rw_ops nfs_rw_write_ops = { .rw_release = nfs_writeback_release_common, .rw_done = nfs_writeback_done, .rw_result = nfs_writeback_result, + .rw_initiate = nfs_initiate_write, }; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 01aa29c5ec42..c6a587f7118f 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -59,6 +59,8 @@ struct nfs_rw_ops { void (*rw_release)(struct nfs_pgio_data *); int (*rw_done)(struct rpc_task *, struct nfs_pgio_data *, struct inode *); void (*rw_result)(struct rpc_task *, struct nfs_pgio_data *); + void (*rw_initiate)(struct nfs_pgio_data *, struct rpc_message *, + struct rpc_task_setup *, int); }; struct nfs_pageio_descriptor { -- cgit v1.2.3-71-gd317 From 12c05792599ec57ebab33096b2c75b863dfe6ea4 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:41 -0400 Subject: nfs: clean up PG_* flags Remove unused flags PG_NEED_COMMIT and PG_NEED_RESCHED. Add comments describing how each flag is used. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- include/linux/nfs_page.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index c6a587f7118f..eb2eb6396874 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -22,12 +22,10 @@ * Valid flags for a dirty buffer */ enum { - PG_BUSY = 0, - PG_MAPPED, - PG_CLEAN, - PG_NEED_COMMIT, - PG_NEED_RESCHED, - PG_COMMIT_TO_DS, + PG_BUSY = 0, /* nfs_{un}lock_request */ + PG_MAPPED, /* page private set for buffered io */ + PG_CLEAN, /* write succeeded */ + PG_COMMIT_TO_DS, /* used by pnfs layouts */ }; struct nfs_inode; -- cgit v1.2.3-71-gd317 From 8c8f1ac109726e4ed44a920f5c962c84610d4a17 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:42 -0400 Subject: nfs: remove unused arg from nfs_create_request @inode is passed but not used. 
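The parameter was removable because nfs_create_request never dereferenced it, and callers that do need an inode already have other ways to reach it; readpage_async_filler, for instance, had derived it from the page via page_file_mapping(page)->host purely to pass it along. A rough user-space sketch of the same API-slimming idea, with toy inode/page/request types invented for illustration rather than the kernel's structures:

#include <assert.h>
#include <stdio.h>

struct inode { int id; };
struct page { struct inode *owner; };
struct request { struct page *page; unsigned offset, count; };

/* after the cleanup: no redundant inode argument */
static struct request create_request(struct page *page,
				     unsigned offset, unsigned count)
{
	struct request req = { page, offset, count };
	return req;
}

int main(void)
{
	struct inode ino = { 42 };
	struct page pg = { &ino };
	struct request req = create_request(&pg, 0, 512);

	/* callers that need the inode recover it from the page */
	assert(req.page->owner == &ino);
	printf("request on inode %d\n", req.page->owner->id);
	return 0;
}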
Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 6 ++---- fs/nfs/pagelist.c | 4 +--- fs/nfs/read.c | 5 ++--- fs/nfs/write.c | 2 +- include/linux/nfs_page.h | 1 - 5 files changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 164b0167677b..1dd8c622d719 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -380,8 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */ - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -750,8 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 29591094125a..4b4b212ec6b2 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -139,7 +139,6 @@ nfs_iocounter_wait(struct nfs_io_counter *c) /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use - * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write * @count: number of bytes to read/write @@ -149,8 +148,7 @@ nfs_iocounter_wait(struct nfs_io_counter *c) * User should ensure it is safe to sleep in this function. 
*/ struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, - struct page *page, +nfs_create_request(struct nfs_open_context *ctx, struct page *page, unsigned int offset, unsigned int count) { struct nfs_page *req; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 3986668e4390..46d90448f69b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -85,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, inode, page, 0, len); + new = nfs_create_request(ctx, page, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -303,7 +303,6 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page_file_mapping(page)->host; struct nfs_page *new; unsigned int len; int error; @@ -312,7 +311,7 @@ readpage_async_filler(void *data, struct page *page) if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, inode, page, 0, len); + new = nfs_create_request(desc->ctx, page, 0, len); if (IS_ERR(new)) goto out_error; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2680f29f8a51..e773df207c05 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -761,7 +761,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, req = nfs_try_to_update_request(inode, page, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, inode, page, offset, bytes); + req = nfs_create_request(ctx, page, offset, bytes); if (IS_ERR(req)) goto out; nfs_inode_add_request(inode, req); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index eb2eb6396874..be0b0981e7a0 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -85,7 +85,6 @@ struct nfs_pageio_descriptor { #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, - struct inode *inode, struct page *page, unsigned int offset, unsigned int count); -- cgit v1.2.3-71-gd317 From b4fdac1a5150174df0847a45dc6612ce5ce3daeb Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:43 -0400 Subject: nfs: modify pg_test interface to return size_t This is a step toward allowing pg_test to inform the coalescing code to reduce the size of requests so they may fit in whatever scheme the pg_test callback wants to define. For now, just return the size of the request if there is space, or 0 if there is not. This shouldn't change any behavior as it acts the same as when the pg_test functions returned bool.
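As a rough self-contained illustration of the interface change (toy_desc and toy_req here are invented stand-ins for the kernel's pageio descriptor and request, with only the fields the test needs):

#include <stddef.h>
#include <stdio.h>

struct toy_desc { size_t bsize, count; };
struct toy_req { size_t bytes; };

/* old style: a bool can only answer yes or no */
static int pg_test_bool(const struct toy_desc *d, const struct toy_req *r)
{
	return d->count + r->bytes <= d->bsize;
}

/* new style: 0 means "cannot coalesce, flush first"; a nonzero value
 * is how many bytes fit. Today that is all-or-nothing, but the size_t
 * return leaves room for a partial answer later. */
static size_t pg_test_size(const struct toy_desc *d, const struct toy_req *r)
{
	if (d->count + r->bytes <= d->bsize)
		return r->bytes; /* the whole request fits */
	return 0;
}

int main(void)
{
	struct toy_desc d = { 4096, 3584 };
	struct toy_req r = { 512 };

	printf("bool test: %d, size test: %zu\n",
	       pg_test_bool(&d, &r), pg_test_size(&d, &r));
	return 0;
}

The migration keeps every existing caller correct: treating the returned size as a truth value reproduces the old bool behavior exactly.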
Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 16 ++++++++++++---- fs/nfs/nfs4filelayout.c | 12 +++++++----- fs/nfs/objlayout/objio_osd.c | 15 ++++++++++----- fs/nfs/pagelist.c | 22 +++++++++++++++++++--- fs/nfs/pnfs.c | 12 +++++++++--- fs/nfs/pnfs.h | 3 ++- include/linux/nfs_page.h | 5 +++-- 7 files changed, 62 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 206cc68c9694..9b431f44fad9 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1189,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) pnfs_generic_pg_init_read(pgio, req); } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, SECTOR_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } @@ -1241,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, PAGE_CACHE_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 9fd7cebbff04..ba9a9aadf6c8 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -915,10 +915,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * - * return true : coalesce page - * return false : don't coalesce page + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. */ -static bool +static size_t filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { @@ -927,7 +927,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, if (!pnfs_generic_pg_test(pgio, prev, req) || !nfs_generic_pg_test(pgio, prev, req)) - return false; + return 0; p_stripe = (u64)req_offset(prev); r_stripe = (u64)req_offset(req); @@ -936,7 +936,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, do_div(p_stripe, stripe_unit); do_div(r_stripe, stripe_unit); - return (p_stripe == r_stripe); + if (p_stripe == r_stripe) + return req->wb_bytes; + return 0; } static void diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 426b366b0b33..71b9c69dbe9c 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -564,14 +564,19 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) return 0; } -static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (!pnfs_generic_pg_test(pgio, prev, req)) - return false; + if (!pnfs_generic_pg_test(pgio, prev, req) || + pgio->pg_count + req->wb_bytes > + (unsigned long)pgio->pg_layout_private) + return 0; - return pgio->pg_count + req->wb_bytes <= - (unsigned long)pgio->pg_layout_private; + return req->wb_bytes; } static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 4b4b212ec6b2..82233431880d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -280,7 +280,17 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } -bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req cannot be coalesced into @desc, otherwise it returns + * the size of the request. + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, + struct nfs_page *prev, struct nfs_page *req) { /* * FIXME: ideally we should be able to coalesce all requests @@ -292,7 +302,9 @@ bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *pr if (desc->pg_bsize < PAGE_SIZE) return 0; - return desc->pg_count + req->wb_bytes <= desc->pg_bsize; + if (desc->pg_count + req->wb_bytes <= desc->pg_bsize) + return req->wb_bytes; + return 0; } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); @@ -747,6 +759,8 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { + size_t size; + if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; if (req->wb_context->dentry->d_inode->i_flock != NULL && @@ -758,7 +772,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; if (req_offset(req) != req_offset(prev) + prev->wb_bytes) return false; - return pgio->pg_ops->pg_test(pgio, prev, req); + size = pgio->pg_ops->pg_test(pgio, prev, req); + WARN_ON_ONCE(size && size != req->wb_bytes); + return size > 0; } /** diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0fe670189fd1..de6eb16f94d1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1434,7 +1434,11 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); -bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { @@ -1455,8 +1459,10 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, * first byte that lies outside the pnfs_layout_range. FIXME?
* */ - return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length); + if (req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length)) + return req->wb_bytes; + return 0; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 0031267d7492..dccf182ec4d8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -187,7 +187,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); -bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index be0b0981e7a0..13d59af561f6 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -46,7 +46,8 @@ struct nfs_page { struct nfs_pageio_descriptor; struct nfs_pageio_ops { void (*pg_init)(struct nfs_pageio_descriptor *, struct nfs_page *); - bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, + struct nfs_page *); int (*pg_doio)(struct nfs_pageio_descriptor *); }; @@ -102,7 +103,7 @@ extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc); extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t); -extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, +extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req); extern int nfs_wait_on_request(struct nfs_page *); -- cgit v1.2.3-71-gd317 From 2bfc6e566daa8386c9cffef2f7de17fc330d3835 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:45 -0400 Subject: nfs: add support for multiple nfs reqs per page Add "page groups" - a circular list of nfs requests (struct nfs_page) that all reference the same page. This gives nfs read and write paths the ability to account for sub-page regions independently. This somewhat follows the design of struct buffer_head's sub-page accounting. Only "head" requests are ever added/removed from the inode list in the buffered write path. "head" and "sub" requests are treated the same through the read path and the rest of the write/commit path. Requests are given an extra reference across the life of the list. Page groups are never rejoined after being split. If the read/write request fails and the client falls back to another path (ie revert to MDS in PNFS case), the already split requests are pushed through the recoalescing code again, which may split them further and then coalesce them into properly sized requests on the wire. Fragmentation shouldn't be a problem with the current design, because we flush all requests in page group when a non-contiguous request is added, so the only time resplitting should occur is on a resend of a read or write. This patch lays the groundwork for sub-page splitting, but does not actually do any splitting. 
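To make the data structure concrete before the patch body, here is a toy user-space sketch of such a circular page-group list; the req type and group_init helper are invented stand-ins for struct nfs_page and nfs_page_group_init, without the locking, flags and refcounting the real code needs:

#include <stdio.h>

struct req {
	unsigned offset, bytes;
	struct req *this_page; /* circular list of reqs for one page */
	struct req *head;      /* head request of the group */
};

/* Link @req into @prev's group, or make it a singleton group. */
static void group_init(struct req *req, struct req *prev)
{
	if (!prev) {
		req->head = req;
		req->this_page = req;
	} else {
		req->head = prev->head;
		req->this_page = prev->this_page;
		prev->this_page = req;
	}
}

int main(void)
{
	struct req a = { 0, 512 }, b = { 512, 512 }, c = { 1024, 512 };

	group_init(&a, NULL);  /* head request */
	group_init(&b, &a);    /* sub-requests covering the same page */
	group_init(&c, &b);

	/* the group can be walked starting from any member */
	struct req *r = &b;
	do {
		printf("req %u+%u (head at offset %u)\n",
		       r->offset, r->bytes, r->head->offset);
		r = r->this_page;
	} while (r != &b);
	return 0;
}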
For now all page groups have one request as pg_test functions don't yet split pages. There are several related patches that are needed to support multiple requests per page group. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 7 +- fs/nfs/pagelist.c | 220 ++++++++++++++++++++++++++++++++++++++++++++--- fs/nfs/read.c | 4 +- fs/nfs/write.c | 13 ++- include/linux/nfs_page.h | 13 ++- 5 files changed, 236 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1dd8c622d719..2c0e08f4cf71 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */ - req = nfs_create_request(dreq->ctx, pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_create_request(dreq->ctx, pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { + bool do_destroy = true; + req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); switch (bit) { @@ -834,6 +836,7 @@ case NFS_IOHDR_NEED_COMMIT: kref_get(&req->wb_kref); nfs_mark_request_commit(req, hdr->lseg, &cinfo); + do_destroy = false; } nfs_unlock_and_release_request(req); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f343f49ff596..015fb7b48dfe 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,6 +29,8 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; +static void nfs_free_request(struct nfs_page *); + static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; @@ -136,10 +138,151 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * + * this lock must be held if modifying the page group list + */ +void +nfs_page_group_lock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + int err = -EAGAIN; + + WARN_ON_ONCE(head != head->wb_head); + + while (err) + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + nfs_wait_bit_killable, TASK_KILLABLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + smp_mb__before_clear_bit(); + clear_bit(PG_HEADLOCK, &head->wb_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ + struct nfs_page *head = req->wb_head; + struct nfs_page *tmp; + 
WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + + tmp = req->wb_this_page; + while (tmp != req) { + if (!test_bit(bit, &tmp->wb_flags)) + return false; + tmp = tmp->wb_this_page; + } + + /* true! reset all bits */ + tmp = req; + do { + clear_bit(bit, &tmp->wb_flags); + tmp = tmp->wb_this_page; + } while (tmp != req); + + return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + * return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ + bool ret; + + nfs_page_group_lock(req); + ret = nfs_page_group_sync_on_bit_locked(req, bit); + nfs_page_group_unlock(req); + + return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + * or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ + WARN_ON_ONCE(prev == req); + + if (!prev) { + req->wb_head = req; + req->wb_this_page = req; + } else { + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); + req->wb_head = prev->wb_head; + req->wb_this_page = prev->wb_this_page; + prev->wb_this_page = req; + + /* grab extra ref if head request has extra ref from + * the write/commit path to handle handoff between write + * and commit lists */ + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) + kref_get(&req->wb_kref); + } +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. + */ +static void +nfs_page_group_destroy(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *tmp, *next; + + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) + return; + + tmp = req; + do { + next = tmp->wb_this_page; + /* unlink and free */ + tmp->wb_this_page = tmp; + tmp->wb_head = tmp; + nfs_free_request(tmp); + tmp = next; + } while (tmp != req); +} + /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use * @page: page to write + * @last: last nfs request created for this page group or NULL if head * @offset: starting offset within the page for the write * @count: number of bytes to read/write * @@ -149,7 +292,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c) */ struct nfs_page * nfs_create_request(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) + struct nfs_page *last, unsigned int offset, + unsigned int count) { struct nfs_page *req; struct nfs_lock_context *l_ctx; @@ -181,6 +325,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); + nfs_page_group_init(req, last); return req; } @@ -238,16 +383,18 @@ static void nfs_clear_request(struct nfs_page *req) } } - /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! 
*/ -static void nfs_free_request(struct kref *kref) +static void nfs_free_request(struct nfs_page *req) { - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + WARN_ON_ONCE(req->wb_this_page != req); + + /* extra debug: make sure no sync bits are still set */ + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); /* Release struct file and open context */ nfs_clear_request(req); @@ -256,7 +403,7 @@ static void nfs_free_request(struct kref *kref) void nfs_release_request(struct nfs_page *req) { - kref_put(&req->wb_kref, nfs_free_request); + kref_put(&req->wb_kref, nfs_page_group_destroy); } static int nfs_wait_bit_uninterruptible(void *word) @@ -832,21 +979,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * @desc: destination io descriptor * @req: request * + * This may split a request into subrequests which are all part of the + * same page group. + * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { - while (!nfs_pageio_do_add_request(desc, req)) { - desc->pg_moreio = 1; - nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - desc->pg_moreio = 0; - if (desc->pg_recoalesce) - return 0; - } + struct nfs_page *subreq; + unsigned int bytes_left = 0; + unsigned int offset, pgbase; + + nfs_page_group_lock(req); + + subreq = req; + bytes_left = subreq->wb_bytes; + offset = subreq->wb_offset; + pgbase = subreq->wb_pgbase; + + do { + if (!nfs_pageio_do_add_request(desc, subreq)) { + /* make sure pg_test call(s) did nothing */ + WARN_ON_ONCE(subreq->wb_bytes != bytes_left); + WARN_ON_ONCE(subreq->wb_offset != offset); + WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + + nfs_page_group_unlock(req); + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + desc->pg_moreio = 0; + if (desc->pg_recoalesce) + return 0; + /* retry add_request for this subreq */ + nfs_page_group_lock(req); + continue; + } + + /* check for buggy pg_test call(s) */ + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); + WARN_ON_ONCE(subreq->wb_bytes > bytes_left); + WARN_ON_ONCE(subreq->wb_bytes == 0); + + bytes_left -= subreq->wb_bytes; + offset += subreq->wb_bytes; + pgbase += subreq->wb_bytes; + + if (bytes_left) { + subreq = nfs_create_request(req->wb_context, + req->wb_page, + subreq, pgbase, bytes_left); + nfs_lock_request(subreq); + subreq->wb_offset = offset; + subreq->wb_index = req->wb_index; + } + } while (bytes_left > 0); + + nfs_page_group_unlock(req); return 1; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 46d90448f69b..902ba2c63d05 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -85,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, page, 0, len); + new = nfs_create_request(ctx, page, NULL, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -311,7 +311,7 @@ readpage_async_filler(void *data, struct page *page) if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, page, 0, len); + new = nfs_create_request(desc->ctx, page, NULL, 0, len); if (IS_ERR(new)) goto out_error; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e773df207c05..d0f30f12a8b3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -367,6 +367,8 @@ static void nfs_inode_add_request(struct inode *inode, struct 
nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); + WARN_ON_ONCE(req->wb_this_page != req); + /* Lock the request! */ nfs_lock_request(req); @@ -383,6 +385,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); } nfsi->npages++; + set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); } @@ -567,6 +570,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; unsigned long bytes = 0; + bool do_destroy; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; @@ -596,6 +600,7 @@ remove_req: next: nfs_unlock_request(req); nfs_end_page_writeback(req->wb_page); + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags); nfs_release_request(req); } out: @@ -700,6 +705,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, if (req == NULL) goto out_unlock; + /* should be handled by nfs_flush_incompatible */ + WARN_ON_ONCE(req->wb_head != req); + WARN_ON_ONCE(req->wb_this_page != req); + rqend = req->wb_offset + req->wb_bytes; /* * Tell the caller to flush out the request if @@ -761,7 +770,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, req = nfs_try_to_update_request(inode, page, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, page, offset, bytes); + req = nfs_create_request(ctx, page, NULL, offset, bytes); if (IS_ERR(req)) goto out; nfs_inode_add_request(inode, req); @@ -805,6 +814,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) return 0; l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || req->wb_context != ctx; + /* for now, flush if more than 1 request in page_group */ + do_flush |= req->wb_this_page != req; if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 13d59af561f6..986c0c279d0e 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -26,6 +26,9 @@ enum { PG_MAPPED, /* page private set for buffered io */ PG_CLEAN, /* write succeeded */ PG_COMMIT_TO_DS, /* used by pnfs layouts */ + PG_INODE_REF, /* extra ref held by inode (head req only) */ + PG_HEADLOCK, /* page group lock of wb_head */ + PG_TEARDOWN, /* page group sync for destroy */ }; struct nfs_inode; @@ -41,6 +44,8 @@ struct nfs_page { struct kref wb_kref; /* reference count */ unsigned long wb_flags; struct nfs_write_verifier wb_verf; /* Commit cookie */ + struct nfs_page *wb_this_page; /* list of reqs for this page */ + struct nfs_page *wb_head; /* head pointer for req list */ }; struct nfs_pageio_descriptor; @@ -87,9 +92,10 @@ struct nfs_pageio_descriptor { extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, struct page *page, + struct nfs_page *last, unsigned int offset, unsigned int count); -extern void nfs_release_request(struct nfs_page *req); +extern void nfs_release_request(struct nfs_page *); extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, @@ -108,7 +114,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *req); extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); -extern void nfs_unlock_and_release_request(struct nfs_page *req); +extern void nfs_unlock_and_release_request(struct nfs_page *); +extern void nfs_page_group_lock(struct 
nfs_page *); +extern void nfs_page_group_unlock(struct nfs_page *); +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); /* * Lock the page of an asynchronous request -- cgit v1.2.3-71-gd317 From 67d0338edd71db9a4f406d8778f7c525d31e9f7f Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:46 -0400 Subject: nfs: page group syncing in read path Operations that modify state for a whole page must be synchronized across all requests within a page group. In the read path, this is calling unlock_page and SetPageUptodate. Both of these functions should not be called until all requests in a page group have reached the point where they would call them. This patch should have no effect yet since all page groups currently have one request, but will come into play when pg_test functions are modified to split pages into sub-page regions. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 ++ fs/nfs/read.c | 22 +++++++++++++++++----- include/linux/nfs_page.h | 2 ++ 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 015fb7b48dfe..18ee4e99347e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -395,6 +395,8 @@ static void nfs_free_request(struct nfs_page *req) /* extra debug: make sure no sync bits are still set */ WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); /* Release struct file and open context */ nfs_clear_request(req); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 902ba2c63d05..53d5b83611ce 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -105,10 +105,16 @@ static void nfs_readpage_release(struct nfs_page *req) { struct inode *d_inode = req->wb_context->dentry->d_inode; - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, + (long long)req_offset(req)); - unlock_page(req->wb_page); + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, @@ -118,6 +124,12 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } +static void nfs_page_group_set_uptodate(struct nfs_page *req) +{ + if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) + SetPageUptodate(req->wb_page); +} + /* Note io was page aligned */ static void nfs_read_completion(struct nfs_pgio_header *hdr) { @@ -140,9 +152,9 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) bytes += req->wb_bytes; if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes <= hdr->good_bytes) - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); } else - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); nfs_readpage_release(req); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 986c0c279d0e..6385175a127b 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -29,6 +29,8 @@ enum { PG_INODE_REF, /* extra ref held by inode (head req only) */ PG_HEADLOCK, /* page group lock of wb_head */ PG_TEARDOWN, /* page group sync for destroy */ + PG_UNLOCKPAGE, /* page group sync bit in read
path */ + PG_UPTODATE, /* page group sync bit in read path */ }; struct nfs_inode; -- cgit v1.2.3-71-gd317 From 20633f042fd0907300069714b98aaf607a8b5bf8 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:47 -0400 Subject: nfs: page group syncing in write path Operations that modify state for a whole page must be synchronized across all requests within a page group. In the write path, this is calling end_page_writeback and removing the head request from an inode. Both of these operations should not be called until all requests in a page group have reached the point where they would call them. This patch should have no effect yet since all page groups currently have one request, but will come into play when pg_test functions are modified to split pages into sub-page regions. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 ++ fs/nfs/write.c | 32 ++++++++++++++++++++------------ include/linux/nfs_page.h | 2 ++ 3 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 18ee4e99347e..ceb4424614aa 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -397,6 +397,8 @@ static void nfs_free_request(struct nfs_page *req) WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags)); /* Release struct file and open context */ nfs_clear_request(req); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d0f30f12a8b3..5d752766139d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -201,12 +201,15 @@ static void nfs_set_page_writeback(struct page *page) } } -static void nfs_end_page_writeback(struct page *page) +static void nfs_end_page_writeback(struct nfs_page *req) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); - end_page_writeback(page); + if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + return; + + end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } @@ -397,15 +400,20 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *head; - spin_lock(&inode->i_lock); - if (likely(!PageSwapCache(req->wb_page))) { - set_page_private(req->wb_page, 0); - ClearPagePrivate(req->wb_page); - clear_bit(PG_MAPPED, &req->wb_flags); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + head = req->wb_head; + + spin_lock(&inode->i_lock); + if (likely(!PageSwapCache(head->wb_page))) { + set_page_private(head->wb_page, 0); + ClearPagePrivate(head->wb_page); + clear_bit(PG_MAPPED, &head->wb_flags); + } + nfsi->npages--; + spin_unlock(&inode->i_lock); } - nfsi->npages--; - spin_unlock(&inode->i_lock); nfs_release_request(req); } @@ -599,7 +607,7 @@ remove_req: nfs_inode_remove_request(req); next: nfs_unlock_request(req); - nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags); nfs_release_request(req); } @@ -964,7 +972,7 @@ static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); nfs_unlock_request(req); -
nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); nfs_release_request(req); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 6385175a127b..7d9096d95d4a 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -31,6 +31,8 @@ enum { PG_TEARDOWN, /* page group sync for destroy */ PG_UNLOCKPAGE, /* page group sync bit in read path */ PG_UPTODATE, /* page group sync bit in read path */ + PG_WB_END, /* page group sync bit in write path */ + PG_REMOVE, /* page group sync bit in write path */ }; struct nfs_inode; -- cgit v1.2.3-71-gd317 From 7f714720fac03383d687dbe39494cc96b845bd46 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:53 -0400 Subject: nfs: remove data list from pgio header Since the ability to split pages into subpage requests has been added, nfs_pgio_header->rpc_list only ever has one pgio data. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 39 ++++++--------------------------------- fs/nfs/pnfs.c | 41 +++++++++++++++-------------------------- include/linux/nfs_xdr.h | 5 +++-- 3 files changed, 24 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ec4311df05d9..fab78d13ee14 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -470,7 +470,6 @@ struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) struct nfs_pgio_header *hdr = &header->header; INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); spin_lock_init(&hdr->lock); atomic_set(&hdr->refcnt, 0); hdr->rw_ops = ops; @@ -648,27 +647,6 @@ out: } EXPORT_SYMBOL_GPL(nfs_initiate_pgio); -static int nfs_do_multiple_pgios(struct list_head *head, - const struct rpc_call_ops *call_ops, - int how) -{ - struct nfs_pgio_data *data; - int ret = 0; - - while (!list_empty(head)) { - int ret2; - - data = list_first_entry(head, struct nfs_pgio_data, list); - list_del_init(&data->list); - - ret2 = nfs_initiate_pgio(NFS_CLIENT(data->header->inode), - data, call_ops, how, 0); - if (ret == 0) - ret = ret2; - } - return ret; -} - /** * nfs_pgio_error - Clean up from a pageio error * @desc: IO descriptor @@ -677,14 +655,9 @@ static int nfs_do_multiple_pgios(struct list_head *head, static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - struct nfs_pgio_data *data; - set_bit(NFS_IOHDR_REDO, &hdr->flags); - while (!list_empty(&hdr->rpc_list)) { - data = list_first_entry(&hdr->rpc_list, struct nfs_pgio_data, list); - list_del(&data->list); - nfs_pgio_data_release(data); - } + nfs_pgio_data_release(hdr->data); + hdr->data = NULL; desc->pg_completion_ops->error_cleanup(&desc->pg_list); return -ENOMEM; } @@ -794,7 +767,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, /* Set up the argument struct */ nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); - list_add(&data->list, &hdr->rpc_list); + hdr->data = data; desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } @@ -816,9 +789,9 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) atomic_inc(&hdr->refcnt); ret = nfs_generic_pgio(desc, hdr); if (ret == 0) - ret = nfs_do_multiple_pgios(&hdr->rpc_list, - desc->pg_rpc_callops, - desc->pg_ioflags); + ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), + hdr->data, desc->pg_rpc_callops, + desc->pg_ioflags, 0); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 
354c53cd4095..6ef108b1d85f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1573,23 +1573,18 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata, } static void -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +pnfs_do_write(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, int how) { - struct nfs_pgio_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_pgio_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_write_through_mds(desc, data); - } + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); pnfs_put_lseg(lseg); } @@ -1623,7 +1618,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); + pnfs_do_write(desc, hdr, desc->pg_ioflags); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; @@ -1731,23 +1726,17 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata, } static void -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - struct nfs_pgio_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_pgio_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_read_through_mds(desc, data); - } + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); pnfs_put_lseg(lseg); } @@ -1782,7 +1771,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_reads(desc, &hdr->rpc_list); + pnfs_do_read(desc, hdr); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index adef7bd2d06d..ae636013fb1f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1256,11 +1256,13 @@ enum { NFS_IOHDR_NEED_RESCHED, }; +struct nfs_pgio_data; + struct nfs_pgio_header { struct inode *inode; struct rpc_cred *cred; struct list_head pages; - struct list_head rpc_list; + struct nfs_pgio_data *data; atomic_t refcnt; struct nfs_page *req; struct nfs_writeverf verf; /* Used for writes */ @@ -1282,7 +1284,6 @@ struct nfs_pgio_header { struct nfs_pgio_data { struct nfs_pgio_header *header; - struct list_head list; struct rpc_task task; struct nfs_fattr fattr; struct nfs_writeverf verf; /* Used for writes */ -- cgit v1.2.3-71-gd317 From 5002c58639d41b93e800c8a4b7eca49c40d57822 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 15 May 2014 11:56:54 -0400 Subject: pnfs: support multiple verfs per direct 
req Support direct requests that span multiple pnfs data servers by comparing nfs_pgio_header->verf to a cached verf in pnfs_commit_bucket. Continue to use dreq->verf if the MDS is used / non-pNFS. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 102 +++++++++++++++++++++++++++++++++++++++++++++--- fs/nfs/nfs4filelayout.c | 6 +++ include/linux/nfs.h | 5 ++- include/linux/nfs_xdr.h | 2 + 4 files changed, 109 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2c0e08f4cf71..4ad7bc388679 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -108,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @ds_idx - index of data server in data server list, only valid if ds_clp set + * + * returns the correct verifier to use given the role of the server + */ +static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, + struct nfs_client *ds_clp, + int ds_idx) +{ + struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 + if (ds_clp) { + /* pNFS is in use, use the DS verf */ + if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; + else + WARN_ON_ONCE(1); + } +#endif + return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + WARN_ON_ONCE(verfp->committed >= 0); + memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. 
+ * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + if (verfp->committed < 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + return 0; + } + return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, + struct nfs_commit_data *data) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, data->ds_clp, + data->ds_commit_index); + WARN_ON_ONCE(verfp->committed < 0); + return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} +#endif + /** * nfs_direct_IO - NFS address space operation for direct I/O * @rw: direction (read or write) @@ -168,6 +259,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); + dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); @@ -602,7 +694,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) dprintk("NFS: %5u commit failed with error %d.\n", data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } @@ -811,13 +903,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, &hdr->verf, - sizeof(dreq->verf)); + nfs_direct_set_hdr_verf(dreq, hdr); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, &hdr->verf, sizeof(dreq->verf))) { - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { + dreq->flags = + NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else bit = NFS_IOHDR_NEED_COMMIT; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 7a665e0f35b7..0ebc521ea6fc 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -560,6 +560,7 @@ filelayout_read_pagelist(struct nfs_pgio_data *data) /* No multipath support. 
Use first DS */ atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; @@ -603,6 +604,7 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) data->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; @@ -875,6 +877,8 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, for (i = 0; i < size; i++) { INIT_LIST_HEAD(&buckets[i].written); INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } spin_lock(cinfo->lock); @@ -885,6 +889,8 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, &buckets[i].written); list_splice(&cinfo->ds->buckets[i].committing, &buckets[i].committing); + buckets[i].direct_verf.committed = + cinfo->ds->buckets[i].direct_verf.committed; buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; buckets[i].clseg = cinfo->ds->buckets[i].clseg; } diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 3e794c12e90a..610af5155ef2 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -46,6 +46,9 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc enum nfs3_stable_how { NFS_UNSTABLE = 0, NFS_DATA_SYNC = 1, - NFS_FILE_SYNC = 2 + NFS_FILE_SYNC = 2, + + /* used by direct.c to mark verf as invalid */ + NFS_INVALID_STABLE_HOW = -1 }; #endif /* _LINUX_NFS_H */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ae636013fb1f..9a1396e70310 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1112,6 +1112,7 @@ struct pnfs_commit_bucket { struct list_head committing; struct pnfs_layout_segment *wlseg; struct pnfs_layout_segment *clseg; + struct nfs_writeverf direct_verf; }; struct pnfs_ds_commit_info { @@ -1294,6 +1295,7 @@ struct nfs_pgio_data { __u64 mds_offset; /* Filelayout dense stripe */ struct nfs_page_array pages; struct nfs_client *ds_clp; /* pNFS data server */ + int ds_idx; /* ds index if ds_clp is set */ }; struct nfs_rw_header { -- cgit v1.2.3-71-gd317 From 86f6cf41272de9d6ffa05ab46028b15d160a6f3e Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sat, 24 May 2014 09:34:26 +0200 Subject: net: of_mdio: add of_mdiobus_link_phydev() Add a function to walk the list of subnodes of a mdio bus and look for a node that matches the phy's address with its 'reg' property. If found, set the of_node pointer for the phy. This allows auto-probed phy devices to be augmented by information passed in via DT. Signed-off-by: Daniel Mack Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 6 ++++++ drivers/of/of_mdio.c | 33 +++++++++++++++++++++++++++++++++ include/linux/of_mdio.h | 8 ++++++++ 3 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index a6284964b711..2e58aa54484c 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -300,6 +300,12 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr) if (IS_ERR(phydev) || phydev == NULL) return phydev; + /* + * For DT, see if the auto-probed phy has a corresponding child + * in the bus node, and set the of_node pointer in this case.
+ */ + of_mdiobus_link_phydev(bus, phydev); + err = phy_device_register(phydev); if (err) { phy_device_free(phydev); diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 731d3d9052d7..7c8c142e4eb8 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -183,6 +183,39 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) } EXPORT_SYMBOL(of_mdiobus_register); +/** + * of_mdiobus_link_phydev - Find a device node for a phy + * @mdio: pointer to mii_bus structure + * @phydev: phydev for which the of_node pointer should be set + * + * Walk the list of subnodes of a mdio bus and look for a node that matches the + * phy's address with its 'reg' property. If found, set the of_node pointer for + * the phy. This allows auto-probed phy devices to be supplied with information + * passed in via DT. + */ +void of_mdiobus_link_phydev(struct mii_bus *mdio, + struct phy_device *phydev) +{ + struct device *dev = &phydev->dev; + struct device_node *child; + + if (dev->of_node || !mdio->dev.of_node) + return; + + for_each_available_child_of_node(mdio->dev.of_node, child) { + int addr; + + addr = of_mdio_parse_addr(&mdio->dev, child); + if (addr < 0) + continue; + + if (addr == phydev->addr) { + dev->of_node = child; + return; + } + } +} + /* Helper function for of_phy_find_device */ static int of_phy_match(struct device *dev, void *phy_np) { diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index d449018d0726..a70c9493d55a 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -25,6 +25,9 @@ struct phy_device *of_phy_attach(struct net_device *dev, extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); +extern void of_mdiobus_link_phydev(struct mii_bus *mdio, + struct phy_device *phydev); + #else /* CONFIG_OF */ static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) { @@ -60,6 +63,11 @@ static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np) { return NULL; } + +static inline void of_mdiobus_link_phydev(struct mii_bus *mdio, + struct phy_device *phydev) +{ +} #endif /* CONFIG_OF */ #if defined(CONFIG_OF) && defined(CONFIG_FIXED_PHY) -- cgit v1.2.3-71-gd317 From 97982f5a91e91dab26dd0246083b9adf3ba8b2e3 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:02 +0300 Subject: IB/mlx4: Preparation for VFs to issue/receive SMI (QP0) requests/responses Currently, VFs in SRIOV mode are denied QP0 access. The main reason for this decision is security, since Subnet Management Datagrams (SMPs) are not restricted by network partitioning and may affect the physical network topology. Moreover, even the SM may be denied access from portions of the network by setting management keys unknown to the SM. However, it is desirable to grant SMI access to certain privileged VFs, so that certain network management activities may be conducted within virtual machines instead of the hypervisor. This commit does the following: 1. Create QP0 tunnel QPs for all VFs. 2. Discard SMI mads sent-from/received-for non-privileged VFs in the hypervisor MAD multiplex/demultiplex logic. SMI mads from/for privileged VFs are allowed to pass. 3. MAD_IFC wrapper changes/fixes. For non-privileged VFs, only host-view MAD_IFC commands are allowed, and only for SMI LID-Routed GET mads. For privileged VFs, there are no restrictions. This commit does not allow privileged VFs as yet. To determine if a VF is privileged, it calls function mlx4_vf_smi_enabled(). This function returns 0 unconditionally for now.
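To make the new demux gating concrete: the rule reduces to a small predicate over the MAD's management class and method plus the slave's privilege. Below is a minimal standalone C model of that rule, a sketch only; the struct, constants, and return codes are simplified stand-ins for the kernel types, and mlx4_vf_smi_enabled() is stubbed to return 0 exactly as in this commit.

#include <stdio.h>

/* Simplified stand-ins for the kernel types; illustration only. */
#define IB_MGMT_CLASS_SUBN_LID_ROUTED     0x01
#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81
#define IB_MGMT_METHOD_RESP               0x80

struct mad_hdr { unsigned char mgmt_class; unsigned char method; };

/* As in this commit: no VF is privileged yet. */
static int mlx4_vf_smi_enabled(int slave, int port)
{
	(void)slave; (void)port;
	return 0;
}

/* Model of the demux rule: SMI classes pass for the hypervisor (slave
 * 255 or the master); a non-privileged VF is rejected outright; even a
 * privileged VF may only pass responses (unsolicited MADs are dropped). */
static int demux_smi_check(int slave, int master, int port,
			   const struct mad_hdr *mad)
{
	if (mad->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
	    mad->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
		return 0;	/* not an SMI class: no gating here */
	if (slave == 255 || slave == master)
		return 0;
	if (!mlx4_vf_smi_enabled(slave, port))
		return -1;	/* -EPERM in the kernel */
	if (!(mad->method & IB_MGMT_METHOD_RESP))
		return -2;	/* unsolicited MAD from a VF: -EINVAL */
	return 0;
}

int main(void)
{
	struct mad_hdr get = { IB_MGMT_CLASS_SUBN_LID_ROUTED, 0x01 /* Get */ };

	printf("master: %d, VF 3: %d\n",
	       demux_smi_check(0, 0, 1, &get),	/* 0: allowed */
	       demux_smi_check(3, 0, 1, &get));	/* -1: rejected */
	return 0;
}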
The next two commits allow defining and activating privileged VFs. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/mad.c | 40 ++++++++++++++++----------- drivers/infiniband/hw/mlx4/qp.c | 16 +++++------ drivers/net/ethernet/mellanox/mlx4/cmd.c | 47 ++++++++++++++++++++++---------- include/linux/mlx4/device.h | 1 + 4 files changed, 66 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index fd36ec672632..287ad0564acd 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -478,10 +478,6 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (!dest_qpt) tun_qp = &tun_ctx->qp[0]; else @@ -667,6 +663,21 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, } /* Class-specific handling */ switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_demux_sa_handler(ibdev, port, slave, (struct ib_sa_mad *) mad)) @@ -1165,10 +1176,6 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) return -EAGAIN; - /* QP0 forwarding only for Dom0 */ - if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave)) - return -EINVAL; - if (dest_qpt == IB_QPT_SMI) { src_qpnum = 0; sqp = &sqp_ctx->qp[0]; @@ -1285,11 +1292,6 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc "belongs to another slave\n", wc->src_qp); return; } - if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) { - mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " - "non-master trying to send QP0 packets\n", wc->src_qp); - return; - } /* Map transaction ID */ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, @@ -1317,6 +1319,12 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc /* Class-specific handling */ switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; case IB_MGMT_CLASS_SUBN_ADM: if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, (struct ib_sa_mad *) &tunnel->mad)) @@ -1749,9 +1757,9 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port, return -EEXIST; ctx->state = DEMUX_PV_STATE_STARTING; - /* have QP0 only on port owner, and only if link layer is IB */ - if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) && - rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, 
ctx->port) == + IB_LINK_LAYER_INFINIBAND) ctx->has_smi = 1; if (ctx->has_smi) { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 41308af4163c..2e8c58806e2f 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2370,7 +2370,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, struct mlx4_wqe_datagram_seg *dseg, - struct ib_send_wr *wr, enum ib_qp_type qpt) + struct ib_send_wr *wr, + enum mlx4_ib_qp_type qpt) { union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; struct mlx4_av sqp_av = {0}; @@ -2383,8 +2384,10 @@ static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, cpu_to_be32(0xf0000000); memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); - /* This function used only for sending on QP1 proxies */ - dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); /* Use QKEY from the QP context, which is set by master */ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); } @@ -2700,16 +2703,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += seglen / 16; break; case MLX4_IB_QPT_PROXY_SMI: - /* don't allow QP0 sends on guests */ - err = -ENOSYS; - *bad_wr = wr; - goto out; case MLX4_IB_QPT_PROXY_GSI: /* If we are tunneling special qps, this is a UD qp. * In this case we first add a UD segment targeting * the tunnel qp, and then add a header with address * information */ - set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, + qp->mlx4_ib_qp_type); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; build_tunnel_header(wr, wqe, &seglen); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 78099eab7673..b0e48d426fce 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -705,20 +705,28 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, struct ib_smp *smp = inbox->buf; u32 index; u8 port; + u8 opcode_modifier; u16 *table; int err; int vidx, pidx; + int network_view; struct mlx4_priv *priv = mlx4_priv(dev); struct ib_smp *outsmp = outbox->buf; __be16 *outtab = (__be16 *)(outsmp->data); __be32 slave_cap_mask; __be64 slave_node_guid; + port = vhcr->in_modifier; + /* network-view bit is for driver use only, and should not be passed to FW */ + opcode_modifier = vhcr->op_modifier & ~0x8; /* clear netw view bit */ + network_view = !!(vhcr->op_modifier & 0x8); + if (smp->base_version == 1 && smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && smp->class_version == 1) { - if (smp->method == IB_MGMT_METHOD_GET) { + /* host view is paravirtualized */ + if (!network_view && smp->method == IB_MGMT_METHOD_GET) { if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) { index = be32_to_cpu(smp->attr_mod); if (port < 1 || port > dev->caps.num_ports) @@ -743,7 +751,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, /*get the slave specific caps:*/ /*do the command */ err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); /* modify the response for slaves */ if (!err && slave != mlx4_master_func_num(dev)) { @@ 
-760,7 +768,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, smp->attr_mod = cpu_to_be32(slave / 8); /* execute cmd */ err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (!err) { /* if needed, move slave gid to index 0 */ @@ -774,7 +782,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, } if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (!err) { slave_node_guid = mlx4_get_slave_node_guid(dev, slave); @@ -784,19 +792,24 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, } } } + + /* Non-privileged VFs are only allowed "host" view LID-routed 'Get' MADs. + * These are the MADs used by ib verbs (such as ib_query_gids). + */ if (slave != mlx4_master_func_num(dev) && - ((smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) || - (smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && - smp->method == IB_MGMT_METHOD_SET))) { - mlx4_err(dev, "slave %d is trying to execute a Subnet MGMT MAD, " - "class 0x%x, method 0x%x for attr 0x%x. Rejecting\n", - slave, smp->method, smp->mgmt_class, - be16_to_cpu(smp->attr_id)); - return -EPERM; + !mlx4_vf_smi_enabled(dev, slave, port)) { + if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->method == IB_MGMT_METHOD_GET) || network_view) { + mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", + slave, smp->method, smp->mgmt_class, + network_view ? "Network" : "Host", + be16_to_cpu(smp->attr_id)); + return -EPERM; + } } - /*default:*/ + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } @@ -2537,3 +2550,9 @@ int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_stat return 0; } EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state); + +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) +{ + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ba87bd21295a..83612faa819c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1234,4 +1234,5 @@ int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); #endif /* MLX4_DEVICE_H */ -- cgit v1.2.3-71-gd317 From 99ec41d0a48cb6d14af25765f9449762f9d101f6 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:03 +0300 Subject: mlx4: Add infrastructure for selecting VFs to enable QP0 via MLX proxy QPs This commit adds the infrastructure for enabling selected VFs to operate SMI (QP0) MADs without restriction. Additionally, for these enabled VFs, their QP0 proxy and tunnel QPs are MLX QPs. As such, they operate over VL15. Therefore, they are not affected by "credit" problems or changes in the VLArb table (which may shut down VL0). Non-enabled VFs may only create UD proxy QP0 qps (which are forced by the hypervisor to send packets using the q-key it assigns and places in the qp-context). 
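The test this commit introduces for that distinction, qp0_enabled_vf(), simply scans the per-port proxy-QP0 table and treats a provisioned q-key as the enable flag. A minimal standalone C model of the lookup is sketched below; the table contents are hypothetical values chosen for illustration.

#include <stdio.h>

#define NUM_PORTS 2

/* Hypothetical per-port tables mirroring dev->caps in this commit:
 * qp0_proxy[] holds the proxy QP0 numbers; qp0_qkey[] is non-zero only
 * where the hypervisor has provisioned a q-key, i.e. enabled the VF. */
static const unsigned int qp0_proxy[NUM_PORTS] = { 0x40, 0x42 };
static const unsigned int qp0_qkey[NUM_PORTS]  = { 0x80010000, 0 };

/* Model of qp0_enabled_vf(): a proxy QP0 is "enabled" iff a q-key was
 * provisioned for its port. */
static int qp0_enabled_vf(unsigned int qpn)
{
	int i;

	for (i = 0; i < NUM_PORTS; i++)
		if (qpn == qp0_proxy[i])
			return qp0_qkey[i] != 0;
	return 0;
}

int main(void)
{
	printf("qpn 0x40 enabled: %d\n", qp0_enabled_vf(0x40)); /* 1 */
	printf("qpn 0x42 enabled: %d\n", qp0_enabled_vf(0x42)); /* 0 */
	return 0;
}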
Thus, non-enabled VFs will not pose a security risk. The hypervisor discards any privileged MADs it receives from these non-enabled VFs. By default, all VFs are NOT enabled, and must explicitly be enabled by the administrator. The sysfs interface which operates the VF enablement infrastructure is provided in the next commit. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/qp.c | 52 +++++++++++++++++----- drivers/net/ethernet/mellanox/mlx4/cmd.c | 13 +++++- drivers/net/ethernet/mellanox/mlx4/fw.c | 44 ++++++++++++------ drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 16 +++++-- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 8 ++++ .../net/ethernet/mellanox/mlx4/resource_tracker.c | 26 ++++++++--- include/linux/mlx4/device.h | 1 + 8 files changed, 126 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 2e8c58806e2f..b25600997cb6 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -608,6 +608,16 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return !attr->srq; } +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i]) + return !!dev->caps.qp0_qkey[i]; + } + return 0; +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) @@ -625,10 +635,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { if (init_attr->qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_PROXY_GSI; - else if (mlx4_is_master(dev->dev)) - qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; - else - qp_type = MLX4_IB_QPT_PROXY_SMI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } } qpn = sqpn; /* add extra sg entry for tunneling */ @@ -643,7 +656,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, return -EINVAL; if (tnl_init->proxy_qp_type == IB_QPT_GSI) qp_type = MLX4_IB_QPT_TUN_GSI; - else if (tnl_init->slave == mlx4_master_func_num(dev->dev)) + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; else qp_type = MLX4_IB_QPT_TUN_SMI; @@ -1930,6 +1945,19 @@ out: return err; } +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i] || + qpn == dev->caps.qp0_tunnel[i]) { + *qkey = dev->caps.qp0_qkey[i]; + return 0; + } + } + return -EINVAL; +} + static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) @@ -1987,8 +2015,13 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); - if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) - return -EINVAL; + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } 
sqp->ud_header.deth.qkey = cpu_to_be32(qkey); sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); @@ -2682,11 +2715,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case MLX4_IB_QPT_PROXY_SMI_OWNER: - if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) { - err = -ENOSYS; - *bad_wr = wr; - goto out; - } err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index b0e48d426fce..26c3ebaa49d1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1666,6 +1666,8 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) for (port = min_port; port <= max_port; port++) { if (!test_bit(port - 1, actv_ports.ports)) continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + priv->mfunc.master.vf_admin[slave].enable_smi[port]; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; vp_oper->state = *vp_admin; @@ -1717,6 +1719,8 @@ static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave for (port = min_port; port <= max_port; port++) { if (!test_bit(port - 1, actv_ports.ports)) continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + MLX4_VF_SMI_DISABLED; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; if (NO_INDX != vp_oper->vlan_idx) { __mlx4_unregister_vlan(&priv->dev, @@ -2553,6 +2557,13 @@ EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) { - return 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_oper[slave].smi_enabled[port] == + MLX4_VF_SMI_ENABLED; } EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index ef242e19766f..01e6dd61ee3c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -178,8 +178,8 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); - u8 field; - u32 size; + u8 field, port; + u32 size, proxy_qp, qkey; int err = 0; #define QUERY_FUNC_CAP_FLAGS_OFFSET 0x0 @@ -209,6 +209,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, /* when opcode modifier = 1 */ #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET 0x3 +#define QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET 0x4 #define QUERY_FUNC_CAP_FLAGS0_OFFSET 0x8 #define QUERY_FUNC_CAP_FLAGS1_OFFSET 0xc @@ -221,6 +222,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_FLAGS1_FORCE_MAC 0x40 #define QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN 0x80 #define QUERY_FUNC_CAP_FLAGS1_NIC_INFO 0x10 +#define QUERY_FUNC_CAP_VF_ENABLE_QP0 0x08 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 @@ -234,28 +236,35 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, return -EINVAL; vhcr->in_modifier = converted_port; - /* Set nic_info bit to mark new fields support */ - field = QUERY_FUNC_CAP_FLAGS1_NIC_INFO; - MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); - /* phys-port = logical-port */ field = vhcr->in_modifier - find_first_bit(actv_ports.ports, dev->caps.num_ports); MLX4_PUT(outbox->buf, field, 
QUERY_FUNC_CAP_PHYS_PORT_OFFSET); - field = vhcr->in_modifier; + port = vhcr->in_modifier; + proxy_qp = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1; + + /* Set nic_info bit to mark new fields support */ + field = QUERY_FUNC_CAP_FLAGS1_NIC_INFO; + + if (mlx4_vf_smi_enabled(dev, slave, port) && + !mlx4_get_parav_qkey(dev, proxy_qp, &qkey)) { + field |= QUERY_FUNC_CAP_VF_ENABLE_QP0; + MLX4_PUT(outbox->buf, qkey, + QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + } + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); + /* size is now the QP number */ - size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + field - 1; + size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL); size += 2; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL); - size = dev->phys_caps.base_proxy_sqpn + 8 * slave + field - 1; - MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_PROXY); - - size += 2; - MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_PROXY); + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP0_PROXY); + proxy_qp += 2; + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP1_PROXY); MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); @@ -326,7 +335,7 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u8 field, op_modifier; - u32 size; + u32 size, qkey; int err = 0, quotas = 0; op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ @@ -442,6 +451,13 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, goto out; } + if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) { + MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + func_cap->qp0_qkey = qkey; + } else { + func_cap->qp0_qkey = 0; + } + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); func_cap->qp0_tunnel_qpn = size & 0xFFFFFF; diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 6811ee00ba7c..1fce03ebe5c4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -134,6 +134,7 @@ struct mlx4_func_cap { int max_eq; int reserved_eq; int mcg_quota; + u32 qp0_qkey; u32 qp0_tunnel_qpn; u32 qp0_proxy_qpn; u32 qp1_tunnel_qpn; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 12a7ee2e6098..908326876ab5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -666,13 +666,15 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) return -ENODEV; } + dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || - !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { + !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy || + !dev->caps.qp0_qkey) { err = -ENOMEM; goto err_mem; } @@ -684,6 +686,7 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) " port %d, aborting (%d).\n", i, err); goto err_mem; } + dev->caps.qp0_qkey[i - 1] = func_cap.qp0_qkey; dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; @@ 
-729,12 +732,16 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) return 0; err_mem: + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); - dev->caps.qp0_tunnel = dev->caps.qp0_proxy = - dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + dev->caps.qp0_qkey = NULL; + dev->caps.qp0_tunnel = NULL; + dev->caps.qp0_proxy = NULL; + dev->caps.qp1_tunnel = NULL; + dev->caps.qp1_proxy = NULL; return err; } @@ -1697,6 +1704,7 @@ unmap_bf: unmap_bf_area(dev); if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); @@ -2573,6 +2581,7 @@ err_master_mfunc: mlx4_multi_func_cleanup(dev); if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); @@ -2702,6 +2711,7 @@ static void __mlx4_remove_one(struct pci_dev *pdev) if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index f9c465101963..0efd1738fcb3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -133,6 +133,11 @@ enum { MLX4_COMM_CMD_FLR = 254 }; +enum { + MLX4_VF_SMI_DISABLED, + MLX4_VF_SMI_ENABLED +}; + /*The flag indicates that the slave should delay the RESET cmd*/ #define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb /*indicates how many retries will be done if we are in the middle of FLR*/ @@ -488,6 +493,7 @@ struct mlx4_vport_state { struct mlx4_vf_admin_state { struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1]; + u8 enable_smi[MLX4_MAX_PORTS + 1]; }; struct mlx4_vport_oper_state { @@ -495,8 +501,10 @@ struct mlx4_vport_oper_state { int mac_idx; int vlan_idx; }; + struct mlx4_vf_oper_state { struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1]; + u8 smi_enabled[MLX4_MAX_PORTS + 1]; }; struct slave_list { diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 1c3fdd4a1f7d..ad98162a8d79 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2827,10 +2827,12 @@ static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, } static int verify_qp_parameters(struct mlx4_dev *dev, + struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, enum qp_transition transition, u8 slave) { u32 qp_type; + u32 qpn; struct mlx4_qp_context *qp_ctx; enum mlx4_qp_optpar optpar; int port; @@ -2870,6 +2872,20 @@ static int verify_qp_parameters(struct mlx4_dev *dev, return -EINVAL; } break; + case MLX4_QP_ST_MLX: + qpn = vhcr->in_modifier & 0x7fffff; + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (transition == QP_TRANS_INIT2RTR && + slave != mlx4_master_func_num(dev) && + mlx4_is_qp_reserved(dev, qpn) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + /* only enabled VFs may create MLX proxy QPs */ + mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n", + __func__, slave, port); + return -EPERM; + } + break; + default: break; } @@ -3454,7 +3470,7 @@ int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, qpc, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_INIT2RTR, 
slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_INIT2RTR, slave); if (err) return err; @@ -3508,7 +3524,7 @@ int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_RTR2RTS, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTR2RTS, slave); if (err) return err; @@ -3530,7 +3546,7 @@ int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_RTS2RTS, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTS2RTS, slave); if (err) return err; @@ -3567,7 +3583,7 @@ int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2SQD, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2SQD, slave); if (err) return err; @@ -3589,7 +3605,7 @@ int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, err = adjust_qp_sched_queue(dev, slave, context, inbox); if (err) return err; - err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2RTS, slave); + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2RTS, slave); if (err) return err; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 83612faa819c..e2fc7011314c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -401,6 +401,7 @@ struct mlx4_caps { int max_rq_desc_sz; int max_qp_init_rdma; int max_qp_dest_rdma; + u32 *qp0_qkey; u32 *qp0_proxy; u32 *qp1_proxy; u32 *qp0_tunnel; -- cgit v1.2.3-71-gd317 From 65fed8a8c155271cf647651bd62eecb5928ae3a4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 May 2014 16:31:04 +0300 Subject: IB/mlx4: Add interface for selecting VFs to enable QP0 via MLX proxy QPs This commit adds the sysfs interface for enabling QP0 on VFs for selected VF/port. By default, no VFs are enabled for QP0 operation. To enable QP0 operation on a VF/port, under /sys/class/infiniband/mlx4_x/iov//ports/x there are two new entries: - smi_enabled (read-only). Indicates whether smi is currently enabled for the indicated VF/port - enable_smi_admin (rw). Used by the admin to request that smi capability be enabled or disabled for the indicated VF/port. 0 = disable, 1 = enable. The requested enablement will occur at the next reset of the VF (e.g. driver restart on the VM which owns the VF). 
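As a usage illustration only, a userspace sketch that reads the operational state and requests enablement could look like the following; the adapter name mlx4_0 and the <vf> path component are hypothetical placeholders for the real per-VF directory under iov/.

#include <stdio.h>

/* Hypothetical example paths; substitute the real adapter and VF names. */
#define SMI_STATE "/sys/class/infiniband/mlx4_0/iov/<vf>/ports/1/smi_enabled"
#define SMI_ADMIN "/sys/class/infiniband/mlx4_0/iov/<vf>/ports/1/enable_smi_admin"

int main(void)
{
	char state[8] = "";
	FILE *f = fopen(SMI_STATE, "r");

	if (f) {			/* current operational state */
		if (fgets(state, sizeof(state), f))
			printf("smi_enabled = %s", state);
		fclose(f);
	}

	f = fopen(SMI_ADMIN, "w");	/* takes effect at the next VF reset */
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}
	return 0;
}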
Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/sysfs.c | 105 ++++++++++++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/cmd.c | 34 ++++++++++ include/linux/mlx4/device.h | 3 + 3 files changed, 141 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index 5a38e43eca65..cb4c66e723b5 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -389,8 +389,10 @@ struct mlx4_port { struct mlx4_ib_dev *dev; struct attribute_group pkey_group; struct attribute_group gid_group; - u8 port_num; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; int slave; + u8 port_num; }; @@ -558,6 +560,101 @@ err: return NULL; } +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + static int add_port(struct mlx4_ib_dev *dev, int port_num, int 
slave) { struct mlx4_port *p; @@ -602,6 +699,10 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) if (ret) goto err_free_gid; + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); return 0; @@ -669,6 +770,7 @@ err_add: mport = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &mport->pkey_group); sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); kobject_put(p); } kobject_put(dev->dev_ports_parent[slave]); @@ -713,6 +815,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev *device) port = container_of(p, struct mlx4_port, kobj); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); kobject_put(p); kobject_put(device->dev_ports_parent[slave]); } diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 26c3ebaa49d1..3370ecb8c3d2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2567,3 +2567,37 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) MLX4_VF_SMI_ENABLED; } EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); + +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 1; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_admin[slave].enable_smi[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_get_enable_smi_admin); + +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enabled) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 0; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS || + enabled < 0 || enabled > 1) + return -EINVAL; + + priv->mfunc.master.vf_admin[slave].enable_smi[port] = enabled; + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_vf_set_enable_smi_admin); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index e2fc7011314c..dcabe11f37f7 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1236,4 +1236,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enable); #endif /* MLX4_DEVICE_H */ -- cgit v1.2.3-71-gd317 From 3e19ce762b537dd9aeefdd0849ba5f2f01ff83cf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 25 Feb 2014 17:44:21 -0500 Subject: rpc: xdr_truncate_encode This will be used in the server side in a few cases: - when certain operations (read, readdir, readlink) fail after encoding a partial response. - when we run out of space after encoding a partial response. - in readlink, where we initially reserve PAGE_SIZE bytes for data, then truncate to the actual size. Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/xdr.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 15f9204ee70b..e7bb2e3bd0fb 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -215,6 +215,7 @@ typedef int (*kxdrdproc_t)(void *rqstp, struct xdr_stream *xdr, void *obj); extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); +extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index dd97ba3c4456..352f3b35bbe5 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -508,6 +508,72 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) } EXPORT_SYMBOL_GPL(xdr_reserve_space); +/** + * xdr_truncate_encode - truncate an encode buffer + * @xdr: pointer to xdr_stream + * @len: new length of buffer + * + * Truncates the xdr stream, so that xdr->buf->len == len, + * and xdr->p points at offset len from the start of the buffer, and + * head, tail, and page lengths are adjusted to correspond. + * + * If this means moving xdr->p to a different buffer, we assume that + * that the end pointer should be set to the end of the current page, + * except in the case of the head buffer when we assume the head + * buffer's current length represents the end of the available buffer. + * + * This is *not* safe to use on a buffer that already has inlined page + * cache pages (as in a zero-copy server read reply), except for the + * simple case of truncating from one position in the tail to another. + * + */ +void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + int fraglen; + int new, old; + + if (len > buf->len) { + WARN_ON_ONCE(1); + return; + } + + fraglen = min_t(int, buf->len - len, tail->iov_len); + tail->iov_len -= fraglen; + buf->len -= fraglen; + if (tail->iov_len && buf->len == len) { + xdr->p = tail->iov_base + tail->iov_len; + /* xdr->end, xdr->iov should be set already */ + return; + } + WARN_ON_ONCE(fraglen); + fraglen = min_t(int, buf->len - len, buf->page_len); + buf->page_len -= fraglen; + buf->len -= fraglen; + + new = buf->page_base + buf->page_len; + old = new + fraglen; + xdr->page_ptr -= (old >> PAGE_SHIFT) - (new >> PAGE_SHIFT); + + if (buf->page_len && buf->len == len) { + xdr->p = page_address(*xdr->page_ptr); + xdr->end = (void *)xdr->p + PAGE_SIZE; + xdr->p = (void *)xdr->p + (new % PAGE_SIZE); + /* xdr->iov should already be NULL */ + return; + } + if (fraglen) + xdr->end = head->iov_base + head->iov_len; + /* (otherwise assume xdr->end is already set) */ + head->iov_len = len; + buf->len = len; + xdr->p = head->iov_base + head->iov_len; + xdr->iov = buf->head; +} +EXPORT_SYMBOL(xdr_truncate_encode); + /** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending * @xdr: pointer to xdr_stream -- cgit v1.2.3-71-gd317 From 2825a7f90753012babe7ee292f4a1eadd3706f92 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Mon, 26 Aug 2013 16:04:46 -0400 Subject: nfsd4: allow encoding across page boundaries After this we can handle for example getattr of very large ACLs. Read, readdir, readlink are still special cases with their own limits. Also we can't handle a new operation starting close to the end of a page. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 4 +++ fs/nfsd/nfs4xdr.c | 60 +++++++++++++++++++++++++---------- include/linux/sunrpc/svc.h | 1 + include/linux/sunrpc/xdr.h | 1 + net/sunrpc/svc_xprt.c | 1 + net/sunrpc/xdr.c | 78 ++++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 126 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3ce431b9b577..5d8f9158701d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1264,6 +1264,10 @@ static void svcxdr_init_encode(struct svc_rqst *rqstp, xdr->end = head->iov_base + PAGE_SIZE - 2 * RPC_MAX_AUTH_SIZE; /* Tail and page_len should be zero at this point: */ buf->len = buf->head[0].iov_len; + xdr->scratch.iov_len = 0; + xdr->page_ptr = buf->pages; + buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) + - 2 * RPC_MAX_AUTH_SIZE; } /* diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index bd529e523087..d3a576dbd99b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1624,6 +1624,7 @@ static int nfsd4_max_reply(u32 opnum) * the head and tail in another page: */ return 2 * PAGE_SIZE; + case OP_GETATTR: case OP_READ: return INT_MAX; default: @@ -2560,21 +2561,31 @@ out_resource: goto out; } +static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr, + struct xdr_buf *buf, __be32 *p, int bytes) +{ + xdr->scratch.iov_len = 0; + memset(buf, 0, sizeof(struct xdr_buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = 0; + buf->len = 0; + xdr->buf = buf; + xdr->iov = buf->head; + xdr->p = p; + xdr->end = (void *)p + bytes; + buf->buflen = bytes; +} + __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, u32 *bmval, struct svc_rqst *rqstp, int ignore_crossmnt) { - struct xdr_buf dummy = { - .head[0] = { - .iov_base = *p, - }, - .buflen = words << 2, - }; + struct xdr_buf dummy; struct xdr_stream xdr; __be32 ret; - xdr_init_encode(&xdr, &dummy, NULL); + svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, ignore_crossmnt); *p = xdr.p; @@ -3064,8 +3075,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, if (nfserr) return nfserr; - if (resp->xdr.buf->page_len) - return nfserr_resource; p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) @@ -3075,6 +3084,9 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, if (xdr->end - xdr->p < 1) return nfserr_resource; + if (resp->xdr.buf->page_len) + return nfserr_resource; + maxcount = svc_max_payload(resp->rqstp); if (maxcount > read->rd_length) maxcount = read->rd_length; @@ -3119,6 +3131,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - (char *)resp->xdr.buf->head[0].iov_base); resp->xdr.buf->page_len = maxcount; xdr->buf->len += maxcount; + xdr->page_ptr += v; + xdr->buf->buflen = maxcount + PAGE_SIZE - 2 * RPC_MAX_AUTH_SIZE; xdr->iov = xdr->buf->tail; /* Use rest of head for padding and remaining ops: */ @@ -3145,6 +3159,11 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd if (nfserr) return nfserr; + + p = xdr_reserve_space(xdr, 4); + 
if (!p) + return nfserr_resource; + if (resp->xdr.buf->page_len) return nfserr_resource; if (!*resp->rqstp->rq_next_page) @@ -3154,10 +3173,6 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd maxcount = PAGE_SIZE; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - if (xdr->end - xdr->p < 1) return nfserr_resource; @@ -3180,6 +3195,8 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd - (char *)resp->xdr.buf->head[0].iov_base; resp->xdr.buf->page_len = maxcount; xdr->buf->len += maxcount; + xdr->page_ptr += 1; + xdr->buf->buflen -= PAGE_SIZE; xdr->iov = xdr->buf->tail; /* Use rest of head for padding and remaining ops: */ @@ -3206,15 +3223,16 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 if (nfserr) return nfserr; - if (resp->xdr.buf->page_len) - return nfserr_resource; - if (!*resp->rqstp->rq_next_page) - return nfserr_resource; p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); if (!p) return nfserr_resource; + if (resp->xdr.buf->page_len) + return nfserr_resource; + if (!*resp->rqstp->rq_next_page) + return nfserr_resource; + /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ WRITE32(0); WRITE32(0); @@ -3266,6 +3284,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 xdr->iov = xdr->buf->tail; + xdr->page_ptr++; + xdr->buf->buflen -= PAGE_SIZE; + xdr->iov = xdr->buf->tail; + /* Use rest of head for padding and remaining ops: */ resp->xdr.buf->tail[0].iov_base = tailbase; resp->xdr.buf->tail[0].iov_len = 0; @@ -3800,6 +3822,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) !nfsd4_enc_ops[op->opnum]); encoder = nfsd4_enc_ops[op->opnum]; op->status = encoder(resp, op->status, &op->u); + xdr_commit_encode(xdr); + /* nfsd4_check_resp_size guarantees enough room for error status */ if (!op->status) { int space_needed = 0; @@ -3919,6 +3943,8 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len); + rqstp->rq_next_page = resp->xdr.page_ptr + 1; + p = resp->tagp; *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a0dbbd1e00e9..85cb6472a423 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -244,6 +244,7 @@ struct svc_rqst { struct page * rq_pages[RPCSVC_MAXPAGES]; struct page * *rq_respages; /* points into rq_pages */ struct page * *rq_next_page; /* next reply page to use */ + struct page * *rq_page_end; /* one past the last page */ struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. 
*/ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index e7bb2e3bd0fb..b23d69ffd5ec 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -215,6 +215,7 @@ typedef int (*kxdrdproc_t)(void *rqstp, struct xdr_stream *xdr, void *obj); extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); +extern void xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 29772e01b179..b4737fbdec13 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -597,6 +597,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) } rqstp->rq_pages[i] = p; } + rqstp->rq_page_end = &rqstp->rq_pages[i]; rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ /* Make arg->head point to first page and arg->pages point to rest */ diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 352f3b35bbe5..2b546e8ce43d 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -462,6 +462,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; + xdr_set_scratch_buffer(xdr, NULL, 0); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -481,6 +482,74 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) } EXPORT_SYMBOL_GPL(xdr_init_encode); +/** + * xdr_commit_encode - Ensure all data is written to buffer + * @xdr: pointer to xdr_stream + * + * We handle encoding across page boundaries by giving the caller a + * temporary location to write to, then later copying the data into + * place; xdr_commit_encode does that copying. + * + * Normally the caller doesn't need to call this directly, as the + * following xdr_reserve_space will do it. But an explicit call may be + * required at the end of encoding, or any other time when the xdr_buf + * data might be read. + */ +void xdr_commit_encode(struct xdr_stream *xdr) +{ + int shift = xdr->scratch.iov_len; + void *page; + + if (shift == 0) + return; + page = page_address(*xdr->page_ptr); + memcpy(xdr->scratch.iov_base, page, shift); + memmove(page, page + shift, (void *)xdr->p - page); + xdr->scratch.iov_len = 0; +} +EXPORT_SYMBOL_GPL(xdr_commit_encode); + +__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) +{ + static __be32 *p; + int space_left; + int frag1bytes, frag2bytes; + + if (nbytes > PAGE_SIZE) + return NULL; /* Bigger buffers require special handling */ + if (xdr->buf->len + nbytes > xdr->buf->buflen) + return NULL; /* Sorry, we're totally out of space */ + frag1bytes = (xdr->end - xdr->p) << 2; + frag2bytes = nbytes - frag1bytes; + if (xdr->iov) + xdr->iov->iov_len += frag1bytes; + else { + xdr->buf->page_len += frag1bytes; + xdr->page_ptr++; + } + xdr->iov = NULL; + /* + * If the last encode didn't end exactly on a page boundary, the + * next one will straddle boundaries. Encode into the next + * page, then copy it back later in xdr_commit_encode. 
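 * For instance, if xdr->p is 12 bytes short of xdr->end when 32 bytes
 * are reserved, frag1bytes is 12 and frag2bytes is 20: the caller
 * writes all 32 bytes contiguously at the start of the next page, and
 * the eventual xdr_commit_encode() copies the first 12 bytes back into
 * the hole at the end of the previous buffer and slides the remaining
 * 20 down to the start of the page.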
We use + * the "scratch" iov to track any temporarily unused fragment of + * space at the end of the previous buffer: + */ + xdr->scratch.iov_base = xdr->p; + xdr->scratch.iov_len = frag1bytes; + p = page_address(*xdr->page_ptr); + /* + * Note this is where the next encode will start after we've + * shifted this one back: + */ + xdr->p = (void *)p + frag2bytes; + space_left = xdr->buf->buflen - xdr->buf->len; + xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE); + xdr->buf->page_len += frag2bytes; + xdr->buf->len += nbytes; + return p; +} + /** * xdr_reserve_space - Reserve buffer space for sending * @xdr: pointer to xdr_stream @@ -495,14 +564,18 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) __be32 *p = xdr->p; __be32 *q; + xdr_commit_encode(xdr); /* align nbytes on the next 32-bit boundary */ nbytes += 3; nbytes &= ~3; q = p + (nbytes >> 2); if (unlikely(q > xdr->end || q < p)) - return NULL; + return xdr_get_next_encode_buffer(xdr, nbytes); xdr->p = q; - xdr->iov->iov_len += nbytes; + if (xdr->iov) + xdr->iov->iov_len += nbytes; + else + xdr->buf->page_len += nbytes; xdr->buf->len += nbytes; return p; } @@ -539,6 +612,7 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) WARN_ON_ONCE(1); return; } + xdr_commit_encode(xdr); fraglen = min_t(int, buf->len - len, tail->iov_len); tail->iov_len -= fraglen; -- cgit v1.2.3-71-gd317 From db3f58a95beea6752d90fed03f9f198d282a3913 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Mar 2014 13:22:18 -0500 Subject: rpc: define xdr_restrict_buflen With this xdr_reserve_space can help us enforce various limits. Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/xdr.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index b23d69ffd5ec..70c6b92e15a7 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -217,6 +217,7 @@ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern void xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); +extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 2b546e8ce43d..39928444c7fb 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -648,6 +648,35 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) } EXPORT_SYMBOL(xdr_truncate_encode); +/** + * xdr_restrict_buflen - decrease available buffer space + * @xdr: pointer to xdr_stream + * @newbuflen: new maximum number of bytes available + * + * Adjust our idea of how much space is available in the buffer. + * If we've already used too much space in the buffer, returns -1. + * If the available space is already smaller than newbuflen, returns 0 + * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen + * and ensures xdr->end is set at most offset newbuflen from the start + * of the buffer. 
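 * A typical use is capping a reply at a negotiated limit; the NFSv4.1
 * sequence code in a later patch of this series does
 *	xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack);
 * to enforce the session's maximum response size.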
+ */ +int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen) +{ + struct xdr_buf *buf = xdr->buf; + int left_in_this_buf = (void *)xdr->end - (void *)xdr->p; + int end_offset = buf->len + left_in_this_buf; + + if (newbuflen < 0 || newbuflen < buf->len) + return -1; + if (newbuflen > buf->buflen) + return 0; + if (newbuflen < end_offset) + xdr->end = (void *)xdr->end + newbuflen - end_offset; + buf->buflen = newbuflen; + return 0; +} +EXPORT_SYMBOL(xdr_restrict_buflen); + /** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending * @xdr: pointer to xdr_stream -- cgit v1.2.3-71-gd317 From a5cddc885b99458df963a75abbe0b40cbef56c48 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 12 May 2014 18:10:58 -0400 Subject: nfsd4: better reservation of head space for krb5 RPC_MAX_AUTH_SIZE is scattered around several places. Better to set it once in the auth code, where this kind of estimate should be made. And while we're at it we can leave it zero when we're not using krb5i or krb5p. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 4 ++-- fs/nfsd/nfs4state.c | 2 +- fs/nfsd/nfs4xdr.c | 5 +++-- include/linux/sunrpc/svc.h | 11 +++++------ net/sunrpc/auth_gss/svcauth_gss.c | 2 ++ net/sunrpc/svcauth.c | 2 ++ 6 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2d786b813c7e..16e71d033ea5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1261,13 +1261,13 @@ static void svcxdr_init_encode(struct svc_rqst *rqstp, xdr->buf = buf; xdr->iov = head; xdr->p = head->iov_base + head->iov_len; - xdr->end = head->iov_base + PAGE_SIZE - 2 * RPC_MAX_AUTH_SIZE; + xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; /* Tail and page_len should be zero at this point: */ buf->len = buf->head[0].iov_len; xdr->scratch.iov_len = 0; xdr->page_ptr = buf->pages; buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) - - 2 * RPC_MAX_AUTH_SIZE; + - rqstp->rq_auth_slack; } /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 62b882dc48ec..d0a016a502be 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2288,7 +2288,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, session->se_fchannel.maxresp_sz; status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : nfserr_rep_too_big; - if (xdr_restrict_buflen(xdr, buflen - 2 * RPC_MAX_AUTH_SIZE)) + if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) goto out_put_session; svc_reserve(rqstp, buflen); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3e347a1caec4..470fe8998c9b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1611,7 +1611,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) DECODE_HEAD; struct nfsd4_op *op; bool cachethis = false; - int max_reply = 2 * RPC_MAX_AUTH_SIZE + 8; /* opcnt, status */ + int auth_slack= argp->rqstp->rq_auth_slack; + int max_reply = auth_slack + 8; /* opcnt, status */ int readcount = 0; int readbytes = 0; int i; @@ -1677,7 +1678,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) svc_reserve(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? 
RC_REPLBUFF : RC_NOCACHE; - if (readcount > 1 || max_reply > PAGE_SIZE - 2*RPC_MAX_AUTH_SIZE) + if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) argp->rqstp->rq_splice_ok = false; DECODE_TAIL; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 85cb6472a423..1bc7cd05b22e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -260,7 +260,10 @@ struct svc_rqst { void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ void * rq_auth_data; /* flavor-specific data */ - + int rq_auth_slack; /* extra space xdr code + * should leave in head + * for krb5i, krb5p. + */ int rq_reserved; /* space on socket outq * reserved for this request */ @@ -456,11 +459,7 @@ char * svc_print_addr(struct svc_rqst *, char *, size_t); */ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space) { - int added_space = 0; - - if (rqstp->rq_authop->flavour) - added_space = RPC_MAX_AUTH_SIZE; - svc_reserve(rqstp, space + added_space); + svc_reserve(rqstp, space + rqstp->rq_auth_slack); } #endif /* SUNRPC_SVC_H */ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 0f73f4507746..4ce5eccec1f6 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1503,6 +1503,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (unwrap_integ_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE; break; case RPC_GSS_SVC_PRIVACY: /* placeholders for length and seq. number: */ @@ -1511,6 +1512,7 @@ if (unwrap_priv_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2; break; default: goto auth_err; diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 2af7b0cba43a..79c0f3459b5c 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -54,6 +54,8 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) } spin_unlock(&authtab_lock); + rqstp->rq_auth_slack = 0; + rqstp->rq_authop = aops; return aops->accept(rqstp, authp); } -- cgit v1.2.3-71-gd317 From c8865b64b05b2f4eeefd369373e9c8aeb069e7a1 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Sun, 25 May 2014 17:47:26 +0300 Subject: cpumask: Utility function to set n'th cpu - local cpu first This function sets the n'th cpu, local cpus first. For example, on a 16-core server where the even-numbered cpus are local, it will produce the following values:
cpumask_set_cpu_local_first(0, numa, cpumask) => cpu 0 is set
cpumask_set_cpu_local_first(1, numa, cpumask) => cpu 2 is set
...
cpumask_set_cpu_local_first(7, numa, cpumask) => cpu 14 is set
cpumask_set_cpu_local_first(8, numa, cpumask) => cpu 1 is set
cpumask_set_cpu_local_first(9, numa, cpumask) => cpu 3 is set
...
cpumask_set_cpu_local_first(15, numa, cpumask) => cpu 15 is set
Currently this function will be used by multi-queue networking devices to calculate the IRQ affinity mask, such that as many local cpus as possible are utilized to handle the mq device's IRQs. Signed-off-by: Amir Vadai Signed-off-by: David S.
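A minimal sketch of the intended driver-side use; ring, ring_idx, numa_node and irq are hypothetical driver-private names here, but the mlx4_en patch that follows uses exactly this pattern:

	if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_set_cpu_local_first(ring_idx, numa_node,
					  ring->affinity_mask);
	if (err) {	/* -ENOMEM, or -EAGAIN if the online set changed */
		free_cpumask_var(ring->affinity_mask);
		ring->affinity_mask = NULL;
		return err;
	}
	/* the mask must stay allocated while the hint below refers to it */
	err = irq_set_affinity_hint(irq, ring->affinity_mask);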
Miller --- include/linux/cpumask.h | 2 ++ lib/cpumask.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d08e4d2a9b92..3551d667ef9f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -257,6 +257,8 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp); + /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) diff --git a/lib/cpumask.c b/lib/cpumask.c index b810b753c607..14049a96f04a 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -163,4 +163,68 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { memblock_free_early(__pa(mask), cpumask_size()); } + +/** + * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first + * + * @i: index number + * @numa_node: local numa_node + * @dstp: cpumask with the relevant cpu bit set according to the policy + * + * This function sets the cpumask according to a numa aware policy. + * cpumask could be used as an affinity hint for the IRQ related to a + * queue. When the policy is to spread queues across cores - local cores + * first. + * + * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set + * the cpu bit and need to re-call the function. + */ +int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +{ + cpumask_var_t mask; + int cpu; + int ret = 0; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + i %= num_online_cpus(); + + if (!cpumask_of_node(numa_node)) { + /* Use all online cpu's for non numa aware system */ + cpumask_copy(mask, cpu_online_mask); + } else { + int n; + + cpumask_and(mask, + cpumask_of_node(numa_node), cpu_online_mask); + + n = cpumask_weight(mask); + if (i >= n) { + i -= n; + + /* If index > number of local cpu's, mask out local + * cpu's + */ + cpumask_andnot(mask, cpu_online_mask, mask); + } + } + + for_each_cpu(cpu, mask) { + if (--i < 0) + goto out; + } + + ret = -EAGAIN; + +out: + free_cpumask_var(mask); + + if (!ret) + cpumask_set_cpu(cpu, dstp); + + return ret; +} +EXPORT_SYMBOL(cpumask_set_cpu_local_first); + #endif -- cgit v1.2.3-71-gd317 From 70a640d0dae3a9b1b222ce673eb5d92c263ddd61 Mon Sep 17 00:00:00 2001 From: Yuval Atias Date: Sun, 25 May 2014 17:47:27 +0300 Subject: net/mlx4_en: Use affinity hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The “affinity hint” mechanism is used by the user space daemon, irqbalancer, to indicate a preferred CPU mask for irqs. Irqbalancer can use this hint to balance the irqs between the cpus indicated by the mask. We wish the HCA to preferentially map the IRQs it uses to numa cores close to it. To accomplish this, we use cpumask_set_cpu_local_first(), that sets the affinity hint according the following policy: First it maps IRQs to “close” numa cores. If these are exhausted, the remaining IRQs are mapped to “far” numa cores. Signed-off-by: Yuval Atias Signed-off-by: Amir Vadai Signed-off-by: David S. 
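The hint set by irq_set_affinity_hint() surfaces to user space as /proc/irq/<n>/affinity_hint, which is where irqbalance reads it. Note the teardown ordering this implies: the hint must be cleared before the mask it points to is freed. Roughly (a sketch mirroring the eq.c and en_netdev.c changes below):

	irq_set_affinity_hint(irq, NULL);	/* unpublish the hint */
	free_cpumask_var(ring->affinity_mask);	/* now safe to free */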
Miller --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 6 +++++- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 30 ++++++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/eq.c | 13 ++++++++++- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 + include/linux/mlx4/device.h | 2 +- 6 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 199c7896f081..58b1f239ac2b 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1897,7 +1897,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) i, j, dev->pdev->bus->name); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, NULL, - &ibdev->eq_table[eq])) { + &ibdev->eq_table[eq], NULL)) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 636963db598a..ea2cd72e5368 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -118,11 +118,15 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, if (cq->is_tx == RX) { if (mdev->dev->caps.comp_pool) { if (!cq->vector) { + struct mlx4_en_rx_ring *ring = + priv->rx_ring[cq->ring]; + sprintf(name, "%s-%d", priv->dev->name, cq->ring); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(mdev->dev, name, rmap, - &cq->vector)) { + &cq->vector, + ring->affinity_mask)) { cq->vector = (cq->ring + 1 + priv->port) % mdev->dev->caps.num_comp_vectors; mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n", diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 58209bd0c94c..05d135572abc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1526,6 +1526,32 @@ static void mlx4_en_linkstate(struct work_struct *work) mutex_unlock(&mdev->state_lock); } +static void mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx]; + int numa_node = priv->mdev->dev->numa_node; + + if (numa_node == -1) + return; + + if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL)) { + en_err(priv, "Failed to allocate core mask\n"); + return; + } + + if (cpumask_set_cpu_local_first(ring_idx, numa_node, + ring->affinity_mask)) { + en_err(priv, "Failed setting affinity hint\n"); + free_cpumask_var(ring->affinity_mask); + ring->affinity_mask = NULL; + } +} + +static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask); + priv->rx_ring[ring_idx]->affinity_mask = NULL; +} int mlx4_en_start_port(struct net_device *dev) { @@ -1567,6 +1593,8 @@ int mlx4_en_start_port(struct net_device *dev) mlx4_en_cq_init_lock(cq); + mlx4_en_init_affinity_hint(priv, i); + err = mlx4_en_activate_cq(priv, cq, i); if (err) { en_err(priv, "Failed activating Rx CQ\n"); @@ -1847,6 +1875,8 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) msleep(1); mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); mlx4_en_deactivate_cq(priv, cq); + + mlx4_en_free_affinity_hint(priv, i); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c 
index d954ec1eac17..f91659e5fa13 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -1376,7 +1376,7 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) EXPORT_SYMBOL(mlx4_test_interrupts); int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, - int *vector) + int *vector, cpumask_var_t cpu_hint_mask) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -1411,6 +1411,15 @@ int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, } mlx4_assign_irq_notifier(priv, dev, priv->eq_table.eq[vec].irq); + if (cpu_hint_mask) { + err = irq_set_affinity_hint( + priv->eq_table.eq[vec].irq, + cpu_hint_mask); + if (err) { + mlx4_warn(dev, "Failed setting affinity hint\n"); + /*we dont want to break here*/ + } + } eq_set_ci(&priv->eq_table.eq[vec], 1); } @@ -1441,6 +1450,8 @@ void mlx4_release_eq(struct mlx4_dev *dev, int vec) irq_set_affinity_notifier( priv->eq_table.eq[vec].irq, NULL); + irq_set_affinity_hint(priv->eq_table.eq[vec].irq, + NULL); free_irq(priv->eq_table.eq[vec].irq, &priv->eq_table.eq[vec]); priv->msix_ctl.pool_bm &= ~(1ULL << i); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index b5db1bf361dc..0e15295bedd6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -313,6 +313,7 @@ struct mlx4_en_rx_ring { unsigned long csum_ok; unsigned long csum_none; int hwtstamp_rx_filter; + cpumask_var_t affinity_mask; }; struct mlx4_en_cq { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ca38871a585c..b9b70e00e3c1 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1161,7 +1161,7 @@ int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_test_interrupts(struct mlx4_dev *dev); int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, - int *vector); + int *vector, cpumask_t *cpu_hint_mask); void mlx4_release_eq(struct mlx4_dev *dev, int vec); int mlx4_get_phys_port_id(struct mlx4_dev *dev); -- cgit v1.2.3-71-gd317 From ee39facbf82e73e468c504d2b40e83e2d223c28c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 1 Jun 2014 21:58:02 -0700 Subject: net: Revert mlx4 cpumask changes. This reverts commit 70a640d0dae3a9b1b222ce673eb5d92c263ddd61 ("net/mlx4_en: Use affinity hint") and commit c8865b64b05b2f4eeefd369373e9c8aeb069e7a1 ("cpumask: Utility function to set n'th cpu - local cpu first") because these changes break the build when SMP is disabled amongst other things. Reported-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/ethernet/micrel/ks8851.c | 50 +++++++++++++++------------- include/linux/cpumask.h | 2 -- lib/cpumask.c | 64 ------------------------------------ 3 files changed, 28 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 66d4ab703f45..e72918970a58 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1441,30 +1441,32 @@ static int ks8851_probe(struct spi_device *spi) } } - ks->vdd_io = devm_regulator_get(&spi->dev, "vdd-io"); + ks->vdd_io = devm_regulator_get_optional(&spi->dev, "vdd-io"); if (IS_ERR(ks->vdd_io)) { ret = PTR_ERR(ks->vdd_io); - goto err_reg_io; - } - - ret = regulator_enable(ks->vdd_io); - if (ret) { - dev_err(&spi->dev, "regulator vdd_io enable fail: %d\n", - ret); - goto err_reg_io; + if (ret == -EPROBE_DEFER) + goto err_reg_io; + } else { + ret = regulator_enable(ks->vdd_io); + if (ret) { + dev_err(&spi->dev, "regulator vdd_io enable fail: %d\n", + ret); + goto err_reg_io; + } } - ks->vdd_reg = devm_regulator_get(&spi->dev, "vdd"); + ks->vdd_reg = devm_regulator_get_optional(&spi->dev, "vdd"); if (IS_ERR(ks->vdd_reg)) { ret = PTR_ERR(ks->vdd_reg); - goto err_reg; - } - - ret = regulator_enable(ks->vdd_reg); - if (ret) { - dev_err(&spi->dev, "regulator vdd enable fail: %d\n", - ret); - goto err_reg; + if (ret == -EPROBE_DEFER) + goto err_reg; + } else { + ret = regulator_enable(ks->vdd_reg); + if (ret) { + dev_err(&spi->dev, "regulator vdd enable fail: %d\n", + ret); + goto err_reg; + } } if (gpio_is_valid(gpio)) { @@ -1570,9 +1572,11 @@ err_irq: if (gpio_is_valid(gpio)) gpio_set_value(gpio, 0); err_id: - regulator_disable(ks->vdd_reg); + if (!IS_ERR(ks->vdd_reg)) + regulator_disable(ks->vdd_reg); err_reg: - regulator_disable(ks->vdd_io); + if (!IS_ERR(ks->vdd_io)) + regulator_disable(ks->vdd_io); err_reg_io: err_gpio: free_netdev(ndev); @@ -1590,8 +1594,10 @@ static int ks8851_remove(struct spi_device *spi) free_irq(spi->irq, priv); if (gpio_is_valid(priv->gpio)) gpio_set_value(priv->gpio, 0); - regulator_disable(priv->vdd_reg); - regulator_disable(priv->vdd_io); + if (!IS_ERR(priv->vdd_reg)) + regulator_disable(priv->vdd_reg); + if (!IS_ERR(priv->vdd_io)) + regulator_disable(priv->vdd_io); free_netdev(priv->netdev); return 0; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 3551d667ef9f..d08e4d2a9b92 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -257,8 +257,6 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } -int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp); - /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) diff --git a/lib/cpumask.c b/lib/cpumask.c index 14049a96f04a..b810b753c607 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -163,68 +163,4 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { memblock_free_early(__pa(mask), cpumask_size()); } - -/** - * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first - * - * @i: index number - * @numa_node: local numa_node - * @dstp: cpumask with the relevant cpu bit set according to the policy - * - * This function sets the cpumask according to a numa aware policy. - * cpumask could be used as an affinity hint for the IRQ related to a - * queue. When the policy is to spread queues across cores - local cores - * first. 
- * - * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set - * the cpu bit and need to re-call the function. - */ -int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) -{ - cpumask_var_t mask; - int cpu; - int ret = 0; - - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - i %= num_online_cpus(); - - if (!cpumask_of_node(numa_node)) { - /* Use all online cpu's for non numa aware system */ - cpumask_copy(mask, cpu_online_mask); - } else { - int n; - - cpumask_and(mask, - cpumask_of_node(numa_node), cpu_online_mask); - - n = cpumask_weight(mask); - if (i >= n) { - i -= n; - - /* If index > number of local cpu's, mask out local - * cpu's - */ - cpumask_andnot(mask, cpu_online_mask, mask); - } - } - - for_each_cpu(cpu, mask) { - if (--i < 0) - goto out; - } - - ret = -EAGAIN; - -out: - free_cpumask_var(mask); - - if (!ret) - cpumask_set_cpu(cpu, dstp); - - return ret; -} -EXPORT_SYMBOL(cpumask_set_cpu_local_first); - #endif -- cgit v1.2.3-71-gd317 From 3480593131e0b781287dae0139bf7ccee7cba7ff Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 May 2014 10:22:50 +0200 Subject: net: filter: get rid of BPF_S_* enum This patch finally allows us to get rid of the BPF_S_* enum. Currently, the code performs unnecessary encode and decode workarounds in seccomp and filter migration itself when a filter is being attached in order to overcome BPF_S_* encoding which is not used anymore by the new interpreter resp. JIT compilers. Keeping it around would mean that also in future we would need to extend and maintain this enum and related encoders/decoders. We can get rid of all that and save us these operations during filter attaching. Naturally, also JIT compilers need to be updated by this. Before JIT conversion is being done, each compiler checks if A is being loaded at startup to obtain information if it needs to emit instructions to clear A first. Since BPF extensions are a subset of BPF_LD | BPF_{W,H,B} | BPF_ABS variants, case statements for extensions can be removed at that point. To ease and minimalize code changes in the classic JITs, we have introduced bpf_anc_helper(). Tested with test_bpf on x86_64 (JIT, int), s390x (JIT, int), arm (JIT, int), i368 (int), ppc64 (JIT, int); for sparc we unfortunately didn't have access, but changes are analogous to the rest. Joint work with Alexei Starovoitov. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Mircea Gherzan Cc: Kees Cook Acked-by: Chema Gonzalez Signed-off-by: David S. 
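For background, the raw opcodes are self-describing, which is what makes a shadow enum redundant: each classic BPF code is a bitwise OR of small orthogonal fields that the long-standing uapi macros BPF_CLASS(), BPF_SIZE()/BPF_OP() and BPF_MODE()/BPF_SRC() pick apart. A simplified sketch of what the new bpf_anc_helper() does; the real helper added to include/linux/filter.h also validates each extension individually, so treat this as illustration only:

	#include <linux/filter.h>

	static u16 anc_helper_sketch(const struct sock_filter *ftest)
	{
		switch (ftest->code) {
		case BPF_LD | BPF_W | BPF_ABS:
		case BPF_LD | BPF_H | BPF_ABS:
		case BPF_LD | BPF_B | BPF_ABS:
			/* Ancillary ("extension") loads are absolute loads
			 * with a magic negative offset; SKF_AD_OFF is
			 * negative, so the unsigned comparison selects the
			 * top of the 32-bit offset space.  Fold them into
			 * the BPF_ANC pseudo class so JITs can switch on a
			 * single value. */
			if (ftest->k >= (u32)SKF_AD_OFF)
				return BPF_ANC | (ftest->k - SKF_AD_OFF);
			return ftest->code;	/* ordinary packet load */
		default:
			return ftest->code;
		}
	}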
Miller --- arch/arm/net/bpf_jit_32.c | 139 ++++++++-------- arch/powerpc/net/bpf_jit_64.S | 2 +- arch/powerpc/net/bpf_jit_comp.c | 157 +++++++++--------- arch/s390/net/bpf_jit_comp.c | 163 +++++++++---------- arch/sparc/net/bpf_jit_comp.c | 154 +++++++++--------- include/linux/filter.h | 108 +++++-------- kernel/seccomp.c | 83 +++++----- net/core/filter.c | 341 +++++++++++++++------------------------- 8 files changed, 498 insertions(+), 649 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 6f879c319a9d..fb5503ce016f 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -136,7 +136,7 @@ static u16 saved_regs(struct jit_ctx *ctx) u16 ret = 0; if ((ctx->skf->len > 1) || - (ctx->skf->insns[0].code == BPF_S_RET_A)) + (ctx->skf->insns[0].code == (BPF_RET | BPF_A))) ret |= 1 << r_A; #ifdef CONFIG_FRAME_POINTER @@ -164,18 +164,10 @@ static inline int mem_words_used(struct jit_ctx *ctx) static inline bool is_load_to_a(u16 inst) { switch (inst) { - case BPF_S_LD_W_LEN: - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: - case BPF_S_ANC_CPU: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_ANC_QUEUE: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: return true; default: return false; @@ -215,7 +207,7 @@ static void build_prologue(struct jit_ctx *ctx) emit(ARM_MOV_I(r_X, 0), ctx); /* do not leak kernel data to userspace */ - if ((first_inst != BPF_S_RET_K) && !(is_load_to_a(first_inst))) + if ((first_inst != (BPF_RET | BPF_K)) && !(is_load_to_a(first_inst))) emit(ARM_MOV_I(r_A, 0), ctx); /* stack space for the BPF_MEM words */ @@ -480,36 +472,39 @@ static int build_body(struct jit_ctx *ctx) u32 k; for (i = 0; i < prog->len; i++) { + u16 code; + inst = &(prog->insns[i]); /* K as an immediate value operand */ k = inst->k; + code = bpf_anc_helper(inst); /* compute offsets only in the fake pass */ if (ctx->target == NULL) ctx->offsets[i] = ctx->idx * 4; - switch (inst->code) { - case BPF_S_LD_IMM: + switch (code) { + case BPF_LD | BPF_IMM: emit_mov_i(r_A, k, ctx); break; - case BPF_S_LD_W_LEN: + case BPF_LD | BPF_W | BPF_LEN: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); emit(ARM_LDR_I(r_A, r_skb, offsetof(struct sk_buff, len)), ctx); break; - case BPF_S_LD_MEM: + case BPF_LD | BPF_MEM: /* A = scratch[k] */ ctx->seen |= SEEN_MEM_WORD(k); emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: load_order = 2; goto load; - case BPF_S_LD_H_ABS: + case BPF_LD | BPF_H | BPF_ABS: load_order = 1; goto load; - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_B | BPF_ABS: load_order = 0; load: /* the interpreter will deal with the negative K */ @@ -552,31 +547,31 @@ load_common: emit_err_ret(ARM_COND_NE, ctx); emit(ARM_MOV_R(r_A, ARM_R0), ctx); break; - case BPF_S_LD_W_IND: + case BPF_LD | BPF_W | BPF_IND: load_order = 2; goto load_ind; - case BPF_S_LD_H_IND: + case BPF_LD | BPF_H | BPF_IND: load_order = 1; goto load_ind; - case BPF_S_LD_B_IND: + case BPF_LD | BPF_B | BPF_IND: load_order = 0; load_ind: OP_IMM3(ARM_ADD, r_off, r_X, k, ctx); goto load_common; - case BPF_S_LDX_IMM: + case BPF_LDX | BPF_IMM: ctx->seen |= SEEN_X; emit_mov_i(r_X, k, ctx); break; - case BPF_S_LDX_W_LEN: + case BPF_LDX | BPF_W | BPF_LEN: ctx->seen |= 
SEEN_X | SEEN_SKB; emit(ARM_LDR_I(r_X, r_skb, offsetof(struct sk_buff, len)), ctx); break; - case BPF_S_LDX_MEM: + case BPF_LDX | BPF_MEM: ctx->seen |= SEEN_X | SEEN_MEM_WORD(k); emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: /* x = ((*(frame + k)) & 0xf) << 2; */ ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL; /* the interpreter should deal with the negative K */ @@ -606,113 +601,113 @@ load_ind: emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx); emit(ARM_LSL_I(r_X, r_X, 2), ctx); break; - case BPF_S_ST: + case BPF_ST: ctx->seen |= SEEN_MEM_WORD(k); emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_STX: + case BPF_STX: update_on_xread(ctx); ctx->seen |= SEEN_MEM_WORD(k); emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_ALU_ADD_K: + case BPF_ALU | BPF_ADD | BPF_K: /* A += K */ OP_IMM3(ARM_ADD, r_A, r_A, k, ctx); break; - case BPF_S_ALU_ADD_X: + case BPF_ALU | BPF_ADD | BPF_X: update_on_xread(ctx); emit(ARM_ADD_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_SUB_K: + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ OP_IMM3(ARM_SUB, r_A, r_A, k, ctx); break; - case BPF_S_ALU_SUB_X: + case BPF_ALU | BPF_SUB | BPF_X: update_on_xread(ctx); emit(ARM_SUB_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_MUL_K: + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ emit_mov_i(r_scratch, k, ctx); emit(ARM_MUL(r_A, r_A, r_scratch), ctx); break; - case BPF_S_ALU_MUL_X: + case BPF_ALU | BPF_MUL | BPF_X: update_on_xread(ctx); emit(ARM_MUL(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_DIV_K: + case BPF_ALU | BPF_DIV | BPF_K: if (k == 1) break; emit_mov_i(r_scratch, k, ctx); emit_udiv(r_A, r_A, r_scratch, ctx); break; - case BPF_S_ALU_DIV_X: + case BPF_ALU | BPF_DIV | BPF_X: update_on_xread(ctx); emit(ARM_CMP_I(r_X, 0), ctx); emit_err_ret(ARM_COND_EQ, ctx); emit_udiv(r_A, r_A, r_X, ctx); break; - case BPF_S_ALU_OR_K: + case BPF_ALU | BPF_OR | BPF_K: /* A |= K */ OP_IMM3(ARM_ORR, r_A, r_A, k, ctx); break; - case BPF_S_ALU_OR_X: + case BPF_ALU | BPF_OR | BPF_X: update_on_xread(ctx); emit(ARM_ORR_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_XOR_K: + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K; */ OP_IMM3(ARM_EOR, r_A, r_A, k, ctx); break; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: + case BPF_ANC | SKF_AD_ALU_XOR_X: + case BPF_ALU | BPF_XOR | BPF_X: /* A ^= X */ update_on_xread(ctx); emit(ARM_EOR_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_AND_K: + case BPF_ALU | BPF_AND | BPF_K: /* A &= K */ OP_IMM3(ARM_AND, r_A, r_A, k, ctx); break; - case BPF_S_ALU_AND_X: + case BPF_ALU | BPF_AND | BPF_X: update_on_xread(ctx); emit(ARM_AND_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_LSH_K: + case BPF_ALU | BPF_LSH | BPF_K: if (unlikely(k > 31)) return -1; emit(ARM_LSL_I(r_A, r_A, k), ctx); break; - case BPF_S_ALU_LSH_X: + case BPF_ALU | BPF_LSH | BPF_X: update_on_xread(ctx); emit(ARM_LSL_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_RSH_K: + case BPF_ALU | BPF_RSH | BPF_K: if (unlikely(k > 31)) return -1; emit(ARM_LSR_I(r_A, r_A, k), ctx); break; - case BPF_S_ALU_RSH_X: + case BPF_ALU | BPF_RSH | BPF_X: update_on_xread(ctx); emit(ARM_LSR_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_NEG: + case BPF_ALU | BPF_NEG: /* A = -A */ emit(ARM_RSB_I(r_A, r_A, 0), ctx); break; - case BPF_S_JMP_JA: + case BPF_JMP | BPF_JA: /* pc += K */ emit(ARM_B(b_imm(i + k + 1, ctx)), ctx); break; - case BPF_S_JMP_JEQ_K: + case BPF_JMP | BPF_JEQ | BPF_K: /* pc += (A == K) ? 
pc->jt : pc->jf */ condt = ARM_COND_EQ; goto cmp_imm; - case BPF_S_JMP_JGT_K: + case BPF_JMP | BPF_JGT | BPF_K: /* pc += (A > K) ? pc->jt : pc->jf */ condt = ARM_COND_HI; goto cmp_imm; - case BPF_S_JMP_JGE_K: + case BPF_JMP | BPF_JGE | BPF_K: /* pc += (A >= K) ? pc->jt : pc->jf */ condt = ARM_COND_HS; cmp_imm: @@ -731,22 +726,22 @@ cond_jump: _emit(condt ^ 1, ARM_B(b_imm(i + inst->jf + 1, ctx)), ctx); break; - case BPF_S_JMP_JEQ_X: + case BPF_JMP | BPF_JEQ | BPF_X: /* pc += (A == X) ? pc->jt : pc->jf */ condt = ARM_COND_EQ; goto cmp_x; - case BPF_S_JMP_JGT_X: + case BPF_JMP | BPF_JGT | BPF_X: /* pc += (A > X) ? pc->jt : pc->jf */ condt = ARM_COND_HI; goto cmp_x; - case BPF_S_JMP_JGE_X: + case BPF_JMP | BPF_JGE | BPF_X: /* pc += (A >= X) ? pc->jt : pc->jf */ condt = ARM_COND_CS; cmp_x: update_on_xread(ctx); emit(ARM_CMP_R(r_A, r_X), ctx); goto cond_jump; - case BPF_S_JMP_JSET_K: + case BPF_JMP | BPF_JSET | BPF_K: /* pc += (A & K) ? pc->jt : pc->jf */ condt = ARM_COND_NE; /* not set iff all zeroes iff Z==1 iff EQ */ @@ -759,16 +754,16 @@ cmp_x: emit(ARM_TST_I(r_A, imm12), ctx); } goto cond_jump; - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_X: /* pc += (A & X) ? pc->jt : pc->jf */ update_on_xread(ctx); condt = ARM_COND_NE; emit(ARM_TST_R(r_A, r_X), ctx); goto cond_jump; - case BPF_S_RET_A: + case BPF_RET | BPF_A: emit(ARM_MOV_R(ARM_R0, r_A), ctx); goto b_epilogue; - case BPF_S_RET_K: + case BPF_RET | BPF_K: if ((k == 0) && (ctx->ret0_fp_idx < 0)) ctx->ret0_fp_idx = i; emit_mov_i(ARM_R0, k, ctx); @@ -776,17 +771,17 @@ b_epilogue: if (i != ctx->skf->len - 1) emit(ARM_B(b_imm(prog->len, ctx)), ctx); break; - case BPF_S_MISC_TAX: + case BPF_MISC | BPF_TAX: /* X = A */ ctx->seen |= SEEN_X; emit(ARM_MOV_R(r_X, r_A), ctx); break; - case BPF_S_MISC_TXA: + case BPF_MISC | BPF_TXA: /* A = X */ update_on_xread(ctx); emit(ARM_MOV_R(r_A, r_X), ctx); break; - case BPF_S_ANC_PROTOCOL: + case BPF_ANC | SKF_AD_PROTOCOL: /* A = ntohs(skb->protocol) */ ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, @@ -795,7 +790,7 @@ b_epilogue: emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx); emit_swap16(r_A, r_scratch, ctx); break; - case BPF_S_ANC_CPU: + case BPF_ANC | SKF_AD_CPU: /* r_scratch = current_thread_info() */ OP_IMM3(ARM_BIC, r_scratch, ARM_SP, THREAD_SIZE - 1, ctx); /* A = current_thread_info()->cpu */ @@ -803,7 +798,7 @@ b_epilogue: off = offsetof(struct thread_info, cpu); emit(ARM_LDR_I(r_A, r_scratch, off), ctx); break; - case BPF_S_ANC_IFINDEX: + case BPF_ANC | SKF_AD_IFINDEX: /* A = skb->dev->ifindex */ ctx->seen |= SEEN_SKB; off = offsetof(struct sk_buff, dev); @@ -817,30 +812,30 @@ b_epilogue: off = offsetof(struct net_device, ifindex); emit(ARM_LDR_I(r_A, r_scratch, off), ctx); break; - case BPF_S_ANC_MARK: + case BPF_ANC | SKF_AD_MARK: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); off = offsetof(struct sk_buff, mark); emit(ARM_LDR_I(r_A, r_skb, off), ctx); break; - case BPF_S_ANC_RXHASH: + case BPF_ANC | SKF_AD_RXHASH: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); off = offsetof(struct sk_buff, hash); emit(ARM_LDR_I(r_A, r_skb, off), ctx); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); off = offsetof(struct sk_buff, vlan_tci); emit(ARM_LDRH_I(r_A, r_skb, off), ctx); - if (inst->code == BPF_S_ANC_VLAN_TAG) + if (code == (BPF_ANC | 
SKF_AD_VLAN_TAG)) OP_IMM3(ARM_AND, r_A, r_A, VLAN_VID_MASK, ctx); else OP_IMM3(ARM_AND, r_A, r_A, VLAN_TAG_PRESENT, ctx); break; - case BPF_S_ANC_QUEUE: + case BPF_ANC | SKF_AD_QUEUE: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S index e76eba74d9da..8f87d9217122 100644 --- a/arch/powerpc/net/bpf_jit_64.S +++ b/arch/powerpc/net/bpf_jit_64.S @@ -78,7 +78,7 @@ sk_load_byte_positive_offset: blr /* - * BPF_S_LDX_B_MSH: ldxb 4*([offset]&0xf) + * BPF_LDX | BPF_B | BPF_MSH: ldxb 4*([offset]&0xf) * r_addr is the offset value */ .globl sk_load_byte_msh diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 808ce1cae21a..6dcdadefd8d0 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -79,19 +79,11 @@ static void bpf_jit_build_prologue(struct sk_filter *fp, u32 *image, } switch (filter[0].code) { - case BPF_S_RET_K: - case BPF_S_LD_W_LEN: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_ANC_CPU: - case BPF_S_ANC_QUEUE: - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_RET | BPF_K: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: /* first instruction sets A register (or is RET 'constant') */ break; default: @@ -144,6 +136,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, for (i = 0; i < flen; i++) { unsigned int K = filter[i].k; + u16 code = bpf_anc_helper(&filter[i]); /* * addrs[] maps a BPF bytecode address into a real offset from @@ -151,35 +144,35 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, */ addrs[i] = ctx->idx * 4; - switch (filter[i].code) { + switch (code) { /*** ALU ops ***/ - case BPF_S_ALU_ADD_X: /* A += X; */ + case BPF_ALU | BPF_ADD | BPF_X: /* A += X; */ ctx->seen |= SEEN_XREG; PPC_ADD(r_A, r_A, r_X); break; - case BPF_S_ALU_ADD_K: /* A += K; */ + case BPF_ALU | BPF_ADD | BPF_K: /* A += K; */ if (!K) break; PPC_ADDI(r_A, r_A, IMM_L(K)); if (K >= 32768) PPC_ADDIS(r_A, r_A, IMM_HA(K)); break; - case BPF_S_ALU_SUB_X: /* A -= X; */ + case BPF_ALU | BPF_SUB | BPF_X: /* A -= X; */ ctx->seen |= SEEN_XREG; PPC_SUB(r_A, r_A, r_X); break; - case BPF_S_ALU_SUB_K: /* A -= K */ + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ if (!K) break; PPC_ADDI(r_A, r_A, IMM_L(-K)); if (K >= 32768) PPC_ADDIS(r_A, r_A, IMM_HA(-K)); break; - case BPF_S_ALU_MUL_X: /* A *= X; */ + case BPF_ALU | BPF_MUL | BPF_X: /* A *= X; */ ctx->seen |= SEEN_XREG; PPC_MUL(r_A, r_A, r_X); break; - case BPF_S_ALU_MUL_K: /* A *= K */ + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ if (K < 32768) PPC_MULI(r_A, r_A, K); else { @@ -187,7 +180,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_MUL(r_A, r_A, r_scratch1); } break; - case BPF_S_ALU_MOD_X: /* A %= X; */ + case BPF_ALU | BPF_MOD | BPF_X: /* A %= X; */ ctx->seen |= SEEN_XREG; PPC_CMPWI(r_X, 0); if (ctx->pc_ret0 != -1) { @@ -201,13 +194,13 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_MUL(r_scratch1, r_X, r_scratch1); PPC_SUB(r_A, r_A, r_scratch1); break; - case BPF_S_ALU_MOD_K: /* A %= K; */ + case BPF_ALU | BPF_MOD | BPF_K: /* A %= K; */ PPC_LI32(r_scratch2, K); PPC_DIVWU(r_scratch1, r_A, r_scratch2); PPC_MUL(r_scratch1, r_scratch2, r_scratch1); PPC_SUB(r_A, r_A, r_scratch1); 
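	/* i.e. A -= (A / K) * K: the remainder is open-coded with the
	 * divide/multiply/subtract identity, as there is no 32-bit modulo
	 * instruction here; the A %= X case above does the same. */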
break; - case BPF_S_ALU_DIV_X: /* A /= X; */ + case BPF_ALU | BPF_DIV | BPF_X: /* A /= X; */ ctx->seen |= SEEN_XREG; PPC_CMPWI(r_X, 0); if (ctx->pc_ret0 != -1) { @@ -223,17 +216,17 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, } PPC_DIVWU(r_A, r_A, r_X); break; - case BPF_S_ALU_DIV_K: /* A /= K */ + case BPF_ALU | BPF_DIV | BPF_K: /* A /= K */ if (K == 1) break; PPC_LI32(r_scratch1, K); PPC_DIVWU(r_A, r_A, r_scratch1); break; - case BPF_S_ALU_AND_X: + case BPF_ALU | BPF_AND | BPF_X: ctx->seen |= SEEN_XREG; PPC_AND(r_A, r_A, r_X); break; - case BPF_S_ALU_AND_K: + case BPF_ALU | BPF_AND | BPF_K: if (!IMM_H(K)) PPC_ANDI(r_A, r_A, K); else { @@ -241,51 +234,51 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_AND(r_A, r_A, r_scratch1); } break; - case BPF_S_ALU_OR_X: + case BPF_ALU | BPF_OR | BPF_X: ctx->seen |= SEEN_XREG; PPC_OR(r_A, r_A, r_X); break; - case BPF_S_ALU_OR_K: + case BPF_ALU | BPF_OR | BPF_K: if (IMM_L(K)) PPC_ORI(r_A, r_A, IMM_L(K)); if (K >= 65536) PPC_ORIS(r_A, r_A, IMM_H(K)); break; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: /* A ^= X */ + case BPF_ANC | SKF_AD_ALU_XOR_X: + case BPF_ALU | BPF_XOR | BPF_X: /* A ^= X */ ctx->seen |= SEEN_XREG; PPC_XOR(r_A, r_A, r_X); break; - case BPF_S_ALU_XOR_K: /* A ^= K */ + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K */ if (IMM_L(K)) PPC_XORI(r_A, r_A, IMM_L(K)); if (K >= 65536) PPC_XORIS(r_A, r_A, IMM_H(K)); break; - case BPF_S_ALU_LSH_X: /* A <<= X; */ + case BPF_ALU | BPF_LSH | BPF_X: /* A <<= X; */ ctx->seen |= SEEN_XREG; PPC_SLW(r_A, r_A, r_X); break; - case BPF_S_ALU_LSH_K: + case BPF_ALU | BPF_LSH | BPF_K: if (K == 0) break; else PPC_SLWI(r_A, r_A, K); break; - case BPF_S_ALU_RSH_X: /* A >>= X; */ + case BPF_ALU | BPF_RSH | BPF_X: /* A >>= X; */ ctx->seen |= SEEN_XREG; PPC_SRW(r_A, r_A, r_X); break; - case BPF_S_ALU_RSH_K: /* A >>= K; */ + case BPF_ALU | BPF_RSH | BPF_K: /* A >>= K; */ if (K == 0) break; else PPC_SRWI(r_A, r_A, K); break; - case BPF_S_ALU_NEG: + case BPF_ALU | BPF_NEG: PPC_NEG(r_A, r_A); break; - case BPF_S_RET_K: + case BPF_RET | BPF_K: PPC_LI32(r_ret, K); if (!K) { if (ctx->pc_ret0 == -1) @@ -312,7 +305,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_BLR(); } break; - case BPF_S_RET_A: + case BPF_RET | BPF_A: PPC_MR(r_ret, r_A); if (i != flen - 1) { if (ctx->seen) @@ -321,53 +314,53 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_BLR(); } break; - case BPF_S_MISC_TAX: /* X = A */ + case BPF_MISC | BPF_TAX: /* X = A */ PPC_MR(r_X, r_A); break; - case BPF_S_MISC_TXA: /* A = X */ + case BPF_MISC | BPF_TXA: /* A = X */ ctx->seen |= SEEN_XREG; PPC_MR(r_A, r_X); break; /*** Constant loads/M[] access ***/ - case BPF_S_LD_IMM: /* A = K */ + case BPF_LD | BPF_IMM: /* A = K */ PPC_LI32(r_A, K); break; - case BPF_S_LDX_IMM: /* X = K */ + case BPF_LDX | BPF_IMM: /* X = K */ PPC_LI32(r_X, K); break; - case BPF_S_LD_MEM: /* A = mem[K] */ + case BPF_LD | BPF_MEM: /* A = mem[K] */ PPC_MR(r_A, r_M + (K & 0xf)); ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_LDX_MEM: /* X = mem[K] */ + case BPF_LDX | BPF_MEM: /* X = mem[K] */ PPC_MR(r_X, r_M + (K & 0xf)); ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_ST: /* mem[K] = A */ + case BPF_ST: /* mem[K] = A */ PPC_MR(r_M + (K & 0xf), r_A); ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_STX: /* mem[K] = X */ + case BPF_STX: /* mem[K] = X */ PPC_MR(r_M + (K & 0xf), r_X); ctx->seen |= SEEN_XREG | SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_LD_W_LEN: /* A 
= skb->len; */ + case BPF_LD | BPF_W | BPF_LEN: /* A = skb->len; */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, len)); break; - case BPF_S_LDX_W_LEN: /* X = skb->len; */ + case BPF_LDX | BPF_W | BPF_LEN: /* X = skb->len; */ PPC_LWZ_OFFS(r_X, r_skb, offsetof(struct sk_buff, len)); break; /*** Ancillary info loads ***/ - case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ + case BPF_ANC | SKF_AD_PROTOCOL: /* A = ntohs(skb->protocol); */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); PPC_NTOHS_OFFS(r_A, r_skb, offsetof(struct sk_buff, protocol)); break; - case BPF_S_ANC_IFINDEX: + case BPF_ANC | SKF_AD_IFINDEX: PPC_LD_OFFS(r_scratch1, r_skb, offsetof(struct sk_buff, dev)); PPC_CMPDI(r_scratch1, 0); @@ -384,33 +377,33 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_LWZ_OFFS(r_A, r_scratch1, offsetof(struct net_device, ifindex)); break; - case BPF_S_ANC_MARK: + case BPF_ANC | SKF_AD_MARK: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, mark)); break; - case BPF_S_ANC_RXHASH: + case BPF_ANC | SKF_AD_RXHASH: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, hash)); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, vlan_tci)); - if (filter[i].code == BPF_S_ANC_VLAN_TAG) + if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) PPC_ANDI(r_A, r_A, VLAN_VID_MASK); else PPC_ANDI(r_A, r_A, VLAN_TAG_PRESENT); break; - case BPF_S_ANC_QUEUE: + case BPF_ANC | SKF_AD_QUEUE: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, queue_mapping)); break; - case BPF_S_ANC_CPU: + case BPF_ANC | SKF_AD_CPU: #ifdef CONFIG_SMP /* * PACA ptr is r13: @@ -426,13 +419,13 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, break; /*** Absolute loads from packet header/data ***/ - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_word); goto common_load; - case BPF_S_LD_H_ABS: + case BPF_LD | BPF_H | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_half); goto common_load; - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_B | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_byte); common_load: /* Load from [K]. 
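The sk_load_{word,half,byte} helpers come in _positive_offset and _negative_offset variants (see sk_load_byte_positive_offset in bpf_jit_64.S above); CHOOSE_LOAD_FUNC picks between them at JIT time from the sign of the constant K.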
*/ @@ -449,13 +442,13 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, break; /*** Indirect loads from packet header/data ***/ - case BPF_S_LD_W_IND: + case BPF_LD | BPF_W | BPF_IND: func = sk_load_word; goto common_load_ind; - case BPF_S_LD_H_IND: + case BPF_LD | BPF_H | BPF_IND: func = sk_load_half; goto common_load_ind; - case BPF_S_LD_B_IND: + case BPF_LD | BPF_B | BPF_IND: func = sk_load_byte; common_load_ind: /* @@ -473,31 +466,31 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_BCC(COND_LT, exit_addr); break; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh); goto common_load; break; /*** Jump and branches ***/ - case BPF_S_JMP_JA: + case BPF_JMP | BPF_JA: if (K != 0) PPC_JMP(addrs[i + 1 + K]); break; - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: true_cond = COND_GT; goto cond_branch; - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: true_cond = COND_GE; goto cond_branch; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: true_cond = COND_EQ; goto cond_branch; - case BPF_S_JMP_JSET_K: - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: true_cond = COND_NE; /* Fall through */ cond_branch: @@ -508,20 +501,20 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, break; } - switch (filter[i].code) { - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JEQ_X: + switch (code) { + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JEQ | BPF_X: ctx->seen |= SEEN_XREG; PPC_CMPLW(r_A, r_X); break; - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_X: ctx->seen |= SEEN_XREG; PPC_AND_DOT(r_scratch1, r_A, r_X); break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGE_K: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: if (K < 32768) PPC_CMPLWI(r_A, K); else { @@ -529,7 +522,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_CMPLW(r_A, r_scratch1); } break; - case BPF_S_JMP_JSET_K: + case BPF_JMP | BPF_JSET | BPF_K: if (K < 32768) /* PPC_ANDI is /only/ dot-form */ PPC_ANDI(r_scratch1, r_A, K); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e9f8fa9337fe..a2cbd875543a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -269,27 +269,17 @@ static void bpf_jit_noleaks(struct bpf_jit *jit, struct sock_filter *filter) EMIT4(0xa7c80000); /* Clear A if the first register does not set it. 
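The first instructions listed in the switch below all set the A register before reading it, so the explicit clear can be skipped for them; any other first instruction falls through to the default case, which emits A = 0.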
*/ switch (filter[0].code) { - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: - case BPF_S_LD_W_LEN: - case BPF_S_LD_W_IND: - case BPF_S_LD_H_IND: - case BPF_S_LD_B_IND: - case BPF_S_LD_IMM: - case BPF_S_LD_MEM: - case BPF_S_MISC_TXA: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_PKTTYPE: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_QUEUE: - case BPF_S_ANC_HATYPE: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_CPU: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_RET_K: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_IND: + case BPF_LD | BPF_H | BPF_IND: + case BPF_LD | BPF_B | BPF_IND: + case BPF_LD | BPF_IMM: + case BPF_LD | BPF_MEM: + case BPF_MISC | BPF_TXA: + case BPF_RET | BPF_K: /* first instruction sets A register */ break; default: /* A = 0 */ @@ -304,15 +294,18 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, unsigned int K; int offset; unsigned int mask; + u16 code; K = filter->k; - switch (filter->code) { - case BPF_S_ALU_ADD_X: /* A += X */ + code = bpf_anc_helper(filter); + + switch (code) { + case BPF_ALU | BPF_ADD | BPF_X: /* A += X */ jit->seen |= SEEN_XREG; /* ar %r5,%r12 */ EMIT2(0x1a5c); break; - case BPF_S_ALU_ADD_K: /* A += K */ + case BPF_ALU | BPF_ADD | BPF_K: /* A += K */ if (!K) break; if (K <= 16383) @@ -325,12 +318,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* a %r5,(%r13) */ EMIT4_DISP(0x5a50d000, EMIT_CONST(K)); break; - case BPF_S_ALU_SUB_X: /* A -= X */ + case BPF_ALU | BPF_SUB | BPF_X: /* A -= X */ jit->seen |= SEEN_XREG; /* sr %r5,%r12 */ EMIT2(0x1b5c); break; - case BPF_S_ALU_SUB_K: /* A -= K */ + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ if (!K) break; if (K <= 16384) @@ -343,12 +336,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* s %r5,(%r13) */ EMIT4_DISP(0x5b50d000, EMIT_CONST(K)); break; - case BPF_S_ALU_MUL_X: /* A *= X */ + case BPF_ALU | BPF_MUL | BPF_X: /* A *= X */ jit->seen |= SEEN_XREG; /* msr %r5,%r12 */ EMIT4(0xb252005c); break; - case BPF_S_ALU_MUL_K: /* A *= K */ + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ if (K <= 16383) /* mhi %r5,K */ EMIT4_IMM(0xa75c0000, K); @@ -359,7 +352,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* ms %r5,(%r13) */ EMIT4_DISP(0x7150d000, EMIT_CONST(K)); break; - case BPF_S_ALU_DIV_X: /* A /= X */ + case BPF_ALU | BPF_DIV | BPF_X: /* A /= X */ jit->seen |= SEEN_XREG | SEEN_RET0; /* ltr %r12,%r12 */ EMIT2(0x12cc); @@ -370,7 +363,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* dlr %r4,%r12 */ EMIT4(0xb997004c); break; - case BPF_S_ALU_DIV_K: /* A /= K */ + case BPF_ALU | BPF_DIV | BPF_K: /* A /= K */ if (K == 1) break; /* lhi %r4,0 */ @@ -378,7 +371,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* dl %r4,(%r13) */ EMIT6_DISP(0xe340d000, 0x0097, EMIT_CONST(K)); break; - case BPF_S_ALU_MOD_X: /* A %= X */ + case BPF_ALU | BPF_MOD | BPF_X: /* A %= X */ jit->seen |= SEEN_XREG | SEEN_RET0; /* ltr %r12,%r12 */ EMIT2(0x12cc); @@ -391,7 +384,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* lr %r5,%r4 */ EMIT2(0x1854); break; - case BPF_S_ALU_MOD_K: /* A %= K */ + case BPF_ALU | BPF_MOD | BPF_K: /* A %= K */ if (K == 1) { /* lhi %r5,0 */ EMIT4(0xa7580000); @@ -404,12 +397,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, 
struct sock_filter *filter, /* lr %r5,%r4 */ EMIT2(0x1854); break; - case BPF_S_ALU_AND_X: /* A &= X */ + case BPF_ALU | BPF_AND | BPF_X: /* A &= X */ jit->seen |= SEEN_XREG; /* nr %r5,%r12 */ EMIT2(0x145c); break; - case BPF_S_ALU_AND_K: /* A &= K */ + case BPF_ALU | BPF_AND | BPF_K: /* A &= K */ if (test_facility(21)) /* nilf %r5, */ EMIT6_IMM(0xc05b0000, K); @@ -417,12 +410,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* n %r5,(%r13) */ EMIT4_DISP(0x5450d000, EMIT_CONST(K)); break; - case BPF_S_ALU_OR_X: /* A |= X */ + case BPF_ALU | BPF_OR | BPF_X: /* A |= X */ jit->seen |= SEEN_XREG; /* or %r5,%r12 */ EMIT2(0x165c); break; - case BPF_S_ALU_OR_K: /* A |= K */ + case BPF_ALU | BPF_OR | BPF_K: /* A |= K */ if (test_facility(21)) /* oilf %r5, */ EMIT6_IMM(0xc05d0000, K); @@ -430,55 +423,55 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* o %r5,(%r13) */ EMIT4_DISP(0x5650d000, EMIT_CONST(K)); break; - case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ - case BPF_S_ALU_XOR_X: + case BPF_ANC | SKF_AD_ALU_XOR_X: /* A ^= X; */ + case BPF_ALU | BPF_XOR | BPF_X: jit->seen |= SEEN_XREG; /* xr %r5,%r12 */ EMIT2(0x175c); break; - case BPF_S_ALU_XOR_K: /* A ^= K */ + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K */ if (!K) break; /* x %r5,(%r13) */ EMIT4_DISP(0x5750d000, EMIT_CONST(K)); break; - case BPF_S_ALU_LSH_X: /* A <<= X; */ + case BPF_ALU | BPF_LSH | BPF_X: /* A <<= X; */ jit->seen |= SEEN_XREG; /* sll %r5,0(%r12) */ EMIT4(0x8950c000); break; - case BPF_S_ALU_LSH_K: /* A <<= K */ + case BPF_ALU | BPF_LSH | BPF_K: /* A <<= K */ if (K == 0) break; /* sll %r5,K */ EMIT4_DISP(0x89500000, K); break; - case BPF_S_ALU_RSH_X: /* A >>= X; */ + case BPF_ALU | BPF_RSH | BPF_X: /* A >>= X; */ jit->seen |= SEEN_XREG; /* srl %r5,0(%r12) */ EMIT4(0x8850c000); break; - case BPF_S_ALU_RSH_K: /* A >>= K; */ + case BPF_ALU | BPF_RSH | BPF_K: /* A >>= K; */ if (K == 0) break; /* srl %r5,K */ EMIT4_DISP(0x88500000, K); break; - case BPF_S_ALU_NEG: /* A = -A */ + case BPF_ALU | BPF_NEG: /* A = -A */ /* lnr %r5,%r5 */ EMIT2(0x1155); break; - case BPF_S_JMP_JA: /* ip += K */ + case BPF_JMP | BPF_JA: /* ip += K */ offset = addrs[i + K] + jit->start - jit->prg; EMIT4_PCREL(0xa7f40000, offset); break; - case BPF_S_JMP_JGT_K: /* ip += (A > K) ? jt : jf */ + case BPF_JMP | BPF_JGT | BPF_K: /* ip += (A > K) ? jt : jf */ mask = 0x200000; /* jh */ goto kbranch; - case BPF_S_JMP_JGE_K: /* ip += (A >= K) ? jt : jf */ + case BPF_JMP | BPF_JGE | BPF_K: /* ip += (A >= K) ? jt : jf */ mask = 0xa00000; /* jhe */ goto kbranch; - case BPF_S_JMP_JEQ_K: /* ip += (A == K) ? jt : jf */ + case BPF_JMP | BPF_JEQ | BPF_K: /* ip += (A == K) ? jt : jf */ mask = 0x800000; /* je */ kbranch: /* Emit compare if the branch targets are different */ if (filter->jt != filter->jf) { @@ -511,7 +504,7 @@ branch: if (filter->jt == filter->jf) { EMIT4_PCREL(0xa7040000 | (mask ^ 0xf00000), offset); } break; - case BPF_S_JMP_JSET_K: /* ip += (A & K) ? jt : jf */ + case BPF_JMP | BPF_JSET | BPF_K: /* ip += (A & K) ? jt : jf */ mask = 0x700000; /* jnz */ /* Emit test if the branch targets are different */ if (filter->jt != filter->jf) { @@ -525,13 +518,13 @@ branch: if (filter->jt == filter->jf) { EMIT4_IMM(0xa7510000, K); } goto branch; - case BPF_S_JMP_JGT_X: /* ip += (A > X) ? jt : jf */ + case BPF_JMP | BPF_JGT | BPF_X: /* ip += (A > X) ? jt : jf */ mask = 0x200000; /* jh */ goto xbranch; - case BPF_S_JMP_JGE_X: /* ip += (A >= X) ? jt : jf */ + case BPF_JMP | BPF_JGE | BPF_X: /* ip += (A >= X) ? 
jt : jf */ mask = 0xa00000; /* jhe */ goto xbranch; - case BPF_S_JMP_JEQ_X: /* ip += (A == X) ? jt : jf */ + case BPF_JMP | BPF_JEQ | BPF_X: /* ip += (A == X) ? jt : jf */ mask = 0x800000; /* je */ xbranch: /* Emit compare if the branch targets are different */ if (filter->jt != filter->jf) { @@ -540,7 +533,7 @@ xbranch: /* Emit compare if the branch targets are different */ EMIT2(0x195c); } goto branch; - case BPF_S_JMP_JSET_X: /* ip += (A & X) ? jt : jf */ + case BPF_JMP | BPF_JSET | BPF_X: /* ip += (A & X) ? jt : jf */ mask = 0x700000; /* jnz */ /* Emit test if the branch targets are different */ if (filter->jt != filter->jf) { @@ -551,15 +544,15 @@ xbranch: /* Emit compare if the branch targets are different */ EMIT2(0x144c); } goto branch; - case BPF_S_LD_W_ABS: /* A = *(u32 *) (skb->data+K) */ + case BPF_LD | BPF_W | BPF_ABS: /* A = *(u32 *) (skb->data+K) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_WORD; offset = jit->off_load_word; goto load_abs; - case BPF_S_LD_H_ABS: /* A = *(u16 *) (skb->data+K) */ + case BPF_LD | BPF_H | BPF_ABS: /* A = *(u16 *) (skb->data+K) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_HALF; offset = jit->off_load_half; goto load_abs; - case BPF_S_LD_B_ABS: /* A = *(u8 *) (skb->data+K) */ + case BPF_LD | BPF_B | BPF_ABS: /* A = *(u8 *) (skb->data+K) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_BYTE; offset = jit->off_load_byte; load_abs: if ((int) K < 0) @@ -573,19 +566,19 @@ call_fn: /* lg %r1,(%r13) */ /* jnz */ EMIT4_PCREL(0xa7740000, (jit->ret0_ip - jit->prg)); break; - case BPF_S_LD_W_IND: /* A = *(u32 *) (skb->data+K+X) */ + case BPF_LD | BPF_W | BPF_IND: /* A = *(u32 *) (skb->data+K+X) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_IWORD; offset = jit->off_load_iword; goto call_fn; - case BPF_S_LD_H_IND: /* A = *(u16 *) (skb->data+K+X) */ + case BPF_LD | BPF_H | BPF_IND: /* A = *(u16 *) (skb->data+K+X) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_IHALF; offset = jit->off_load_ihalf; goto call_fn; - case BPF_S_LD_B_IND: /* A = *(u8 *) (skb->data+K+X) */ + case BPF_LD | BPF_B | BPF_IND: /* A = *(u8 *) (skb->data+K+X) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_IBYTE; offset = jit->off_load_ibyte; goto call_fn; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: /* X = (*(u8 *)(skb->data+K) & 0xf) << 2 */ jit->seen |= SEEN_RET0; if ((int) K < 0) { @@ -596,17 +589,17 @@ call_fn: /* lg %r1,(%r13) */ jit->seen |= SEEN_DATAREF | SEEN_LOAD_BMSH; offset = jit->off_load_bmsh; goto call_fn; - case BPF_S_LD_W_LEN: /* A = skb->len; */ + case BPF_LD | BPF_W | BPF_LEN: /* A = skb->len; */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); /* l %r5,(%r2) */ EMIT4_DISP(0x58502000, offsetof(struct sk_buff, len)); break; - case BPF_S_LDX_W_LEN: /* X = skb->len; */ + case BPF_LDX | BPF_W | BPF_LEN: /* X = skb->len; */ jit->seen |= SEEN_XREG; /* l %r12,(%r2) */ EMIT4_DISP(0x58c02000, offsetof(struct sk_buff, len)); break; - case BPF_S_LD_IMM: /* A = K */ + case BPF_LD | BPF_IMM: /* A = K */ if (K <= 16383) /* lhi %r5,K */ EMIT4_IMM(0xa7580000, K); @@ -617,7 +610,7 @@ call_fn: /* lg %r1,(%r13) */ /* l %r5,(%r13) */ EMIT4_DISP(0x5850d000, EMIT_CONST(K)); break; - case BPF_S_LDX_IMM: /* X = K */ + case BPF_LDX | BPF_IMM: /* X = K */ jit->seen |= SEEN_XREG; if (K <= 16383) /* lhi %r12, */ @@ -629,29 +622,29 @@ call_fn: /* lg %r1,(%r13) */ /* l %r12,(%r13) */ EMIT4_DISP(0x58c0d000, EMIT_CONST(K)); break; - case BPF_S_LD_MEM: /* A = mem[K] */ + case BPF_LD | BPF_MEM: /* A = mem[K] */ jit->seen |= SEEN_MEM; /* l %r5,(%r15) */ 
EMIT4_DISP(0x5850f000, (jit->seen & SEEN_DATAREF) ? 160 + K*4 : K*4); break; - case BPF_S_LDX_MEM: /* X = mem[K] */ + case BPF_LDX | BPF_MEM: /* X = mem[K] */ jit->seen |= SEEN_XREG | SEEN_MEM; /* l %r12,(%r15) */ EMIT4_DISP(0x58c0f000, (jit->seen & SEEN_DATAREF) ? 160 + K*4 : K*4); break; - case BPF_S_MISC_TAX: /* X = A */ + case BPF_MISC | BPF_TAX: /* X = A */ jit->seen |= SEEN_XREG; /* lr %r12,%r5 */ EMIT2(0x18c5); break; - case BPF_S_MISC_TXA: /* A = X */ + case BPF_MISC | BPF_TXA: /* A = X */ jit->seen |= SEEN_XREG; /* lr %r5,%r12 */ EMIT2(0x185c); break; - case BPF_S_RET_K: + case BPF_RET | BPF_K: if (K == 0) { jit->seen |= SEEN_RET0; if (last) @@ -671,33 +664,33 @@ call_fn: /* lg %r1,(%r13) */ EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg); } break; - case BPF_S_RET_A: + case BPF_RET | BPF_A: /* llgfr %r2,%r5 */ EMIT4(0xb9160025); /* j */ EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg); break; - case BPF_S_ST: /* mem[K] = A */ + case BPF_ST: /* mem[K] = A */ jit->seen |= SEEN_MEM; /* st %r5,(%r15) */ EMIT4_DISP(0x5050f000, (jit->seen & SEEN_DATAREF) ? 160 + K*4 : K*4); break; - case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ + case BPF_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ jit->seen |= SEEN_XREG | SEEN_MEM; /* st %r12,(%r15) */ EMIT4_DISP(0x50c0f000, (jit->seen & SEEN_DATAREF) ? 160 + K*4 : K*4); break; - case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ + case BPF_ANC | SKF_AD_PROTOCOL: /* A = ntohs(skb->protocol); */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); /* lhi %r5,0 */ EMIT4(0xa7580000); /* icm %r5,3,(%r2) */ EMIT4_DISP(0xbf532000, offsetof(struct sk_buff, protocol)); break; - case BPF_S_ANC_IFINDEX: /* if (!skb->dev) return 0; - * A = skb->dev->ifindex */ + case BPF_ANC | SKF_AD_IFINDEX: /* if (!skb->dev) return 0; + * A = skb->dev->ifindex */ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); jit->seen |= SEEN_RET0; /* lg %r1,(%r2) */ @@ -709,20 +702,20 @@ call_fn: /* lg %r1,(%r13) */ /* l %r5,(%r1) */ EMIT4_DISP(0x58501000, offsetof(struct net_device, ifindex)); break; - case BPF_S_ANC_MARK: /* A = skb->mark */ + case BPF_ANC | SKF_AD_MARK: /* A = skb->mark */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); /* l %r5,(%r2) */ EMIT4_DISP(0x58502000, offsetof(struct sk_buff, mark)); break; - case BPF_S_ANC_QUEUE: /* A = skb->queue_mapping */ + case BPF_ANC | SKF_AD_QUEUE: /* A = skb->queue_mapping */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); /* lhi %r5,0 */ EMIT4(0xa7580000); /* icm %r5,3,(%r2) */ EMIT4_DISP(0xbf532000, offsetof(struct sk_buff, queue_mapping)); break; - case BPF_S_ANC_HATYPE: /* if (!skb->dev) return 0; - * A = skb->dev->type */ + case BPF_ANC | SKF_AD_HATYPE: /* if (!skb->dev) return 0; + * A = skb->dev->type */ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); jit->seen |= SEEN_RET0; /* lg %r1,(%r2) */ @@ -736,20 +729,20 @@ call_fn: /* lg %r1,(%r13) */ /* icm %r5,3,(%r1) */ EMIT4_DISP(0xbf531000, offsetof(struct net_device, type)); break; - case BPF_S_ANC_RXHASH: /* A = skb->hash */ + case BPF_ANC | SKF_AD_RXHASH: /* A = skb->hash */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); /* l %r5,(%r2) */ EMIT4_DISP(0x58502000, offsetof(struct sk_buff, hash)); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* lhi %r5,0 */ EMIT4(0xa7580000); /* icm %r5,3,(%r2) */ 
EMIT4_DISP(0xbf532000, offsetof(struct sk_buff, vlan_tci)); - if (filter->code == BPF_S_ANC_VLAN_TAG) { + if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) { /* nill %r5,0xefff */ EMIT4_IMM(0xa5570000, ~VLAN_TAG_PRESENT); } else { @@ -759,7 +752,7 @@ call_fn: /* lg %r1,(%r13) */ EMIT4_DISP(0x88500000, 12); } break; - case BPF_S_ANC_PKTTYPE: + case BPF_ANC | SKF_AD_PKTTYPE: if (pkt_type_offset < 0) goto out; /* lhi %r5,0 */ @@ -769,7 +762,7 @@ call_fn: /* lg %r1,(%r13) */ /* srl %r5,5 */ EMIT4_DISP(0x88500000, 5); break; - case BPF_S_ANC_CPU: /* A = smp_processor_id() */ + case BPF_ANC | SKF_AD_CPU: /* A = smp_processor_id() */ #ifdef CONFIG_SMP /* l %r5, */ EMIT4_DISP(0x58500000, offsetof(struct _lowcore, cpu_nr)); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index a82c6b2a9780..c88cf147deed 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -415,20 +415,11 @@ void bpf_jit_compile(struct sk_filter *fp) emit_reg_move(O7, r_saved_O7); switch (filter[0].code) { - case BPF_S_RET_K: - case BPF_S_LD_W_LEN: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_PKTTYPE: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_ANC_CPU: - case BPF_S_ANC_QUEUE: - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_RET | BPF_K: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: /* The first instruction sets the A register (or is * a "RET 'constant'") */ @@ -445,59 +436,60 @@ void bpf_jit_compile(struct sk_filter *fp) unsigned int t_offset; unsigned int f_offset; u32 t_op, f_op; + u16 code = bpf_anc_helper(&filter[i]); int ilen; - switch (filter[i].code) { - case BPF_S_ALU_ADD_X: /* A += X; */ + switch (code) { + case BPF_ALU | BPF_ADD | BPF_X: /* A += X; */ emit_alu_X(ADD); break; - case BPF_S_ALU_ADD_K: /* A += K; */ + case BPF_ALU | BPF_ADD | BPF_K: /* A += K; */ emit_alu_K(ADD, K); break; - case BPF_S_ALU_SUB_X: /* A -= X; */ + case BPF_ALU | BPF_SUB | BPF_X: /* A -= X; */ emit_alu_X(SUB); break; - case BPF_S_ALU_SUB_K: /* A -= K */ + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ emit_alu_K(SUB, K); break; - case BPF_S_ALU_AND_X: /* A &= X */ + case BPF_ALU | BPF_AND | BPF_X: /* A &= X */ emit_alu_X(AND); break; - case BPF_S_ALU_AND_K: /* A &= K */ + case BPF_ALU | BPF_AND | BPF_K: /* A &= K */ emit_alu_K(AND, K); break; - case BPF_S_ALU_OR_X: /* A |= X */ + case BPF_ALU | BPF_OR | BPF_X: /* A |= X */ emit_alu_X(OR); break; - case BPF_S_ALU_OR_K: /* A |= K */ + case BPF_ALU | BPF_OR | BPF_K: /* A |= K */ emit_alu_K(OR, K); break; - case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ - case BPF_S_ALU_XOR_X: + case BPF_ANC | SKF_AD_ALU_XOR_X: /* A ^= X; */ + case BPF_ALU | BPF_XOR | BPF_X: emit_alu_X(XOR); break; - case BPF_S_ALU_XOR_K: /* A ^= K */ + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K */ emit_alu_K(XOR, K); break; - case BPF_S_ALU_LSH_X: /* A <<= X */ + case BPF_ALU | BPF_LSH | BPF_X: /* A <<= X */ emit_alu_X(SLL); break; - case BPF_S_ALU_LSH_K: /* A <<= K */ + case BPF_ALU | BPF_LSH | BPF_K: /* A <<= K */ emit_alu_K(SLL, K); break; - case BPF_S_ALU_RSH_X: /* A >>= X */ + case BPF_ALU | BPF_RSH | BPF_X: /* A >>= X */ emit_alu_X(SRL); break; - case BPF_S_ALU_RSH_K: /* A >>= K */ + case BPF_ALU | BPF_RSH | BPF_K: /* A >>= K */ emit_alu_K(SRL, K); break; - case BPF_S_ALU_MUL_X: /* A *= X; */ + case BPF_ALU | BPF_MUL | BPF_X: /* A *= X; */ emit_alu_X(MUL); break; - case 
BPF_S_ALU_MUL_K: /* A *= K */ + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ emit_alu_K(MUL, K); break; - case BPF_S_ALU_DIV_K: /* A /= K with K != 0*/ + case BPF_ALU | BPF_DIV | BPF_K: /* A /= K with K != 0*/ if (K == 1) break; emit_write_y(G0); @@ -512,7 +504,7 @@ void bpf_jit_compile(struct sk_filter *fp) #endif emit_alu_K(DIV, K); break; - case BPF_S_ALU_DIV_X: /* A /= X; */ + case BPF_ALU | BPF_DIV | BPF_X: /* A /= X; */ emit_cmpi(r_X, 0); if (pc_ret0 > 0) { t_offset = addrs[pc_ret0 - 1]; @@ -544,10 +536,10 @@ void bpf_jit_compile(struct sk_filter *fp) #endif emit_alu_X(DIV); break; - case BPF_S_ALU_NEG: + case BPF_ALU | BPF_NEG: emit_neg(); break; - case BPF_S_RET_K: + case BPF_RET | BPF_K: if (!K) { if (pc_ret0 == -1) pc_ret0 = i; @@ -556,7 +548,7 @@ void bpf_jit_compile(struct sk_filter *fp) emit_loadimm(K, r_A); } /* Fallthrough */ - case BPF_S_RET_A: + case BPF_RET | BPF_A: if (seen_or_pass0) { if (i != flen - 1) { emit_jump(cleanup_addr); @@ -573,18 +565,18 @@ void bpf_jit_compile(struct sk_filter *fp) emit_jmpl(r_saved_O7, 8, G0); emit_reg_move(r_A, O0); /* delay slot */ break; - case BPF_S_MISC_TAX: + case BPF_MISC | BPF_TAX: seen |= SEEN_XREG; emit_reg_move(r_A, r_X); break; - case BPF_S_MISC_TXA: + case BPF_MISC | BPF_TXA: seen |= SEEN_XREG; emit_reg_move(r_X, r_A); break; - case BPF_S_ANC_CPU: + case BPF_ANC | SKF_AD_CPU: emit_load_cpu(r_A); break; - case BPF_S_ANC_PROTOCOL: + case BPF_ANC | SKF_AD_PROTOCOL: emit_skb_load16(protocol, r_A); break; #if 0 @@ -592,38 +584,38 @@ void bpf_jit_compile(struct sk_filter *fp) * a bit field even though we very much * know what we are doing here. */ - case BPF_S_ANC_PKTTYPE: + case BPF_ANC | SKF_AD_PKTTYPE: __emit_skb_load8(pkt_type, r_A); emit_alu_K(SRL, 5); break; #endif - case BPF_S_ANC_IFINDEX: + case BPF_ANC | SKF_AD_IFINDEX: emit_skb_loadptr(dev, r_A); emit_cmpi(r_A, 0); emit_branch(BNE_PTR, cleanup_addr + 4); emit_nop(); emit_load32(r_A, struct net_device, ifindex, r_A); break; - case BPF_S_ANC_MARK: + case BPF_ANC | SKF_AD_MARK: emit_skb_load32(mark, r_A); break; - case BPF_S_ANC_QUEUE: + case BPF_ANC | SKF_AD_QUEUE: emit_skb_load16(queue_mapping, r_A); break; - case BPF_S_ANC_HATYPE: + case BPF_ANC | SKF_AD_HATYPE: emit_skb_loadptr(dev, r_A); emit_cmpi(r_A, 0); emit_branch(BNE_PTR, cleanup_addr + 4); emit_nop(); emit_load16(r_A, struct net_device, type, r_A); break; - case BPF_S_ANC_RXHASH: + case BPF_ANC | SKF_AD_RXHASH: emit_skb_load32(hash, r_A); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: emit_skb_load16(vlan_tci, r_A); - if (filter[i].code == BPF_S_ANC_VLAN_TAG) { + if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) { emit_andi(r_A, VLAN_VID_MASK, r_A); } else { emit_loadimm(VLAN_TAG_PRESENT, r_TMP); @@ -631,44 +623,44 @@ void bpf_jit_compile(struct sk_filter *fp) } break; - case BPF_S_LD_IMM: + case BPF_LD | BPF_IMM: emit_loadimm(K, r_A); break; - case BPF_S_LDX_IMM: + case BPF_LDX | BPF_IMM: emit_loadimm(K, r_X); break; - case BPF_S_LD_MEM: + case BPF_LD | BPF_MEM: emit_ldmem(K * 4, r_A); break; - case BPF_S_LDX_MEM: + case BPF_LDX | BPF_MEM: emit_ldmem(K * 4, r_X); break; - case BPF_S_ST: + case BPF_ST: emit_stmem(K * 4, r_A); break; - case BPF_S_STX: + case BPF_STX: emit_stmem(K * 4, r_X); break; #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative_offset : func) : func##_positive_offset) - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_word); common_load: seen |= SEEN_DATAREF; emit_loadimm(K, r_OFF); emit_call(func); break; - case BPF_S_LD_H_ABS: + case BPF_LD | BPF_H | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_half); goto common_load; - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_B | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte); goto common_load; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte_msh); goto common_load; - case BPF_S_LD_W_IND: + case BPF_LD | BPF_W | BPF_IND: func = bpf_jit_load_word; common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; if (K) { @@ -683,13 +675,13 @@ common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; } emit_call(func); break; - case BPF_S_LD_H_IND: + case BPF_LD | BPF_H | BPF_IND: func = bpf_jit_load_half; goto common_load_ind; - case BPF_S_LD_B_IND: + case BPF_LD | BPF_B | BPF_IND: func = bpf_jit_load_byte; goto common_load_ind; - case BPF_S_JMP_JA: + case BPF_JMP | BPF_JA: emit_jump(addrs[i + K]); emit_nop(); break; @@ -700,14 +692,14 @@ common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; f_op = FOP; \ goto cond_branch - COND_SEL(BPF_S_JMP_JGT_K, BGU, BLEU); - COND_SEL(BPF_S_JMP_JGE_K, BGEU, BLU); - COND_SEL(BPF_S_JMP_JEQ_K, BE, BNE); - COND_SEL(BPF_S_JMP_JSET_K, BNE, BE); - COND_SEL(BPF_S_JMP_JGT_X, BGU, BLEU); - COND_SEL(BPF_S_JMP_JGE_X, BGEU, BLU); - COND_SEL(BPF_S_JMP_JEQ_X, BE, BNE); - COND_SEL(BPF_S_JMP_JSET_X, BNE, BE); + COND_SEL(BPF_JMP | BPF_JGT | BPF_K, BGU, BLEU); + COND_SEL(BPF_JMP | BPF_JGE | BPF_K, BGEU, BLU); + COND_SEL(BPF_JMP | BPF_JEQ | BPF_K, BE, BNE); + COND_SEL(BPF_JMP | BPF_JSET | BPF_K, BNE, BE); + COND_SEL(BPF_JMP | BPF_JGT | BPF_X, BGU, BLEU); + COND_SEL(BPF_JMP | BPF_JGE | BPF_X, BGEU, BLU); + COND_SEL(BPF_JMP | BPF_JEQ | BPF_X, BE, BNE); + COND_SEL(BPF_JMP | BPF_JSET | BPF_X, BNE, BE); cond_branch: f_offset = addrs[i + filter[i].jf]; t_offset = addrs[i + filter[i].jt]; @@ -719,20 +711,20 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; break; } - switch (filter[i].code) { - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JEQ_X: + switch (code) { + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JEQ | BPF_X: seen |= SEEN_XREG; emit_cmp(r_A, r_X); break; - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_X: seen |= SEEN_XREG; emit_btst(r_A, r_X); break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGE_K: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: if (is_simm13(K)) { emit_cmpi(r_A, K); } else { @@ -740,7 +732,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; emit_cmp(r_A, r_TMP); } break; - case BPF_S_JMP_JSET_K: + case BPF_JMP | BPF_JSET | BPF_K: if (is_simm13(K)) { emit_btsti(r_A, K); } else { diff --git a/include/linux/filter.h b/include/linux/filter.h index 625f4de9bdf2..49ef7a298c92 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -197,7 +197,6 @@ int sk_detach_filter(struct sock *sk); int sk_chk_filter(struct sock_filter *filter, unsigned int flen); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); -void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to); void sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); @@ -205,6 +204,41 @@ void 
sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct sk_filter *fp); +#define BPF_ANC BIT(15) + +static inline u16 bpf_anc_helper(const struct sock_filter *ftest) +{ + BUG_ON(ftest->code & BPF_ANC); + + switch (ftest->code) { + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: +#define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ + return BPF_ANC | SKF_AD_##CODE + switch (ftest->k) { + BPF_ANCILLARY(PROTOCOL); + BPF_ANCILLARY(PKTTYPE); + BPF_ANCILLARY(IFINDEX); + BPF_ANCILLARY(NLATTR); + BPF_ANCILLARY(NLATTR_NEST); + BPF_ANCILLARY(MARK); + BPF_ANCILLARY(QUEUE); + BPF_ANCILLARY(HATYPE); + BPF_ANCILLARY(RXHASH); + BPF_ANCILLARY(CPU); + BPF_ANCILLARY(ALU_XOR_X); + BPF_ANCILLARY(VLAN_TAG); + BPF_ANCILLARY(VLAN_TAG_PRESENT); + BPF_ANCILLARY(PAY_OFFSET); + BPF_ANCILLARY(RANDOM); + } + /* Fallthrough. */ + default: + return ftest->code; + } +} + #ifdef CONFIG_BPF_JIT #include #include @@ -224,86 +258,20 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, } #else #include + static inline void bpf_jit_compile(struct sk_filter *fp) { } + static inline void bpf_jit_free(struct sk_filter *fp) { kfree(fp); } -#endif +#endif /* CONFIG_BPF_JIT */ static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } -enum { - BPF_S_RET_K = 1, - BPF_S_RET_A, - BPF_S_ALU_ADD_K, - BPF_S_ALU_ADD_X, - BPF_S_ALU_SUB_K, - BPF_S_ALU_SUB_X, - BPF_S_ALU_MUL_K, - BPF_S_ALU_MUL_X, - BPF_S_ALU_DIV_X, - BPF_S_ALU_MOD_K, - BPF_S_ALU_MOD_X, - BPF_S_ALU_AND_K, - BPF_S_ALU_AND_X, - BPF_S_ALU_OR_K, - BPF_S_ALU_OR_X, - BPF_S_ALU_XOR_K, - BPF_S_ALU_XOR_X, - BPF_S_ALU_LSH_K, - BPF_S_ALU_LSH_X, - BPF_S_ALU_RSH_K, - BPF_S_ALU_RSH_X, - BPF_S_ALU_NEG, - BPF_S_LD_W_ABS, - BPF_S_LD_H_ABS, - BPF_S_LD_B_ABS, - BPF_S_LD_W_LEN, - BPF_S_LD_W_IND, - BPF_S_LD_H_IND, - BPF_S_LD_B_IND, - BPF_S_LD_IMM, - BPF_S_LDX_W_LEN, - BPF_S_LDX_B_MSH, - BPF_S_LDX_IMM, - BPF_S_MISC_TAX, - BPF_S_MISC_TXA, - BPF_S_ALU_DIV_K, - BPF_S_LD_MEM, - BPF_S_LDX_MEM, - BPF_S_ST, - BPF_S_STX, - BPF_S_JMP_JA, - BPF_S_JMP_JEQ_K, - BPF_S_JMP_JEQ_X, - BPF_S_JMP_JGE_K, - BPF_S_JMP_JGE_X, - BPF_S_JMP_JGT_K, - BPF_S_JMP_JGT_X, - BPF_S_JMP_JSET_K, - BPF_S_JMP_JSET_X, - /* Ancillary data */ - BPF_S_ANC_PROTOCOL, - BPF_S_ANC_PKTTYPE, - BPF_S_ANC_IFINDEX, - BPF_S_ANC_NLATTR, - BPF_S_ANC_NLATTR_NEST, - BPF_S_ANC_MARK, - BPF_S_ANC_QUEUE, - BPF_S_ANC_HATYPE, - BPF_S_ANC_RXHASH, - BPF_S_ANC_CPU, - BPF_S_ANC_ALU_XOR_X, - BPF_S_ANC_VLAN_TAG, - BPF_S_ANC_VLAN_TAG_PRESENT, - BPF_S_ANC_PAY_OFFSET, - BPF_S_ANC_RANDOM, -}; - #endif /* __LINUX_FILTER_H__ */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1036b6f2fded..44e69483b383 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -103,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) u32 k = ftest->k; switch (code) { - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; - case BPF_S_LD_W_LEN: + case BPF_LD | BPF_W | BPF_LEN: ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; - case BPF_S_LDX_W_LEN: + case BPF_LDX | BPF_W | BPF_LEN: ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. 
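Each opcode listed below is passed through unchanged via the continue statement; any opcode that is not listed falls through to the default case and is rejected with -EINVAL.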
*/ - case BPF_S_RET_K: - case BPF_S_RET_A: - case BPF_S_ALU_ADD_K: - case BPF_S_ALU_ADD_X: - case BPF_S_ALU_SUB_K: - case BPF_S_ALU_SUB_X: - case BPF_S_ALU_MUL_K: - case BPF_S_ALU_MUL_X: - case BPF_S_ALU_DIV_X: - case BPF_S_ALU_AND_K: - case BPF_S_ALU_AND_X: - case BPF_S_ALU_OR_K: - case BPF_S_ALU_OR_X: - case BPF_S_ALU_XOR_K: - case BPF_S_ALU_XOR_X: - case BPF_S_ALU_LSH_K: - case BPF_S_ALU_LSH_X: - case BPF_S_ALU_RSH_K: - case BPF_S_ALU_RSH_X: - case BPF_S_ALU_NEG: - case BPF_S_LD_IMM: - case BPF_S_LDX_IMM: - case BPF_S_MISC_TAX: - case BPF_S_MISC_TXA: - case BPF_S_ALU_DIV_K: - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - case BPF_S_JMP_JA: - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_K: - case BPF_S_JMP_JSET_X: - sk_decode_filter(ftest, ftest); + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + case BPF_MISC | BPF_TAX: + case BPF_MISC | BPF_TXA: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: continue; default: return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 2c2d35d9d101..328aaf6ff4d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -536,11 +536,13 @@ load_word: * Output: * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness */ + ptr = load_pointer((struct sk_buff *) ctx, off, 4, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be32(ptr); CONT; } + return 0; LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + K)) */ off = K; @@ -550,6 +552,7 @@ load_half: BPF_R0 = get_unaligned_be16(ptr); CONT; } + return 0; LD_ABS_B: /* BPF_R0 = *(u8 *) (ctx + K) */ off = K; @@ -559,6 +562,7 @@ load_byte: BPF_R0 = *(u8 *)ptr; CONT; } + return 0; LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + X + K)) */ off = K + X; @@ -1136,44 +1140,46 @@ err: */ static int check_load_and_stores(struct sock_filter *filter, int flen) { - u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); + masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { - case BPF_S_ST: - case BPF_S_STX: + case BPF_ST: + case BPF_STX: memvalid |= (1 << filter[pc].k); break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: if (!(memvalid & 
(1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; - case BPF_S_JMP_JA: - /* a jump must set masks on target */ + case BPF_JMP | BPF_JA: + /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* a jump must set masks on targets */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; @@ -1185,6 +1191,72 @@ error: return ret; } +static bool chk_code_allowed(u16 code_to_probe) +{ + static const bool codes[] = { + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_K] = true, + [BPF_ALU | BPF_ADD | BPF_X] = true, + [BPF_ALU | BPF_SUB | BPF_K] = true, + [BPF_ALU | BPF_SUB | BPF_X] = true, + [BPF_ALU | BPF_MUL | BPF_K] = true, + [BPF_ALU | BPF_MUL | BPF_X] = true, + [BPF_ALU | BPF_DIV | BPF_K] = true, + [BPF_ALU | BPF_DIV | BPF_X] = true, + [BPF_ALU | BPF_MOD | BPF_K] = true, + [BPF_ALU | BPF_MOD | BPF_X] = true, + [BPF_ALU | BPF_AND | BPF_K] = true, + [BPF_ALU | BPF_AND | BPF_X] = true, + [BPF_ALU | BPF_OR | BPF_K] = true, + [BPF_ALU | BPF_OR | BPF_X] = true, + [BPF_ALU | BPF_XOR | BPF_K] = true, + [BPF_ALU | BPF_XOR | BPF_X] = true, + [BPF_ALU | BPF_LSH | BPF_K] = true, + [BPF_ALU | BPF_LSH | BPF_X] = true, + [BPF_ALU | BPF_RSH | BPF_K] = true, + [BPF_ALU | BPF_RSH | BPF_X] = true, + [BPF_ALU | BPF_NEG] = true, + /* Load instructions */ + [BPF_LD | BPF_W | BPF_ABS] = true, + [BPF_LD | BPF_H | BPF_ABS] = true, + [BPF_LD | BPF_B | BPF_ABS] = true, + [BPF_LD | BPF_W | BPF_LEN] = true, + [BPF_LD | BPF_W | BPF_IND] = true, + [BPF_LD | BPF_H | BPF_IND] = true, + [BPF_LD | BPF_B | BPF_IND] = true, + [BPF_LD | BPF_IMM] = true, + [BPF_LD | BPF_MEM] = true, + [BPF_LDX | BPF_W | BPF_LEN] = true, + [BPF_LDX | BPF_B | BPF_MSH] = true, + [BPF_LDX | BPF_IMM] = true, + [BPF_LDX | BPF_MEM] = true, + /* Store instructions */ + [BPF_ST] = true, + [BPF_STX] = true, + /* Misc instructions */ + [BPF_MISC | BPF_TAX] = true, + [BPF_MISC | BPF_TXA] = true, + /* Return instructions */ + [BPF_RET | BPF_K] = true, + [BPF_RET | BPF_A] = true, + /* Jump instructions */ + [BPF_JMP | BPF_JA] = true, + [BPF_JMP | BPF_JEQ | BPF_K] = true, + [BPF_JMP | BPF_JEQ | BPF_X] = true, + [BPF_JMP | BPF_JGE | BPF_K] = true, + [BPF_JMP | BPF_JGE | BPF_X] = true, + [BPF_JMP | BPF_JGT | BPF_K] = true, + [BPF_JMP | BPF_JGT | BPF_X] = true, + [BPF_JMP | BPF_JSET | BPF_K] = true, + [BPF_JMP | BPF_JSET | BPF_X] = true, + }; + + if (code_to_probe >= ARRAY_SIZE(codes)) + return false; + + return codes[code_to_probe]; +} + /** * sk_chk_filter - verify socket filter code * @filter: filter to verify @@ -1201,154 +1273,76 @@ error: */ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) { - /* - * Valid instructions are initialized to non-0. - * Invalid instructions are initialized to 0. 
- */ - static const u8 codes[] = { - [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, - [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, - [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, - [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, - [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, - [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, - [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, - [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K, - [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X, - [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, - [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, - [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, - [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, - [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K, - [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X, - [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, - [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, - [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, - [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, - [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, - [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, - [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, - [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, - [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, - [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, - [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, - [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, - [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, - [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, - [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, - [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, - [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, - [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, - [BPF_RET|BPF_K] = BPF_S_RET_K, - [BPF_RET|BPF_A] = BPF_S_RET_A, - [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, - [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, - [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, - [BPF_ST] = BPF_S_ST, - [BPF_STX] = BPF_S_STX, - [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, - [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, - [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, - [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, - [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, - [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, - [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, - [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, - [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, - }; - int pc; bool anc_found; + int pc; if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; - /* check the filter code now */ + /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { struct sock_filter *ftest = &filter[pc]; - u16 code = ftest->code; - if (code >= ARRAY_SIZE(codes)) - return -EINVAL; - code = codes[code]; - if (!code) + /* May we actually operate on this code? */ + if (!chk_code_allowed(ftest->code)) return -EINVAL; + /* Some instructions need special checks */ - switch (code) { - case BPF_S_ALU_DIV_K: - case BPF_S_ALU_MOD_K: - /* check for division by zero */ + switch (ftest->code) { + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - /* check for invalid memory addresses */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; - case BPF_S_JMP_JA: - /* - * Note, the large ftest->k might cause loops. + case BPF_JMP | BPF_JA: + /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. 
--ANK (981016) */ - if (ftest->k >= (unsigned int)(flen-pc-1)) + if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* for conditionals both must be safe */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: anc_found = false; -#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ - code = BPF_S_ANC_##CODE; \ - anc_found = true; \ - break - switch (ftest->k) { - ANCILLARY(PROTOCOL); - ANCILLARY(PKTTYPE); - ANCILLARY(IFINDEX); - ANCILLARY(NLATTR); - ANCILLARY(NLATTR_NEST); - ANCILLARY(MARK); - ANCILLARY(QUEUE); - ANCILLARY(HATYPE); - ANCILLARY(RXHASH); - ANCILLARY(CPU); - ANCILLARY(ALU_XOR_X); - ANCILLARY(VLAN_TAG); - ANCILLARY(VLAN_TAG_PRESENT); - ANCILLARY(PAY_OFFSET); - ANCILLARY(RANDOM); - } - - /* ancillary operation unknown or unsupported */ + if (bpf_anc_helper(ftest) & BPF_ANC) + anc_found = true; + /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } - ftest->code = code; } - /* last instruction must be a RET code */ + /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { - case BPF_S_RET_K: - case BPF_S_RET_A: + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } + return -EINVAL; } EXPORT_SYMBOL(sk_chk_filter); @@ -1448,7 +1442,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, { struct sock_filter *old_prog; struct sk_filter *old_fp; - int i, err, new_len, old_len = fp->len; + int err, new_len, old_len = fp->len; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1458,13 +1452,6 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct sock_filter_int)); - /* For now, we need to unfiddle BPF_S_* identifiers in place. - * This can sooner or later on be subject to removal, e.g. when - * JITs have been converted. - */ - for (i = 0; i < fp->len; i++) - sk_decode_filter(&fp->insns[i], &fp->insns[i]); - /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. 
@@ -1706,84 +1693,6 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) -{ - static const u16 decodes[] = { - [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K, - [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X, - [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K, - [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X, - [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K, - [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X, - [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X, - [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K, - [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X, - [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K, - [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X, - [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K, - [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X, - [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K, - [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X, - [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K, - [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X, - [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K, - [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X, - [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG, - [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS, - [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS, - [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_RANDOM] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, - [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, - [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, - [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND, - [BPF_S_LD_IMM] = BPF_LD|BPF_IMM, - [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN, - [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH, - [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM, - [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX, - [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA, - [BPF_S_RET_K] = BPF_RET|BPF_K, - [BPF_S_RET_A] = BPF_RET|BPF_A, - [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K, - [BPF_S_LD_MEM] = BPF_LD|BPF_MEM, - [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM, - [BPF_S_ST] = BPF_ST, - [BPF_S_STX] = BPF_STX, - [BPF_S_JMP_JA] = BPF_JMP|BPF_JA, - [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K, - [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X, - [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K, - [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X, - [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K, - [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X, - [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K, - [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X, - }; - u16 code; - - code = filt->code; - - to->code = decodes[code]; - to->jt = filt->jt; - to->jf = filt->jf; - to->k = filt->k; -} - int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) { -- cgit v1.2.3-71-gd317 From f8f6d679aaa78b989d9aee8d2935066fbdca2a30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 May 2014 10:22:51 +0200 Subject: net: filter: improve filter block macros Commit 9739eef13c92 ("net: filter: make BPF conversion more readable") started to introduce helper macros similar to BPF_STMT()/BPF_JUMP() macros from classic 
BPF. However, quite a few statements in the filter conversion functions remained in the old style, which left a mixture of block macros and non-block macros in the code. This patch makes the block macros themselves more readable by using explicit member initialization, and converts the remaining ones where possible, leaving the code in a more consistent state. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 255 +++++++++++++++++++++++++++++++++++++++---------- net/core/filter.c | 196 ++++++++++++++----------------------- 2 files changed, 277 insertions(+), 174 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 49ef7a298c92..f0c2ad43b4af 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -76,56 +76,211 @@ enum { /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -/* bpf_add|sub|...: a += x, bpf_mov: a = x */ -#define BPF_ALU64_REG(op, a, x) \ - ((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_X, a, x, 0, 0}) -#define BPF_ALU32_REG(op, a, x) \ - ((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_X, a, x, 0, 0}) - -/* bpf_add|sub|...: a += imm, bpf_mov: a = imm */ -#define BPF_ALU64_IMM(op, a, imm) \ - ((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_K, a, 0, 0, imm}) -#define BPF_ALU32_IMM(op, a, imm) \ - ((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_K, a, 0, 0, imm}) - -/* R0 = *(uint *) (skb->data + off) */ -#define BPF_LD_ABS(size, off) \ - ((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_ABS, 0, 0, 0, off}) - -/* R0 = *(uint *) (skb->data + x + off) */ -#define BPF_LD_IND(size, x, off) \ - ((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_IND, 0, x, 0, off}) - -/* a = *(uint *) (x + off) */ -#define BPF_LDX_MEM(sz, a, x, off) \ - ((struct sock_filter_int) {BPF_LDX|BPF_SIZE(sz)|BPF_MEM, a, x, off, 0}) - -/* if (a 'op' x) goto pc+off */ -#define BPF_JMP_REG(op, a, x, off) \ - ((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_X, a, x, off, 0}) - -/* if (a 'op' imm) goto pc+off */ -#define BPF_JMP_IMM(op, a, imm, off) \ - ((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_K, a, 0, off, imm}) - -#define BPF_EXIT_INSN() \ - ((struct sock_filter_int) {BPF_JMP|BPF_EXIT, 0, 0, 0, 0}) - -static inline int size_to_bpf(int size) -{ - switch (size) { - case 1: - return BPF_B; - case 2: - return BPF_H; - case 4: - return BPF_W; - case 8: - return BPF_DW; - default: - return -EINVAL; - } -} +/* Helper macros for filter block array initializers.
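As a rough usage sketch (illustrative only, not taken verbatim from the kernel sources), a two-instruction program that hands its first argument back in the return register could be written as: struct sock_filter_int prog[] = { BPF_MOV64_REG(BPF_REG_0, BPF_REG_ARG1), BPF_EXIT_INSN() }; each macro expands to a compound literal whose members are all named explicitly.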
*/ + +/* ALU ops on registers, bpf_add|sub|...: A += X */ + +#define BPF_ALU64_REG(OP, A, X) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .a_reg = A, \ + .x_reg = X, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, A, X) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .a_reg = A, \ + .x_reg = X, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: A += IMM */ + +#define BPF_ALU64_IMM(OP, A, IMM) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .a_reg = A, \ + .x_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, A, IMM) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .a_reg = A, \ + .x_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ + +#define BPF_ENDIAN(TYPE, A, LEN) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ + .a_reg = A, \ + .x_reg = 0, \ + .off = 0, \ + .imm = LEN }) + +/* Short form of mov, A = X */ + +#define BPF_MOV64_REG(A, X) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .a_reg = A, \ + .x_reg = X, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_REG(A, X) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .a_reg = A, \ + .x_reg = X, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, A = IMM */ + +#define BPF_MOV64_IMM(A, IMM) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .a_reg = A, \ + .x_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV32_IMM(A, IMM) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .a_reg = A, \ + .x_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Short form of mov based on type, BPF_X: A = X, BPF_K: A = IMM */ + +#define BPF_MOV64_RAW(TYPE, A, X, IMM) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ + .a_reg = A, \ + .x_reg = X, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV32_RAW(TYPE, A, X, IMM) \ + ((struct sock_filter_int) { \ + .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ + .a_reg = A, \ + .x_reg = X, \ + .off = 0, \ + .imm = IMM }) + +/* Direct packet access, R0 = *(uint *) (skb->data + OFF) */ + +#define BPF_LD_ABS(SIZE, OFF) \ + ((struct sock_filter_int) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .a_reg = 0, \ + .x_reg = 0, \ + .off = 0, \ + .imm = OFF }) + +/* Indirect packet access, R0 = *(uint *) (skb->data + X + OFF) */ + +#define BPF_LD_IND(SIZE, X, OFF) \ + ((struct sock_filter_int) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ + .a_reg = 0, \ + .x_reg = X, \ + .off = 0, \ + .imm = OFF }) + +/* Memory store, A = *(uint *) (X + OFF), and vice versa */ + +#define BPF_LDX_MEM(SIZE, A, X, OFF) \ + ((struct sock_filter_int) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .a_reg = A, \ + .x_reg = X, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_STX_MEM(SIZE, A, X, OFF) \ + ((struct sock_filter_int) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .a_reg = A, \ + .x_reg = X, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against registers, if (A 'op' X) goto pc + OFF */ + +#define BPF_JMP_REG(OP, A, X, OFF) \ + ((struct sock_filter_int) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .a_reg = A, \ + .x_reg = X, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (A 'op' IMM) goto pc + OFF */ + +#define BPF_JMP_IMM(OP, A, IMM, OFF) \ + ((struct sock_filter_int) { 
\ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .a_reg = A, \ + .x_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Function call */ + +#define BPF_EMIT_CALL(FUNC) \ + ((struct sock_filter_int) { \ + .code = BPF_JMP | BPF_CALL, \ + .a_reg = 0, \ + .x_reg = 0, \ + .off = 0, \ + .imm = ((FUNC) - __bpf_call_base) }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, A, X, OFF, IMM) \ + ((struct sock_filter_int) { \ + .code = CODE, \ + .a_reg = A, \ + .x_reg = X, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct sock_filter_int) { \ + .code = BPF_JMP | BPF_EXIT, \ + .a_reg = 0, \ + .x_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#define bytes_to_bpf_size(bytes) \ +({ \ + int bpf_size = -EINVAL; \ + \ + if (bytes == sizeof(u8)) \ + bpf_size = BPF_B; \ + else if (bytes == sizeof(u16)) \ + bpf_size = BPF_H; \ + else if (bytes == sizeof(u32)) \ + bpf_size = BPF_W; \ + else if (bytes == sizeof(u64)) \ + bpf_size = BPF_DW; \ + \ + bpf_size; \ +}) /* Macro to invoke filter function. */ #define SK_RUN_FILTER(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) diff --git a/net/core/filter.c b/net/core/filter.c index 328aaf6ff4d1..842f8393121d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -672,14 +672,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); /* A = *(u16 *) (ctx + offsetof(protocol)) */ - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, protocol)); - insn++; - + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ - insn->code = BPF_ALU | BPF_END | BPF_FROM_BE; - insn->a_reg = BPF_REG_A; - insn->imm = 16; + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PKTTYPE: @@ -688,37 +684,27 @@ static bool convert_bpf_extensions(struct sock_filter *fp, if (insn->off < 0) return false; insn++; - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: - *insn = BPF_LDX_MEM(size_to_bpf(FIELD_SIZEOF(struct sk_buff, dev)), - BPF_REG_TMP, BPF_REG_CTX, - offsetof(struct sk_buff, dev)); - insn++; - - /* if (tmp != 0) goto pc+1 */ - *insn = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); - insn++; - - *insn = BPF_EXIT_INSN(); - insn++; - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_TMP; - - if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) { - insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->off = offsetof(struct net_device, ifindex); - } else { - insn->code = BPF_LDX | BPF_MEM | BPF_H; - insn->off = offsetof(struct net_device, type); - } + BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); + + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, dev)); + /* if (tmp != 0) goto pc + 1 */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); + *insn++ = BPF_EXIT_INSN(); + if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, ifindex)); + else + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, type)); break; case SKF_AD_OFF + SKF_AD_MARK: @@ -745,22 +731,17 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_VLAN_TAG: 
case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - - /* A = *(u16 *) (ctx + offsetof(vlan_tci)) */ - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, vlan_tci)); - insn++; - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + /* A = *(u16 *) (ctx + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_tci)); if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, ~VLAN_TAG_PRESENT); } else { /* A >>= 12 */ - *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); - insn++; - + *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); /* A &= 1 */ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); } @@ -772,34 +753,27 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = ctx */ - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX); - insn++; - + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG2, BPF_REG_A); - insn++; - + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X); - insn++; - + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); /* Emit call(ctx, arg2=A, arg3=X) */ - insn->code = BPF_JMP | BPF_CALL; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: - insn->imm = __skb_get_pay_offset - __bpf_call_base; + *insn = BPF_EMIT_CALL(__skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: - insn->imm = __skb_get_nlattr - __bpf_call_base; + *insn = BPF_EMIT_CALL(__skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: - insn->imm = __skb_get_nlattr_nest - __bpf_call_base; + *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: - insn->imm = __get_raw_cpu_id - __bpf_call_base; + *insn = BPF_EMIT_CALL(__get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: - insn->imm = __get_random_u32 - __bpf_call_base; + *insn = BPF_EMIT_CALL(__get_random_u32); break; } break; @@ -871,9 +845,8 @@ do_pass: new_insn = new_prog; fp = prog; - if (new_insn) { - *new_insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_CTX, BPF_REG_ARG1); - } + if (new_insn) + *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); new_insn++; for (i = 0; i < len; fp++, i++) { @@ -921,17 +894,16 @@ do_pass: convert_bpf_extensions(fp, &insn)) break; - insn->code = fp->code; - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_X; - insn->imm = fp->k; + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; - /* Jump opcodes map as-is, but offsets need adjustment. */ - case BPF_JMP | BPF_JA: - target = i + fp->k + 1; - insn->code = fp->code; -#define EMIT_JMP \ + /* Jump transformation cannot use BPF block macros + * everywhere as offset calculation and target updates + * require a bit more work than the rest, i.e. jump + * opcodes map as-is, but offsets need adjustment. + */ + +#define BPF_EMIT_JMP \ do { \ if (target >= len || target < 0) \ goto err; \ @@ -940,7 +912,10 @@ do_pass: insn->off -= insn - tmp_insns; \ } while (0) - EMIT_JMP; + case BPF_JMP | BPF_JA: + target = i + fp->k + 1; + insn->code = fp->code; + BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -956,10 +931,7 @@ do_pass: * immediate into tmp register and use it * in compare insn. 
*/ - insn->code = BPF_ALU | BPF_MOV | BPF_K; - insn->a_reg = BPF_REG_TMP; - insn->imm = fp->k; - insn++; + *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->a_reg = BPF_REG_A; insn->x_reg = BPF_REG_TMP; @@ -975,7 +947,7 @@ do_pass: if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; - EMIT_JMP; + BPF_EMIT_JMP; break; } @@ -983,116 +955,94 @@ do_pass: if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { insn->code = BPF_JMP | BPF_JNE | bpf_src; target = i + fp->jf + 1; - EMIT_JMP; + BPF_EMIT_JMP; break; } /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; - EMIT_JMP; + BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; - EMIT_JMP; + BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: /* tmp = A */ - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_TMP, BPF_REG_A); - insn++; - + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn = BPF_LD_ABS(BPF_B, fp->k); - insn++; - + *insn++ = BPF_LD_ABS(BPF_B, fp->k); /* A &= 0xf */ - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); - insn++; - + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ - *insn = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); - insn++; - + *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* X = A */ - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_X, BPF_REG_A); - insn++; - + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_A, BPF_REG_TMP); + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; /* RET_K, RET_A are remapped into 2 insns. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: - insn->code = BPF_ALU | BPF_MOV | - (BPF_RVAL(fp->code) == BPF_K ? - BPF_K : BPF_X); - insn->a_reg = 0; - insn->x_reg = BPF_REG_A; - insn->imm = fp->k; - insn++; - + *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? + BPF_K : BPF_X, BPF_REG_0, + BPF_REG_A, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: - insn->code = BPF_STX | BPF_MEM | BPF_W; - insn->a_reg = BPF_REG_FP; - insn->x_reg = fp->code == BPF_ST ? - BPF_REG_A : BPF_REG_X; - insn->off = -(BPF_MEMWORDS - fp->k) * 4; + *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == + BPF_ST ? BPF_REG_A : BPF_REG_X, + -(BPF_MEMWORDS - fp->k) * 4); break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: - insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? - BPF_REG_A : BPF_REG_X; - insn->x_reg = BPF_REG_FP; - insn->off = -(BPF_MEMWORDS - fp->k) * 4; + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_FP, + -(BPF_MEMWORDS - fp->k) * 4); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: - insn->code = BPF_ALU | BPF_MOV | BPF_K; - insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? - BPF_REG_A : BPF_REG_X; - insn->imm = fp->k; + *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 
+ BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_X, BPF_REG_A); + *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: - *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_A, BPF_REG_X); + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: - insn->code = BPF_LDX | BPF_MEM | BPF_W; - insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? - BPF_REG_A : BPF_REG_X; - insn->x_reg = BPF_REG_CTX; - insn->off = offsetof(struct sk_buff, len); + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_CTX, + offsetof(struct sk_buff, len)); break; - /* access seccomp_data fields */ + /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; + /* Unknown instruction. */ default: goto err; } @@ -1101,7 +1051,6 @@ do_pass: if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); - new_insn += insn - tmp_insns; } @@ -1116,7 +1065,6 @@ do_pass: new_flen = new_insn - new_prog; if (pass > 2) goto err; - goto do_pass; } -- cgit v1.2.3-71-gd317 From 96b2e73c5471542cb9c622c4360716684f8797ed Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jun 2014 00:18:48 -0700 Subject: Revert "net/mlx4_en: Use affinity hint" This reverts commit 70a640d0dae3a9b1b222ce673eb5d92c263ddd61. Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 6 +----- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 30 -------------------------- drivers/net/ethernet/mellanox/mlx4/eq.c | 13 +---------- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 - include/linux/mlx4/device.h | 2 +- 6 files changed, 4 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 58b1f239ac2b..199c7896f081 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1897,7 +1897,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) i, j, dev->pdev->bus->name); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, NULL, - &ibdev->eq_table[eq], NULL)) { + &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index ea2cd72e5368..636963db598a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -118,15 +118,11 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, if (cq->is_tx == RX) { if (mdev->dev->caps.comp_pool) { if (!cq->vector) { - struct mlx4_en_rx_ring *ring = - priv->rx_ring[cq->ring]; - sprintf(name, "%s-%d", priv->dev->name, cq->ring); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(mdev->dev, name, rmap, - &cq->vector, - ring->affinity_mask)) { + &cq->vector)) { cq->vector = (cq->ring + 1 + priv->port) % mdev->dev->caps.num_comp_vectors; mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n", diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 05d135572abc..58209bd0c94c 100644 --- 
a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1526,32 +1526,6 @@ static void mlx4_en_linkstate(struct work_struct *work) mutex_unlock(&mdev->state_lock); } -static void mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) -{ - struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx]; - int numa_node = priv->mdev->dev->numa_node; - - if (numa_node == -1) - return; - - if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL)) { - en_err(priv, "Failed to allocate core mask\n"); - return; - } - - if (cpumask_set_cpu_local_first(ring_idx, numa_node, - ring->affinity_mask)) { - en_err(priv, "Failed setting affinity hint\n"); - free_cpumask_var(ring->affinity_mask); - ring->affinity_mask = NULL; - } -} - -static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) -{ - free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask); - priv->rx_ring[ring_idx]->affinity_mask = NULL; -} int mlx4_en_start_port(struct net_device *dev) { @@ -1593,8 +1567,6 @@ int mlx4_en_start_port(struct net_device *dev) mlx4_en_cq_init_lock(cq); - mlx4_en_init_affinity_hint(priv, i); - err = mlx4_en_activate_cq(priv, cq, i); if (err) { en_err(priv, "Failed activating Rx CQ\n"); @@ -1875,8 +1847,6 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) msleep(1); mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); mlx4_en_deactivate_cq(priv, cq); - - mlx4_en_free_affinity_hint(priv, i); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index f91659e5fa13..d954ec1eac17 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -1376,7 +1376,7 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) EXPORT_SYMBOL(mlx4_test_interrupts); int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, - int *vector, cpumask_var_t cpu_hint_mask) + int *vector) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -1411,15 +1411,6 @@ int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, } mlx4_assign_irq_notifier(priv, dev, priv->eq_table.eq[vec].irq); - if (cpu_hint_mask) { - err = irq_set_affinity_hint( - priv->eq_table.eq[vec].irq, - cpu_hint_mask); - if (err) { - mlx4_warn(dev, "Failed setting affinity hint\n"); - /*we dont want to break here*/ - } - } eq_set_ci(&priv->eq_table.eq[vec], 1); } @@ -1450,8 +1441,6 @@ void mlx4_release_eq(struct mlx4_dev *dev, int vec) irq_set_affinity_notifier( priv->eq_table.eq[vec].irq, NULL); - irq_set_affinity_hint(priv->eq_table.eq[vec].irq, - NULL); free_irq(priv->eq_table.eq[vec].irq, &priv->eq_table.eq[vec]); priv->msix_ctl.pool_bm &= ~(1ULL << i); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 0e15295bedd6..b5db1bf361dc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -313,7 +313,6 @@ struct mlx4_en_rx_ring { unsigned long csum_ok; unsigned long csum_none; int hwtstamp_rx_filter; - cpumask_var_t affinity_mask; }; struct mlx4_en_cq { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b9b70e00e3c1..ca38871a585c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1161,7 +1161,7 @@ int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_test_interrupts(struct mlx4_dev *dev); int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, - 
int *vector, cpumask_t *cpu_hint_mask); + int *vector); void mlx4_release_eq(struct mlx4_dev *dev, int vec); int mlx4_get_phys_port_id(struct mlx4_dev *dev); -- cgit v1.2.3-71-gd317 From 670e5b8eaf85704742bc3cb1df51fdd3ce08fc15 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 28 May 2014 18:44:46 -0700 Subject: net: Add support for device specific address syncing This change provides a function to be used in order to break the ndo_set_rx_mode call into a set of address add and remove calls. The code is based on the implementation of dev_uc_sync/dev_mc_sync. Since they essentially do the same thing but with only one dev, I simply named my functions __dev_uc_sync/__dev_mc_sync. I also implemented unsync versions of the functions to allow for cleanup on close. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 73 ++++++++++++++++++++++++++++++++++++++++ net/core/dev_addr_lists.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2db1610bf109..774e5391eb8e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3003,6 +3003,15 @@ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)); +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)); void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ @@ -3023,6 +3032,38 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from); void dev_uc_flush(struct net_device *dev); void dev_uc_init(struct net_device *dev); +/** + * __dev_uc_sync - Synchronize device's unicast list + * @dev: device to sync + * @sync: function to call if address should be added + * @unsync: function to call if address should be removed + * + * Add newly added addresses to the interface, and release + * addresses that have been deleted. + **/ +static inline int __dev_uc_sync(struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + return __hw_addr_sync_dev(&dev->uc, dev, sync, unsync); +} + +/** + * __dev_uc_unsync - Remove synchronized addresses from device + * @dev: device to sync + * @unsync: function to call if address should be removed + * + * Remove all addresses that were added to the device by dev_uc_sync(). 
+ **/ +static inline void __dev_uc_unsync(struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + __hw_addr_unsync_dev(&dev->uc, dev, unsync); +} + /* Functions used for multicast addresses handling */ int dev_mc_add(struct net_device *dev, const unsigned char *addr); int dev_mc_add_global(struct net_device *dev, const unsigned char *addr); @@ -3035,6 +3076,38 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from); void dev_mc_flush(struct net_device *dev); void dev_mc_init(struct net_device *dev); +/** + * __dev_mc_sync - Synchronize device's multicast list + * @dev: device to sync + * @sync: function to call if address should be added + * @unsync: function to call if address should be removed + * + * Add newly added addresses to the interface, and release + * addresses that have been deleted. + **/ +static inline int __dev_mc_sync(struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + return __hw_addr_sync_dev(&dev->mc, dev, sync, unsync); +} + +/** + * __dev_mc_unsync - Remove synchronized addresses from device + * @dev: device to sync + * @unsync: function to call if address should be removed + * + * Remove all addresses that were added to the device by dev_mc_sync(). + **/ +static inline void __dev_mc_unsync(struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + __hw_addr_unsync_dev(&dev->mc, dev, unsync); +} + /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); void __dev_set_rx_mode(struct net_device *dev); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 329d5794e7dc..b6b230600b97 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -225,6 +225,91 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, } EXPORT_SYMBOL(__hw_addr_unsync); +/** + * __hw_addr_sync_dev - Synchronize device's address list + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address should be added + * @unsync: function to call if address should be removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address add/remove + * notifications. The unsync function may be NULL, in which case + * the addresses requiring removal will simply be removed without + * any notification to the device. 
+ **/ +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + int err; + + /* first go through and flush out any stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt || ha->refcount != 1) + continue; + + /* if unsync is defined and fails, defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (ha->sync_cnt) + continue; + + err = sync(dev, ha->addr); + if (err) + return err; + + ha->sync_cnt++; + ha->refcount++; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_sync_dev); + +/** + * __hw_addr_unsync_dev - Remove synchronized addresses from device + * @list: address list to remove synchronized addresses from + * @dev: device to sync + * @unsync: function to call if address should be removed + * + * Remove all addresses that were added to the device by __hw_addr_sync_dev(). + * This function is intended to be called from the ndo_stop or ndo_open + * functions on devices that require explicit address add/remove + * notifications. If the unsync function pointer is NULL then this function + * can be used to just reset the sync_cnt for the addresses in the list. + **/ +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if unsync is defined and fails, defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_unsync_dev); + static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; -- cgit v1.2.3-71-gd317 From 1b49dcf3d7c765ad18ca7167a0e441824eb1f7af Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sun, 16 Mar 2014 14:51:35 -0700 Subject: virtio-scsi.h: Add virtio_scsi_cmd_req_pi + VIRTIO_SCSI_F_T10_PI bits This patch adds a virtio_scsi_cmd_req_pi header as recommended by Paolo that contains pi_bytesout + pi_bytesin elements used for signaling when protection information buffers (in bytes) are expected to precede the data payload buffers. Also add a new VIRTIO_SCSI_F_T10_PI feature bit to be used to signal host support. Cc: Paolo Bonzini Cc: Martin K. Petersen Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Sagi Grimberg Cc: H. Peter Anvin Acked-by: Rusty Russell Acked-by: Michael S. 
Tsirkin Signed-off-by: Nicholas Bellinger --- include/linux/virtio_scsi.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_scsi.h b/include/linux/virtio_scsi.h index 4195b97a3def..de429d1f4357 100644 --- a/include/linux/virtio_scsi.h +++ b/include/linux/virtio_scsi.h @@ -35,11 +35,23 @@ struct virtio_scsi_cmd_req { u8 lun[8]; /* Logical Unit Number */ u64 tag; /* Command identifier */ u8 task_attr; /* Task attribute */ - u8 prio; + u8 prio; /* SAM command priority field */ u8 crn; u8 cdb[VIRTIO_SCSI_CDB_SIZE]; } __packed; +/* SCSI command request, followed by protection information */ +struct virtio_scsi_cmd_req_pi { + u8 lun[8]; /* Logical Unit Number */ + u64 tag; /* Command identifier */ + u8 task_attr; /* Task attribute */ + u8 prio; /* SAM command priority field */ + u8 crn; + u32 pi_bytesout; /* DataOUT PI Number of bytes */ + u32 pi_bytesin; /* DataIN PI Number of bytes */ + u8 cdb[VIRTIO_SCSI_CDB_SIZE]; +} __packed; + /* Response, followed by sense data and data-in */ struct virtio_scsi_cmd_resp { u32 sense_len; /* Sense data length */ @@ -97,6 +109,7 @@ struct virtio_scsi_config { #define VIRTIO_SCSI_F_INOUT 0 #define VIRTIO_SCSI_F_HOTPLUG 1 #define VIRTIO_SCSI_F_CHANGE 2 +#define VIRTIO_SCSI_F_T10_PI 3 /* Response codes */ #define VIRTIO_SCSI_S_OK 0 -- cgit v1.2.3-71-gd317 From 40f2287bd583f4df4c602c1a29a48df2730fb6d4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sun, 11 May 2014 15:15:12 +0300 Subject: IB/mlx4: Implement IB_QP_CREATE_USE_GFP_NOIO Modify the various routines used to allocate memory resources which serve QPs in mlx4 to get an input GFP directive. Have the Ethernet driver use GFP_KERNEL in its QP allocations, as done prior to this commit, and the IB driver use GFP_NOIO when the IB verbs IB_QP_CREATE_USE_GFP_NOIO QP creation flag is provided. 
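[To make the GFP plumbing pattern concrete, here is a minimal editorial sketch, not taken from the patch; qp_buf, qp_buf_alloc and create_qp are hypothetical names. The allocator accepts the caller's gfp_t rather than hard-coding GFP_KERNEL, and the QP creation path picks GFP_NOIO when it may be reached from a block-I/O path:

	#include <linux/types.h>
	#include <linux/gfp.h>
	#include <linux/slab.h>

	struct qp_buf {
		void *data;
	};

	/* The allocator takes a gfp_t from its caller instead of assuming GFP_KERNEL. */
	static int qp_buf_alloc(struct qp_buf *buf, size_t size, gfp_t gfp)
	{
		buf->data = kmalloc(size, gfp);
		return buf->data ? 0 : -ENOMEM;
	}

	static int create_qp(struct qp_buf *buf, bool noio)
	{
		/* GFP_NOIO keeps the allocation from recursing into block I/O. */
		gfp_t gfp = noio ? GFP_NOIO : GFP_KERNEL;

		return qp_buf_alloc(buf, 4096, gfp);
	}

The commit applies the same idea throughout mlx4: each allocation helper grows a gfp argument, and only the IB_QP_CREATE_USE_GFP_NOIO path passes GFP_NOIO.]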
Signed-off-by: Mel Gorman Signed-off-by: Jiri Kosina Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 6 ++--- drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 + drivers/infiniband/hw/mlx4/qp.c | 30 ++++++++++++---------- drivers/infiniband/hw/mlx4/srq.c | 7 ++--- drivers/net/ethernet/mellanox/mlx4/alloc.c | 27 +++++++++---------- drivers/net/ethernet/mellanox/mlx4/cq.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 ++--- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +- drivers/net/ethernet/mellanox/mlx4/icm.c | 7 ++--- drivers/net/ethernet/mellanox/mlx4/icm.h | 3 ++- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 4 +-- drivers/net/ethernet/mellanox/mlx4/mr.c | 17 ++++++------ drivers/net/ethernet/mellanox/mlx4/qp.c | 20 +++++++-------- .../net/ethernet/mellanox/mlx4/resource_tracker.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/srq.c | 4 +-- include/linux/mlx4/device.h | 10 +++++--- 16 files changed, 82 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 5f640814cc81..1066eec854a9 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -102,7 +102,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * int err; err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, - PAGE_SIZE * 2, &buf->buf); + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); if (err) goto out; @@ -113,7 +113,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); if (err) goto err_mtt; @@ -209,7 +209,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector uar = &to_mucontext(context)->uar; } else { - err = mlx4_db_alloc(dev->dev, &cq->db, 1); + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); if (err) goto err_cq; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index f589522fddfd..bb8c9dd442ae 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -156,6 +156,7 @@ enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, }; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 41308af4163c..8710baf60bb9 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -610,7 +610,8 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, + gfp_t gfp) { int qpn; int err; @@ -748,14 +749,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; if (qp_has_rq(init_attr)) { - err = mlx4_db_alloc(dev->dev, &qp->db, 0); + err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); if (err) goto err; *qp->db.db = 0; } - if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { err = -ENOMEM; goto err_db; 
} @@ -765,13 +766,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); - + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -801,7 +801,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_proxy; } - err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); if (err) goto err_qpn; @@ -1040,7 +1040,10 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp = NULL; int err; u16 xrcdn = 0; + gfp_t gfp; + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. @@ -1049,7 +1052,8 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | - MLX4_IB_QP_NETIF)) + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { @@ -1059,7 +1063,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, if (init_attr->create_flags && (udata || - ((init_attr->create_flags & ~MLX4_IB_SRIOV_SQP) && + ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) && init_attr->qp_type != IB_QPT_UD) || ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && init_attr->qp_type > IB_QPT_GSI))) @@ -1079,7 +1083,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_RAW_PACKET: - qp = kzalloc(sizeof *qp, GFP_KERNEL); + qp = kzalloc(sizeof *qp, gfp); if (!qp) return ERR_PTR(-ENOMEM); qp->pri.vid = 0xFFFF; @@ -1088,7 +1092,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_UD: { err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp); + udata, 0, &qp, gfp); if (err) return ERR_PTR(err); @@ -1106,7 +1110,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, get_sqp_num(to_mdev(pd->device), init_attr), - &qp); + &qp, gfp); if (err) return ERR_PTR(err); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 60c5fb025fc7..62d9285300af 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -134,13 +134,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - err = mlx4_db_alloc(dev->dev, &srq->db, 0); + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); if (err) goto err_srq; *srq->db.db = 0; - if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { err = -ENOMEM; goto err_db; } @@ -165,7 +166,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index 
c3ad464d0627..b0297da50304 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -171,7 +171,7 @@ void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) */ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf) + struct mlx4_buf *buf, gfp_t gfp) { dma_addr_t t; @@ -180,7 +180,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->npages = 1; buf->page_shift = get_order(size) + PAGE_SHIFT; buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, - size, &t, GFP_KERNEL); + size, &t, gfp); if (!buf->direct.buf) return -ENOMEM; @@ -200,14 +200,14 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), - GFP_KERNEL); + gfp); if (!buf->page_list) return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) { buf->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, - &t, GFP_KERNEL); + &t, gfp); if (!buf->page_list[i].buf) goto err_free; @@ -218,7 +218,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, if (BITS_PER_LONG == 64) { struct page **pages; - pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL); + pages = kmalloc(sizeof *pages * buf->nbufs, gfp); if (!pages) goto err_free; for (i = 0; i < buf->nbufs; ++i) @@ -260,11 +260,12 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) } EXPORT_SYMBOL_GPL(mlx4_buf_free); -static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) +static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device, + gfp_t gfp) { struct mlx4_db_pgdir *pgdir; - pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL); + pgdir = kzalloc(sizeof *pgdir, gfp); if (!pgdir) return NULL; @@ -272,7 +273,7 @@ static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) pgdir->bits[0] = pgdir->order0; pgdir->bits[1] = pgdir->order1; pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, - &pgdir->db_dma, GFP_KERNEL); + &pgdir->db_dma, gfp); if (!pgdir->db_page) { kfree(pgdir); return NULL; @@ -312,7 +313,7 @@ found: return 0; } -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_db_pgdir *pgdir; @@ -324,7 +325,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) goto out; - pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev)); + pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev), gfp); if (!pgdir) { ret = -ENOMEM; goto out; @@ -376,13 +377,13 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, { int err; - err = mlx4_db_alloc(dev, &wqres->db, 1); + err = mlx4_db_alloc(dev, &wqres->db, 1, GFP_KERNEL); if (err) return err; *wqres->db.db = 0; - err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf); + err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf, GFP_KERNEL); if (err) goto err_db; @@ -391,7 +392,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf); + err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index 0487121e4a0f..c90cde5b4aee 100644 --- 
a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -173,11 +173,11 @@ int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) if (*cqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &cq_table->table, *cqn); + err = mlx4_table_get(dev, &cq_table->table, *cqn, GFP_KERNEL); if (err) goto err_out; - err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn); + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn, GFP_KERNEL); if (err) goto err_put; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index ba049ae88749..87857a6463eb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -972,7 +972,7 @@ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, if (!context) return -ENOMEM; - err = mlx4_qp_alloc(mdev->dev, qpn, qp); + err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate qp #%x\n", qpn); goto out; @@ -1012,7 +1012,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) en_err(priv, "Failed reserving drop qpn\n"); return err; } - err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp); + err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating drop qp\n"); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); @@ -1071,7 +1071,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) } /* Configure RSS indirection qp */ - err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp); + err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate RSS indirection QP\n"); goto rss_err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index dd1f6d346459..bc0cc1eb214d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -113,7 +113,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); ring->qpn = qpn; - err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_map; diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index 5fbf4924c272..eb1747e1937d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -245,7 +245,8 @@ int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + int gfp) { u32 i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); @@ -259,7 +260,7 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) } table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, - (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + (table->lowmem ? 
gfp : GFP_HIGHUSER) | __GFP_NOWARN, table->coherent); if (!table->icm[i]) { ret = -ENOMEM; @@ -356,7 +357,7 @@ int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 i; for (i = start; i <= end; i += inc) { - err = mlx4_table_get(dev, table, i); + err = mlx4_table_get(dev, table, i, GFP_KERNEL); if (err) goto fail; } diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h index dee67fa39107..067e6e0af36c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.h +++ b/drivers/net/ethernet/mellanox/mlx4/icm.h @@ -71,7 +71,8 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask, int coherent); void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + int gfp); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 start, u32 end); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index f9c465101963..627a54ef2955 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -888,7 +888,7 @@ void mlx4_cleanup_cq_table(struct mlx4_dev *dev); void mlx4_cleanup_qp_table(struct mlx4_dev *dev); void mlx4_cleanup_srq_table(struct mlx4_dev *dev); void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp); void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); @@ -896,7 +896,7 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); int __mlx4_mpt_reserve(struct mlx4_dev *dev); void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index); -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index); +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp); void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index); u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order); void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order); diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 24835853b753..4c71dafad217 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -364,14 +364,14 @@ static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index) __mlx4_mpt_release(dev, index); } -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; - return mlx4_table_get(dev, &mr_table->dmpt_table, index); + return mlx4_table_get(dev, &mr_table->dmpt_table, index, gfp); } -static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) { u64 param = 0; @@ -382,7 +382,7 @@ static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_mpt_alloc_icm(dev, index); + return __mlx4_mpt_alloc_icm(dev, index, gfp); } void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) @@ -469,7 
+469,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key)); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key), GFP_KERNEL); if (err) return err; @@ -627,13 +627,14 @@ int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, EXPORT_SYMBOL_GPL(mlx4_write_mtt); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf) + struct mlx4_buf *buf, gfp_t gfp) { u64 *page_list; int err; int i; - page_list = kmalloc(buf->npages * sizeof *page_list, GFP_KERNEL); + page_list = kmalloc(buf->npages * sizeof *page_list, + gfp); if (!page_list) return -ENOMEM; @@ -680,7 +681,7 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw) struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key)); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key), GFP_KERNEL); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 61d64ebffd56..917f0d0ba7c6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -272,29 +272,29 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) } EXPORT_SYMBOL_GPL(mlx4_qp_release_range); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; int err; - err = mlx4_table_get(dev, &qp_table->qp_table, qpn); + err = mlx4_table_get(dev, &qp_table->qp_table, qpn, gfp); if (err) goto err_out; - err = mlx4_table_get(dev, &qp_table->auxc_table, qpn); + err = mlx4_table_get(dev, &qp_table->auxc_table, qpn, gfp); if (err) goto err_put_qp; - err = mlx4_table_get(dev, &qp_table->altc_table, qpn); + err = mlx4_table_get(dev, &qp_table->altc_table, qpn, gfp); if (err) goto err_put_auxc; - err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn); + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn, gfp); if (err) goto err_put_altc; - err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn); + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn, gfp); if (err) goto err_put_rdmarc; @@ -316,7 +316,7 @@ err_out: return err; } -static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, int gfp) { u64 param = 0; @@ -326,7 +326,7 @@ static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_qp_alloc_icm(dev, qpn); + return __mlx4_qp_alloc_icm(dev, qpn, gfp); } void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) @@ -355,7 +355,7 @@ static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) __mlx4_qp_free_icm(dev, qpn); } -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; @@ -366,7 +366,7 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) qp->qpn = qpn; - err = mlx4_qp_alloc_icm(dev, qpn); + err = mlx4_qp_alloc_icm(dev, qpn, gfp); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 1c3fdd4a1f7d..45da913b5679 100644 --- 
a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -1532,7 +1532,7 @@ static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, return err; if (!fw_reserved(dev, qpn)) { - err = __mlx4_qp_alloc_icm(dev, qpn); + err = __mlx4_qp_alloc_icm(dev, qpn, GFP_KERNEL); if (err) { res_abort_move(dev, slave, RES_QP, qpn); return err; @@ -1619,7 +1619,7 @@ static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, if (err) return err; - err = __mlx4_mpt_alloc_icm(dev, mpt->key); + err = __mlx4_mpt_alloc_icm(dev, mpt->key, GFP_KERNEL); if (err) { res_abort_move(dev, slave, RES_MPT, id); return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c index 98faf870b0b0..67146624eb58 100644 --- a/drivers/net/ethernet/mellanox/mlx4/srq.c +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -103,11 +103,11 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) if (*srqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &srq_table->table, *srqn); + err = mlx4_table_get(dev, &srq_table->table, *srqn, GFP_KERNEL); if (err) goto err_out; - err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn); + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn, GFP_KERNEL); if (err) goto err_put; return 0; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ba87bd21295a..be60b002bb37 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -837,7 +837,7 @@ static inline int mlx4_is_slave(struct mlx4_dev *dev) } int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf); + struct mlx4_buf *buf, gfp_t gfp); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { @@ -874,9 +874,10 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw); int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf); + struct mlx4_buf *buf, gfp_t gfp); -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order); +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, + gfp_t gfp); void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, @@ -892,7 +893,8 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, + gfp_t gfp); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, -- cgit v1.2.3-71-gd317 From 688cea83f4396fa98b77a126ed278b89daccccdc Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Fri, 30 May 2014 16:00:56 +0800 Subject: macvlan: add netpoll support Add netpoll support to macvlan devices. Based on the netpoll support in the 802.1q vlan code. Tested; macvlan works well with netconsole. Signed-off-by: Ding Tianhong Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 66 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/if_macvlan.h | 3 +++ 2 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d2dbcfc68ee4..eee9106d1da1 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -33,6 +33,7 @@ #include #include #include +#include #define MACVLAN_HASH_SIZE (1 << BITS_PER_BYTE) @@ -357,12 +358,26 @@ xmit_world: return dev_queue_xmit(skb); } +static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) +{ +#ifdef CONFIG_NET_POLL_CONTROLLER + if (vlan->netpoll) + netpoll_send_skb(vlan->netpoll, skb); +#else + BUG(); +#endif + return NETDEV_TX_OK; +} + static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int len = skb->len; int ret; - const struct macvlan_dev *vlan = netdev_priv(dev); + struct macvlan_dev *vlan = netdev_priv(dev); + + if (unlikely(netpoll_tx_running(dev))) + return macvlan_netpoll_send_skb(vlan, skb); if (vlan->fwd_priv) { skb->dev = vlan->lowerdev; @@ -788,6 +803,50 @@ static netdev_features_t macvlan_fix_features(struct net_device *dev, return features; } +#ifdef CONFIG_NET_POLL_CONTROLLER +static void macvlan_dev_poll_controller(struct net_device *dev) +{ + return; +} + +static int macvlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + struct net_device *real_dev = vlan->lowerdev; + struct netpoll *netpoll; + int err = 0; + + netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + err = -ENOMEM; + if (!netpoll) + goto out; + + err = __netpoll_setup(netpoll, real_dev); + if (err) { + kfree(netpoll); + goto out; + } + + vlan->netpoll = netpoll; + +out: + return err; +} + +static void macvlan_dev_netpoll_cleanup(struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + struct netpoll *netpoll = vlan->netpoll; + + if (!netpoll) + return; + + vlan->netpoll = NULL; + + __netpoll_free_async(netpoll); +} +#endif /* CONFIG_NET_POLL_CONTROLLER */ + static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_settings = macvlan_ethtool_get_settings, @@ -813,6 +872,11 @@ static const struct net_device_ops macvlan_netdev_ops = { .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, .ndo_get_lock_subclass = macvlan_get_nest_level, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = macvlan_dev_poll_controller, + .ndo_netpoll_setup = macvlan_dev_netpoll_setup, + .ndo_netpoll_cleanup = macvlan_dev_netpoll_cleanup, +#endif }; void macvlan_common_setup(struct net_device *dev) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index a9a53b12397b..6b2c7cf352a5 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -57,6 +57,9 @@ struct macvlan_dev { netdev_features_t tap_features; int minor; int nest_level; +#ifdef CONFIG_NET_POLL_CONTROLLER + struct netpoll *netpoll; +#endif }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, -- cgit v1.2.3-71-gd317 From fe62d001372388abb15a324148c913f9b43722a8 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 15 May 2014 01:25:27 +0100 Subject: ethtool: Replace ethtool_ops::{get,set}_rxfh_indir() with {get,set}_rxfh() ETHTOOL_{G,S}RXFHINDIR and ETHTOOL_{G,S}RSSH should work for drivers regardless of whether they expose the hash key, unless you try to set a hash key for a driver that doesn't expose it. 
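[To make the new hook signatures concrete, a minimal editorial sketch of a driver wiring up the combined {get,set}_rxfh operations follows; the "foo" driver and FOO_RETA_SIZE are hypothetical and not part of this patch. Because it provides no get_rxfh_key_size, the core passes key as NULL and the driver ignores it:

	#include <linux/ethtool.h>
	#include <linux/netdevice.h>
	#include <linux/string.h>

	#define FOO_RETA_SIZE 128	/* hypothetical indirection table size */

	struct foo_priv {
		u32 reta[FOO_RETA_SIZE];
	};

	static u32 foo_get_rxfh_indir_size(struct net_device *dev)
	{
		return FOO_RETA_SIZE;
	}

	/* @key stays NULL here because no get_rxfh_key_size hook is provided. */
	static int foo_get_rxfh(struct net_device *dev, u32 *indir, u8 *key)
	{
		struct foo_priv *priv = netdev_priv(dev);

		memcpy(indir, priv->reta, sizeof(priv->reta));
		return 0;
	}

	static int foo_set_rxfh(struct net_device *dev, const u32 *indir,
				const u8 *key)
	{
		struct foo_priv *priv = netdev_priv(dev);

		memcpy(priv->reta, indir, sizeof(priv->reta));
		/* a real driver would program the hardware RETA here */
		return 0;
	}

	static const struct ethtool_ops foo_ethtool_ops = {
		.get_rxfh_indir_size	= foo_get_rxfh_indir_size,
		.get_rxfh		= foo_get_rxfh,
		.set_rxfh		= foo_set_rxfh,
	};

Drivers that do expose a hash key would additionally report its size and read or program the key whenever the pointer is non-NULL.]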
Signed-off-by: Ben Hutchings Acked-by: Jeff Kirsher --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 15 ++++++++------- drivers/net/ethernet/broadcom/tg3.c | 8 ++++---- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 8 ++++---- drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 15 +++++++++------ drivers/net/ethernet/intel/igb/igb_ethtool.c | 9 +++++---- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 10 +++++----- drivers/net/ethernet/sfc/ethtool.c | 10 +++++----- drivers/net/vmxnet3/vmxnet3_ethtool.c | 8 ++++---- include/linux/ethtool.h | 6 ------ net/core/ethtool.c | 8 ++++---- 10 files changed, 48 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index 03224090ecf9..af138f8aa361 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -3316,7 +3316,7 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device *dev) return T_ETH_INDIRECTION_TABLE_SIZE; } -static int bnx2x_get_rxfh_indir(struct net_device *dev, u32 *indir) +static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key) { struct bnx2x *bp = netdev_priv(dev); u8 ind_table[T_ETH_INDIRECTION_TABLE_SIZE] = {0}; @@ -3340,14 +3340,15 @@ static int bnx2x_get_rxfh_indir(struct net_device *dev, u32 *indir) return 0; } -static int bnx2x_set_rxfh_indir(struct net_device *dev, const u32 *indir) +static int bnx2x_set_rxfh(struct net_device *dev, const u32 *indir, + const u8 *key) { struct bnx2x *bp = netdev_priv(dev); size_t i; for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++) { /* - * The same as in bnx2x_get_rxfh_indir: we can't use a memcpy() + * The same as in bnx2x_get_rxfh: we can't use a memcpy() * as an internal storage of an indirection table is a u8 array * while indir->ring_index points to an array of u32. 
* @@ -3471,8 +3472,8 @@ static const struct ethtool_ops bnx2x_ethtool_ops = { .get_rxnfc = bnx2x_get_rxnfc, .set_rxnfc = bnx2x_set_rxnfc, .get_rxfh_indir_size = bnx2x_get_rxfh_indir_size, - .get_rxfh_indir = bnx2x_get_rxfh_indir, - .set_rxfh_indir = bnx2x_set_rxfh_indir, + .get_rxfh = bnx2x_get_rxfh, + .set_rxfh = bnx2x_set_rxfh, .get_channels = bnx2x_get_channels, .set_channels = bnx2x_set_channels, .get_module_info = bnx2x_get_module_info, @@ -3498,8 +3499,8 @@ static const struct ethtool_ops bnx2x_vf_ethtool_ops = { .get_rxnfc = bnx2x_get_rxnfc, .set_rxnfc = bnx2x_set_rxnfc, .get_rxfh_indir_size = bnx2x_get_rxfh_indir_size, - .get_rxfh_indir = bnx2x_get_rxfh_indir, - .set_rxfh_indir = bnx2x_set_rxfh_indir, + .get_rxfh = bnx2x_get_rxfh, + .set_rxfh = bnx2x_set_rxfh, .get_channels = bnx2x_get_channels, .set_channels = bnx2x_set_channels, }; diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index ccd90156aebc..8c2314ed260c 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -12501,7 +12501,7 @@ static u32 tg3_get_rxfh_indir_size(struct net_device *dev) return size; } -static int tg3_get_rxfh_indir(struct net_device *dev, u32 *indir) +static int tg3_get_rxfh(struct net_device *dev, u32 *indir, u8 *key) { struct tg3 *tp = netdev_priv(dev); int i; @@ -12512,7 +12512,7 @@ static int tg3_get_rxfh_indir(struct net_device *dev, u32 *indir) return 0; } -static int tg3_set_rxfh_indir(struct net_device *dev, const u32 *indir) +static int tg3_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key) { struct tg3 *tp = netdev_priv(dev); size_t i; @@ -14044,8 +14044,8 @@ static const struct ethtool_ops tg3_ethtool_ops = { .get_sset_count = tg3_get_sset_count, .get_rxnfc = tg3_get_rxnfc, .get_rxfh_indir_size = tg3_get_rxfh_indir_size, - .get_rxfh_indir = tg3_get_rxfh_indir, - .set_rxfh_indir = tg3_set_rxfh_indir, + .get_rxfh = tg3_get_rxfh, + .set_rxfh = tg3_set_rxfh, .get_channels = tg3_get_channels, .set_channels = tg3_set_channels, .get_ts_info = tg3_get_ts_info, diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 266a5bc6aedf..8cf6be93f491 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2739,7 +2739,7 @@ static u32 get_rss_table_size(struct net_device *dev) return pi->rss_size; } -static int get_rss_table(struct net_device *dev, u32 *p) +static int get_rss_table(struct net_device *dev, u32 *p, u8 *key) { const struct port_info *pi = netdev_priv(dev); unsigned int n = pi->rss_size; @@ -2749,7 +2749,7 @@ static int get_rss_table(struct net_device *dev, u32 *p) return 0; } -static int set_rss_table(struct net_device *dev, const u32 *p) +static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key) { unsigned int i; struct port_info *pi = netdev_priv(dev); @@ -2851,8 +2851,8 @@ static const struct ethtool_ops cxgb_ethtool_ops = { .set_wol = set_wol, .get_rxnfc = get_rxnfc, .get_rxfh_indir_size = get_rss_table_size, - .get_rxfh_indir = get_rss_table, - .set_rxfh_indir = set_rss_table, + .get_rxfh = get_rss_table, + .set_rxfh = set_rss_table, .flash_device = set_flash, }; diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c index 77e786d2d0e0..dbc8986c2dae 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c @@ -626,13 +626,14 @@ static u32 
i40evf_get_rxfh_indir_size(struct net_device *netdev) } /** - * i40evf_get_rxfh_indir - get the rx flow hash indirection table + * i40evf_get_rxfh - get the rx flow hash indirection table * @netdev: network interface device structure * @indir: indirection table + * @key: hash key (will be %NULL until get_rxfh_key_size is implemented) * * Reads the indirection table directly from the hardware. Always returns 0. **/ -static int i40evf_get_rxfh_indir(struct net_device *netdev, u32 *indir) +static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) { struct i40evf_adapter *adapter = netdev_priv(netdev); struct i40e_hw *hw = &adapter->hw; @@ -650,14 +651,16 @@ static int i40evf_get_rxfh_indir(struct net_device *netdev, u32 *indir) } /** - * i40evf_set_rxfh_indir - set the rx flow hash indirection table + * i40evf_set_rxfh - set the rx flow hash indirection table * @netdev: network interface device structure * @indir: indirection table + * @key: hash key (will be %NULL until get_rxfh_key_size is implemented) * * Returns -EINVAL if the table specifies an inavlid queue id, otherwise * returns 0 after programming the table. **/ -static int i40evf_set_rxfh_indir(struct net_device *netdev, const u32 *indir) +static int i40evf_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key) { struct i40evf_adapter *adapter = netdev_priv(netdev); struct i40e_hw *hw = &adapter->hw; @@ -691,8 +694,8 @@ static struct ethtool_ops i40evf_ethtool_ops = { .get_rxnfc = i40evf_get_rxnfc, .set_rxnfc = i40evf_set_rxnfc, .get_rxfh_indir_size = i40evf_get_rxfh_indir_size, - .get_rxfh_indir = i40evf_get_rxfh_indir, - .set_rxfh_indir = i40evf_set_rxfh_indir, + .get_rxfh = i40evf_get_rxfh, + .set_rxfh = i40evf_set_rxfh, .get_channels = i40evf_get_channels, }; diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index a84297c85fb1..d8bbcf1873ca 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -2830,7 +2830,7 @@ static u32 igb_get_rxfh_indir_size(struct net_device *netdev) return IGB_RETA_SIZE; } -static int igb_get_rxfh_indir(struct net_device *netdev, u32 *indir) +static int igb_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) { struct igb_adapter *adapter = netdev_priv(netdev); int i; @@ -2876,7 +2876,8 @@ void igb_write_rss_indir_tbl(struct igb_adapter *adapter) } } -static int igb_set_rxfh_indir(struct net_device *netdev, const u32 *indir) +static int igb_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key) { struct igb_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; @@ -3025,8 +3026,8 @@ static const struct ethtool_ops igb_ethtool_ops = { .get_module_info = igb_get_module_info, .get_module_eeprom = igb_get_module_eeprom, .get_rxfh_indir_size = igb_get_rxfh_indir_size, - .get_rxfh_indir = igb_get_rxfh_indir, - .set_rxfh_indir = igb_set_rxfh_indir, + .get_rxfh = igb_get_rxfh, + .set_rxfh = igb_set_rxfh, .get_channels = igb_get_channels, .set_channels = igb_set_channels, .begin = igb_ethtool_begin, diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index a72d99fd7a2d..263a1c7a3370 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -564,7 +564,7 @@ static u32 mlx4_en_get_rxfh_indir_size(struct net_device *dev) return priv->rx_ring_num; } -static int mlx4_en_get_rxfh_indir(struct net_device *dev, u32 
*ring_index) +static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_rss_map *rss_map = &priv->rss_map; @@ -582,8 +582,8 @@ static int mlx4_en_get_rxfh_indir(struct net_device *dev, u32 *ring_index) return err; } -static int mlx4_en_set_rxfh_indir(struct net_device *dev, - const u32 *ring_index) +static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index, + const u8 *key) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; @@ -1224,8 +1224,8 @@ const struct ethtool_ops mlx4_en_ethtool_ops = { .get_rxnfc = mlx4_en_get_rxnfc, .set_rxnfc = mlx4_en_set_rxnfc, .get_rxfh_indir_size = mlx4_en_get_rxfh_indir_size, - .get_rxfh_indir = mlx4_en_get_rxfh_indir, - .set_rxfh_indir = mlx4_en_set_rxfh_indir, + .get_rxfh = mlx4_en_get_rxfh, + .set_rxfh = mlx4_en_set_rxfh, .get_channels = mlx4_en_get_channels, .set_channels = mlx4_en_set_channels, .get_ts_info = mlx4_en_get_ts_info, diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 0de8b07c24c2..74739c4b9997 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -1033,7 +1033,7 @@ static u32 efx_ethtool_get_rxfh_indir_size(struct net_device *net_dev) 0 : ARRAY_SIZE(efx->rx_indir_table)); } -static int efx_ethtool_get_rxfh_indir(struct net_device *net_dev, u32 *indir) +static int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key) { struct efx_nic *efx = netdev_priv(net_dev); @@ -1041,8 +1041,8 @@ static int efx_ethtool_get_rxfh_indir(struct net_device *net_dev, u32 *indir) return 0; } -static int efx_ethtool_set_rxfh_indir(struct net_device *net_dev, - const u32 *indir) +static int efx_ethtool_set_rxfh(struct net_device *net_dev, + const u32 *indir, const u8 *key) { struct efx_nic *efx = netdev_priv(net_dev); @@ -1125,8 +1125,8 @@ const struct ethtool_ops efx_ethtool_ops = { .get_rxnfc = efx_ethtool_get_rxnfc, .set_rxnfc = efx_ethtool_set_rxnfc, .get_rxfh_indir_size = efx_ethtool_get_rxfh_indir_size, - .get_rxfh_indir = efx_ethtool_get_rxfh_indir, - .set_rxfh_indir = efx_ethtool_set_rxfh_indir, + .get_rxfh = efx_ethtool_get_rxfh, + .set_rxfh = efx_ethtool_set_rxfh, .get_ts_info = efx_ethtool_get_ts_info, .get_module_info = efx_ethtool_get_module_info, .get_module_eeprom = efx_ethtool_get_module_eeprom, diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 00e120296e92..9396cca93b09 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -579,7 +579,7 @@ vmxnet3_get_rss_indir_size(struct net_device *netdev) } static int -vmxnet3_get_rss_indir(struct net_device *netdev, u32 *p) +vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key) { struct vmxnet3_adapter *adapter = netdev_priv(netdev); struct UPT1_RSSConf *rssConf = adapter->rss_conf; @@ -592,7 +592,7 @@ vmxnet3_get_rss_indir(struct net_device *netdev, u32 *p) } static int -vmxnet3_set_rss_indir(struct net_device *netdev, const u32 *p) +vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key) { unsigned int i; unsigned long flags; @@ -628,8 +628,8 @@ static const struct ethtool_ops vmxnet3_ethtool_ops = { .get_rxnfc = vmxnet3_get_rxnfc, #ifdef VMXNET3_RSS .get_rxfh_indir_size = vmxnet3_get_rss_indir_size, - .get_rxfh_indir = vmxnet3_get_rss_indir, - .set_rxfh_indir = vmxnet3_set_rss_indir, + .get_rxfh = vmxnet3_get_rss, + .set_rxfh = vmxnet3_set_rss, #endif }; diff --git 
a/include/linux/ethtool.h b/include/linux/ethtool.h index 874fde01d398..e658229fee39 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -158,15 +158,11 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table. * Returns zero if not supported for this specific device. - * @get_rxfh_indir: Get the contents of the RX flow hash indirection table. - * Will not be called if @get_rxfh_indir_size returns zero. * @get_rxfh: Get the contents of the RX flow hash indirection table and hash * key. * Will only be called if one or both of @get_rxfh_indir_size and * @get_rxfh_key_size are implemented and return non-zero. * Returns a negative error code or zero. - * @set_rxfh_indir: Set the contents of the RX flow hash indirection table. - * Will not be called if @get_rxfh_indir_size returns zero. * @set_rxfh: Set the contents of the RX flow hash indirection table and/or * hash key. In case only the indirection table or hash key is to be * changed, the other argument will be %NULL. @@ -248,8 +244,6 @@ struct ethtool_ops { int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key); int (*set_rxfh)(struct net_device *, const u32 *indir, const u8 *key); - int (*get_rxfh_indir)(struct net_device *, u32 *); - int (*set_rxfh_indir)(struct net_device *, const u32 *); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b8857348bdf3..8ae452afb545 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -582,7 +582,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->get_rxfh_indir) + !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (dev_size == 0) @@ -608,7 +608,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); if (ret) goto out; @@ -632,7 +632,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); - if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir || + if (!ops->get_rxfh_indir_size || !ops->set_rxfh || !ops->get_rxnfc) return -EOPNOTSUPP; @@ -669,7 +669,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, goto out; } - ret = ops->set_rxfh_indir(dev, indir); + ret = ops->set_rxfh(dev, indir, NULL); out: kfree(indir); -- cgit v1.2.3-71-gd317 From 1dd40c3ecd9b8a4ab91dbf2e6ce10b82a3b5ae63 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Mar 2014 18:41:24 -0400 Subject: dm: introduce dm_accept_partial_bio The function dm_accept_partial_bio allows the target to specify how many sectors of the current bio it will process. If the target only wants to accept part of the bio, it calls dm_accept_partial_bio and the DM core sends the rest of the data in next bio. 
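For illustration, a target's map function might use the new hook roughly as follows. This is a hypothetical sketch, not code from the patch: the my_target names and the 1 MiB boundary are invented, and a real target must also respect the REQ_FLUSH restriction noted in the comment below.

	#include <linux/device-mapper.h>

	struct my_target {
		struct dm_dev *dev;	/* hypothetical per-target context */
	};

	static int my_target_map(struct dm_target *ti, struct bio *bio)
	{
		struct my_target *mt = ti->private;
		sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
		sector_t boundary = 2048;	/* 1 MiB in 512-byte sectors */
		sector_t left = boundary - (offset & (boundary - 1));
		unsigned n_sectors = bio->bi_iter.bi_size >> 9;

		/* Accept only the sectors up to the boundary; the DM core
		 * resubmits the remainder as a new bio. */
		if (n_sectors && n_sectors > left)
			dm_accept_partial_bio(bio, (unsigned)left);

		bio->bi_bdev = mt->dev->bdev;
		return DM_MAPIO_REMAPPED;
	}
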
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 59 +++++++++++++++++++++++++++++++++++++------ include/linux/device-mapper.h | 2 ++ 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 368a20dd85c2..97940fc8c302 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1110,6 +1110,46 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); +/* + * A target may call dm_accept_partial_bio only from the map routine. It is + * allowed for all bio types except REQ_FLUSH. + * + * dm_accept_partial_bio informs the dm that the target only wants to process + * additional n_sectors sectors of the bio and the rest of the data should be + * sent in a next bio. + * + * A diagram that explains the arithmetics: + * +--------------------+---------------+-------+ + * | 1 | 2 | 3 | + * +--------------------+---------------+-------+ + * + * <-------------- *tio->len_ptr ---------------> + * <------- bi_size -------> + * <-- n_sectors --> + * + * Region 1 was already iterated over with bio_advance or similar function. + * (it may be empty if the target doesn't use bio_advance) + * Region 2 is the remaining bio size that the target wants to process. + * (it may be empty if region 1 is non-empty, although there is no reason + * to make it empty) + * The target requires that region 3 is to be sent in the next bio. + * + * If the target wants to receive multiple copies of the bio (via num_*bios, etc), + * the partially processed part (the sum of regions 1+2) must be the same for all + * copies of the bio. + */ +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + BUG_ON(bio->bi_rw & REQ_FLUSH); + BUG_ON(bi_size > *tio->len_ptr); + BUG_ON(n_sectors > bi_size); + *tio->len_ptr -= bi_size - n_sectors; + bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; +} +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); + static void __map_bio(struct dm_target_io *tio) { int r; @@ -1200,11 +1240,13 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, static void __clone_and_map_simple_bio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, unsigned len) + unsigned target_bio_nr, unsigned *len) { struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); struct bio *clone = &tio->clone; + tio->len_ptr = len; + /* * Discard requests require the bio's inline iovecs be initialized. 
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush @@ -1212,13 +1254,13 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, */ __bio_clone_fast(clone, ci->bio); if (len) - bio_setup_sector(clone, ci->sector, len); + bio_setup_sector(clone, ci->sector, *len); __map_bio(tio); } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, unsigned len) + unsigned num_bios, unsigned *len) { unsigned target_bio_nr; @@ -1233,13 +1275,13 @@ static int __send_empty_flush(struct clone_info *ci) BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0); + __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); return 0; } static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, unsigned len) + sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; @@ -1254,7 +1296,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { tio = alloc_tio(ci, ti, 0, target_bio_nr); - clone_bio(tio, bio, sector, len); + tio->len_ptr = len; + clone_bio(tio, bio, sector, *len); __map_bio(tio); } } @@ -1306,7 +1349,7 @@ static int __send_changing_extent_only(struct clone_info *ci, else len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, len); + __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; } while (ci->sector_count -= len); @@ -1345,7 +1388,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - __clone_and_map_data_bio(ci, ti, ci->sector, len); + __clone_and_map_data_bio(ci, ti, ci->sector, &len); ci->sector += len; ci->sector_count -= len; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 63da56ed9796..0adca299f238 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -291,6 +291,7 @@ struct dm_target_io { struct dm_io *io; struct dm_target *ti; unsigned target_bio_nr; + unsigned *len_ptr; struct bio clone; }; @@ -401,6 +402,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); struct gendisk *dm_disk(struct mapped_device *md); int dm_suspended(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); union map_info *dm_get_rq_mapinfo(struct request *rq); struct queue_limits *dm_get_queue_limits(struct mapped_device *md); -- cgit v1.2.3-71-gd317 From bd67608a6127c994e897c49cc4f72d9095925301 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 3 Jun 2014 23:04:30 -0400 Subject: NVMe: Rename io_timeout to nvme_io_timeout It's positively immoral to have a global variable called 'io_timeout'. Keep the module parameter called io_timeout, though. 
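The mechanics are the module_param_named() pattern, which decouples the user-visible parameter name from the C symbol. A minimal generic sketch (the mymod/frob names are made up, not from this patch):

	#include <linux/module.h>

	/* The global gets an unambiguous, prefixed identifier... */
	static unsigned char mymod_frob_timeout = 30;
	/* ...while userspace keeps seeing plain "frob_timeout" under
	 * /sys/module/mymod/parameters/ and on the modprobe command line. */
	module_param_named(frob_timeout, mymod_frob_timeout, byte, 0644);
	MODULE_PARM_DESC(frob_timeout, "timeout in seconds for frob operations");
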
Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 4 ++-- include/linux/nvme.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bb6ce311ad44..2af079e571fc 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -54,8 +54,8 @@ static unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); -unsigned char io_timeout = 30; -module_param(io_timeout, byte, 0644); +unsigned char nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); static unsigned char retry_time = 30; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1813cfdb7e80..8541dd920bb7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -62,8 +62,8 @@ enum { #define NVME_VS(major, minor) (major << 16 | minor) -extern unsigned char io_timeout; -#define NVME_IO_TIMEOUT (io_timeout * HZ) +extern unsigned char nvme_io_timeout; +#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) /* * Represents an NVM Express device. Each nvme_dev is a PCI function. -- cgit v1.2.3-71-gd317 From bfec07d0f8ed78b10df3ca3bc23e27de1166ea45 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 May 2014 08:56:09 +0800 Subject: f2fs: avoid overflow when large directory feature is enabled When the large directory feature is enabled, we have one case which could cause an overflow in dir_buckets(), as follows: special case: level + dir_level >= 32 and level < MAX_DIR_HASH_DEPTH / 2. Here we define MAX_DIR_BUCKETS to limit the return value when the condition could trigger a potential overflow. Changes from V1 o modify description of calculation in f2fs.txt suggested by Changman Lee. Suggested-by: Changman Lee Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 8 ++++---- fs/f2fs/dir.c | 4 ++-- include/linux/f2fs_fs.h | 3 +++ 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 25311e113e75..51afba17bbae 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -461,11 +461,11 @@ The number of blocks and buckets are determined by, # of blocks in level #n = | `- 4, Otherwise - ,- 2^ (n + dir_level), - | if n < MAX_DIR_HASH_DEPTH / 2, + ,- 2^(n + dir_level), + | if n + dir_level < MAX_DIR_HASH_DEPTH / 2, # of buckets in level #n = | - `- 2^((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1), - Otherwise + `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), + Otherwise When F2FS finds a file name in a directory, at first a hash value of the file name is calculated.
Then, F2FS scans the hash table in level #0 to find the diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c3f148555c37..966acb039e3b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -23,10 +23,10 @@ static unsigned long dir_blocks(struct inode *inode) static unsigned int dir_buckets(unsigned int level, int dir_level) { - if (level < MAX_DIR_HASH_DEPTH / 2) + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1); + return MAX_DIR_BUCKETS; } static unsigned int bucket_blocks(unsigned int level) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 8c03f71307c6..ba6f3127738f 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -394,6 +394,9 @@ typedef __le32 f2fs_hash_t; /* MAX level for dir lookup */ #define MAX_DIR_HASH_DEPTH 63 +/* MAX buckets in one level of dir */ +#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) + #define SIZE_OF_DIR_ENTRY 11 /* by byte */ #define SIZE_OF_DENTRY_BITMAP ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \ BITS_PER_BYTE) -- cgit v1.2.3-71-gd317 From 1dbe4152168d44fa164edbdc9f1243de70b98f7a Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Mon, 12 May 2014 12:27:43 +0900 Subject: f2fs: large volume support f2fs's cp has one page which consists of struct f2fs_checkpoint and version bitmap of sit and nat. To support lots of segments, we need more blocks for sit bitmap. So let's arrange sit bitmap as following: +-----------------+------------+ | f2fs_checkpoint | sit bitmap | | + nat bitmap | | +-----------------+------------+ 0 4k N blocks Signed-off-by: Changman Lee [Jaegeuk Kim: simple code change for readability] Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 45 ++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 13 +++++++++++-- include/linux/f2fs_fs.h | 2 ++ 3 files changed, 53 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fe968c7bfc90..ecba8da3308b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -371,7 +371,9 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) return; sbi->por_doing = true; - start_blk = __start_cp_addr(sbi) + 1; + + start_blk = __start_cp_addr(sbi) + 1 + + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); orphan_blkaddr = __start_sum_addr(sbi) - 1; ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); @@ -512,8 +514,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + block_t cp_blk_no; + int i; - sbi->ckpt = kzalloc(blk_size, GFP_KERNEL); + sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* @@ -544,6 +549,23 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) + cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = (unsigned char *)sbi->ckpt; + + cur_page = get_meta_page(sbi, cp_blk_no + i); + sit_bitmap_ptr = page_address(cur_page); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_put_page(cur_page, 1); + } +done: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 
1); return 0; @@ -736,6 +758,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) __u32 crc32 = 0; void *kaddr; int i; + int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); /* * This avoids to conduct wrong roll-forward operations and uses @@ -786,16 +809,19 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) / F2FS_ORPHANS_PER_BLOCK; - ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); if (is_umount) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); } else { clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks); + cp_payload_blks + data_sum_blocks + + orphan_blocks); } if (sbi->n_orphans) @@ -821,6 +847,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); + for (i = 1; i < 1 + cp_payload_blks; i++) { + cp_page = grab_meta_page(sbi, start_blk++); + kaddr = page_address(cp_page); + memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, + (1 << sbi->log_blocksize)); + set_page_dirty(cp_page); + f2fs_put_page(cp_page, 1); + } + if (sbi->n_orphans) { write_orphan_inodes(sbi, start_blk); start_blk += orphan_blocks; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 676a2c6ccec7..9684b1f77a7d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -764,9 +764,18 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - int offset = (flag == NAT_BITMAP) ? + int offset; + + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) { + if (flag == NAT_BITMAP) + return &ckpt->sit_nat_version_bitmap; + else + return ((unsigned char *)ckpt + F2FS_BLKSIZE); + } else { + offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return &ckpt->sit_nat_version_bitmap + offset; + } } static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index ba6f3127738f..6ff0b0b42d47 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -19,6 +19,7 @@ #define F2FS_LOG_SECTORS_PER_BLOCK 3 /* 4KB: F2FS_BLKSIZE */ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ +#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ @@ -75,6 +76,7 @@ struct f2fs_super_block { __le16 volume_name[512]; /* volume name */ __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ + __le32 cp_payload; } __packed; /* -- cgit v1.2.3-71-gd317 From 4f4cf5ad6fc1b16dc8dc9d750bb80b35eba5e98d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:34:49 -0400 Subject: SUNRPC: Move congestion window constants to header file I would like to use one of the RPC client's congestion algorithm constants in transport-specific code. 
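For context, these constants keep the congestion window in fixed point: RPC_CWNDSCALE (1 << RPC_CWNDSHIFT == 256) stands for one outstanding request, and RPCXPRT_CONGESTED() tests whether a full window is in flight. A simplified, illustrative sketch of the AIMD update they support (the real logic lives in xprt_adjust_cwnd(); demo_adjust_cwnd is not a kernel function):

	/* xprt->cwnd and xprt->cong are scaled by RPC_CWNDSCALE. */
	static void demo_adjust_cwnd(struct rpc_xprt *xprt, int result)
	{
		unsigned long cwnd = xprt->cwnd;

		if (result == -ETIMEDOUT)
			/* retransmit: halve, but never below one request */
			cwnd = max_t(unsigned long, cwnd >> 1, RPC_CWNDSCALE);
		else if (result == 0 && RPCXPRT_CONGESTED(xprt))
			/* reply while the window is full: grow by ~1/cwnd */
			cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE) / cwnd;

		xprt->cwnd = min_t(unsigned long, cwnd, RPC_MAXCWND(xprt));
	}
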
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 6 ++++++ net/sunrpc/xprt.c | 28 +++++++++------------------- 2 files changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3e5efb2b236e..5903d2c0ab4d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -24,6 +24,12 @@ #define RPC_MAX_SLOT_TABLE_LIMIT (65536U) #define RPC_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE_LIMIT +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) + /* * This describes a timeout strategy */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d173f79947c6..2d1d5a643b95 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); -/* - * The transport code maintains an estimate on the maximum number of out- - * standing RPC requests, using a smoothed version of the congestion - * avoidance implemented in 44BSD. This is basically the Van Jacobson - * congestion algorithm: If a retransmit occurs, the congestion window is - * halved; otherwise, it is incremented by 1/cwnd when - * - * - a reply is received and - * - a full number of requests are outstanding and - * - the congestion window hasn't been updated recently. - */ -#define RPC_CWNDSHIFT (8U) -#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) -#define RPC_INITCWND RPC_CWNDSCALE -#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) - -#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) - /** * xprt_register_transport - register a transport implementation * @transport: transport to register @@ -446,7 +428,15 @@ EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); * @task: recently completed RPC request used to adjust window * @result: result code of completed RPC request * - * We use a time-smoothed congestion estimator to avoid heavy oscillation. + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. */ void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result) { -- cgit v1.2.3-71-gd317 From 11f0431be2f99c574a65c6dfc0ca205511500f29 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Jun 2014 10:30:28 -0400 Subject: dm: remove symbol export for dm_set_device_limits There is no need for code other than DM core to use dm_set_device_limits so remove its EXPORT_SYMBOL_GPL. Also, clean up a couple of whitespace nits.
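Targets reach their underlying devices through the iterate_devices hook instead, whose callback has exactly the dm_set_device_limits() shape, which is why nothing outside DM core needs the export. A hedged sketch with hypothetical my_target names:

	/* A target-supplied callback; iterate_devices_callout_fn is the
	 * same signature dm_set_device_limits() implements inside DM core.
	 * Returns nonzero if the visited device is rotational. */
	static int my_dev_rotational(struct dm_target *ti, struct dm_dev *dev,
				     sector_t start, sector_t len, void *data)
	{
		return !blk_queue_nonrot(bdev_get_queue(dev->bdev));
	}

	static int my_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
	{
		struct my_target *mt = ti->private;	/* hypothetical */

		return fn(ti, mt->dev, 0, ti->len, data);
	}
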
Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 5 ++--- drivers/md/dm.c | 1 - include/linux/device-mapper.h | 8 +------- 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 50601ec7017a..5f59f1e3e5b1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -465,8 +465,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, } EXPORT_SYMBOL(dm_get_device); -int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct queue_limits *limits = data; struct block_device *bdev = dev->bdev; @@ -499,7 +499,6 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, (unsigned int) (PAGE_SIZE >> 9)); return 0; } -EXPORT_SYMBOL_GPL(dm_set_device_limits); /* * Decrement a device's use count and remove it if necessary. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3234a753a80d..bf1a1eaad9a9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1498,7 +1498,6 @@ static int dm_merge_bvec(struct request_queue *q, * just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) - max_size = 0; out: diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 0adca299f238..e1707de043ae 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -115,12 +115,6 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); void dm_error(const char *message); -/* - * Combine device limits. - */ -int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data); - struct dm_dev { struct block_device *bdev; fmode_t mode; @@ -132,7 +126,7 @@ struct dm_dev { * are opened/closed correctly. */ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result); + struct dm_dev **result); void dm_put_device(struct dm_target *ti, struct dm_dev *d); /* -- cgit v1.2.3-71-gd317 From b14903e10a06347234b387f7364f86aa07252d9f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Jun 2014 16:46:00 +0200 Subject: regulator: add regulator_can_change_voltage stub When CONFIG_REGULATOR is not set, we cannot call regulator_can_change_voltage() from a device driver, which results in a build error like video/fbdev/omap2/dss/hdmi5.c: In function 'hdmi_init_regulator': video/fbdev/omap2/dss/hdmi5.c:149:2: error: implicit declaration of function 'regulator_can_change_voltage' [-Werror=implicit-function-declaration] even for drivers that don't require the regulator API normally. Such a use was recently added in the omap2+ hdmi driver. This avoids the problem by adding a static inline function stub in the API header, as we have for most of the other regulator functions as well. 
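With the stub in place a consumer compiles unconditionally; when CONFIG_REGULATOR is off, the inline stubs collapse to cheap no-ops. A sketch of the kind of call site this enables (the "vdda" supply name and the 1.8 V values are arbitrary, not taken from the hdmi driver):

	#include <linux/err.h>
	#include <linux/regulator/consumer.h>

	static int my_enable_vdda(struct device *dev)
	{
		struct regulator *reg = devm_regulator_get(dev, "vdda");

		if (IS_ERR(reg))
			return PTR_ERR(reg);

		/* The new stub returns 0, so this branch is skipped cleanly
		 * when the regulator API is compiled out. */
		if (regulator_can_change_voltage(reg))
			return regulator_set_voltage(reg, 1800000, 1800000);

		return 0;
	}
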
Signed-off-by: Arnd Bergmann Cc: Mark Brown Cc: Tomi Valkeinen Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index e530681bea70..d60b92a7fc25 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -391,6 +391,11 @@ static inline void regulator_bulk_free(int num_consumers, { } +static inline int regulator_can_change_voltage(struct regulator *regulator) +{ + return 0; +} + static inline int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) { -- cgit v1.2.3-71-gd317 From 7e2b10c1e52ca37fb522be49f4be367f9311d0cd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:20:02 -0700 Subject: net: Support for multiple checksums with gso When creating a GSO packet segment we may need to set more than one checksum in the packet (for instance a TCP checksum and UDP checksum for VXLAN encapsulation). To be efficient, we want to do checksum calculation for any part of the packet at most once. This patch adds csum_start offset to skb_gso_cb. This tracks the starting offset for skb->csum which is initially set in skb_segment. When a protocol needs to compute a transport checksum it calls gso_make_checksum which computes the checksum value from the start of transport header to csum_start and then adds in skb->csum to get the full checksum. skb->csum and csum_start are then updated to reflect the checksum of the resultant packet starting from the transport header. This patch also adds a flag to skbuff, encap_hdr_csum, which is set in *gso_segment functions to indicate that a tunnel protocol needs checksum calculation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 26 +++++++++++++++++++++++++- net/core/skbuff.c | 8 +++++++- net/ipv4/ip_tunnel_core.c | 8 ++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7a9beeb1c458..d8d397acb52c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -567,7 +567,8 @@ struct sk_buff { * headers if needed */ __u8 encapsulation:1; - /* 6/8 bit hole (depending on ndisc_nodetype presence) */ + __u8 encap_hdr_csum:1; + /* 5/7 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL @@ -2988,6 +2989,7 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb) struct skb_gso_cb { int mac_offset; int encap_level; + __u16 csum_start; }; #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb) @@ -3012,6 +3014,28 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) return 0; } +/* Compute the checksum for a gso segment. First compute the checksum value + * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and + * then add in skb->csum (checksum from csum_start to end of packet). + * skb->csum and csum_start are then updated to reflect the checksum of the + * resultant packet starting from the transport header-- the resultant checksum + * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo + * header.
+ */ +static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) +{ + int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) - + skb_transport_offset(skb); + __u16 csum; + + csum = csum_fold(csum_partial(skb_transport_header(skb), + plen, skb->csum)); + skb->csum = res; + SKB_GSO_CB(skb)->csum_start -= plen; + + return csum; +} + static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3f6c7e8be8a4..05f4bef2ce12 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2885,7 +2885,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !!can_checksum_protocol(features, proto); + csum = !head_skb->encap_hdr_csum && + !!can_checksum_protocol(features, proto); + __skb_push(head_skb, doffset); headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -2983,6 +2985,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + offset; continue; } @@ -3052,6 +3056,8 @@ perform_csum_check: nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 847e69cbff7e..f4c987bb7e94 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -135,6 +135,14 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } + /* If packet is not gso and we are resolving any partial checksum, + * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL + * on the outer header without confusing devices that implement + * NETIF_F_IP_CSUM with encapsulation. + */ + if (csum_help) + skb->encapsulation = 0; + if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { err = skb_checksum_help(skb); if (unlikely(err)) -- cgit v1.2.3-71-gd317 From 0f4f4ffa7b7c3d29d0537a126145c9f8d8ed5dbc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:20:16 -0700 Subject: net: Add GSO support for UDP tunnels with checksum Added a new netif feature for GSO_UDP_TUNNEL_CSUM. This indicates that a device is capable of computing the UDP checksum in the encapsulating header of a UDP tunnel. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 1 + include/linux/skbuff.h | 2 ++ net/ipv4/af_inet.c | 1 + net/ipv4/tcp_offload.c | 1 + net/ipv4/udp.c | 40 +++++++++++++++++++--------------------- net/ipv4/udp_offload.c | 4 +++- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 4 +++- 8 files changed, 31 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index c26d0ec2ef3a..f1338e0f9866 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -45,6 +45,7 @@ enum { NETIF_F_GSO_IPIP_BIT, /* ... IPIP tunnel with TSO */ NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ + NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ NETIF_F_GSO_MPLS_BIT, /* ... 
MPLS segmentation */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ NETIF_F_GSO_MPLS_BIT, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8d397acb52c..5a6d10a538f5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -345,6 +345,8 @@ enum { SKB_GSO_UDP_TUNNEL = 1 << 9, SKB_GSO_MPLS = 1 << 10, + + SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, }; #if BITS_PER_LONG > 32 diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e9bb08a91e4..0070ab87109b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1258,6 +1258,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_SIT | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_MPLS | 0))) goto out; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index d8de7b9e0720..c02f2d2e7bab 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -61,6 +61,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_SIT | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a84f6762ea9e..8d8c33d84c9a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2528,7 +2528,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; netdev_features_t enc_features; - int outer_hlen; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; @@ -2540,6 +2544,10 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = skb_inner_network_offset(skb); skb->protocol = htons(ETH_P_TEB); + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); @@ -2550,10 +2558,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, } outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; skb = segs; do { struct udphdr *uh; - int udp_offset = outer_hlen - tnl_hlen; + int len; skb_reset_inner_headers(skb); skb->encapsulation = 1; @@ -2564,31 +2573,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; uh = udp_hdr(skb); - uh->len = htons(skb->len - udp_offset); - - /* csum segment if tunnel sets skb with csum. 
*/ - if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) { - struct iphdr *iph = ip_hdr(skb); + uh->len = htons(len); - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - udp_offset, - IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, - skb->len - udp_offset, 0)); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; + if (need_csum) { + __be32 delta = htonl(oldlen + len); - } else if (protocol == htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - u32 len = skb->len - udp_offset; + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - len, IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; } skb->protocol = protocol; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 88b4023ecfcf..5c23f4765af9 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -56,7 +56,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, __wsum csum; if (skb->encapsulation && - skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) { + (skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features); goto out; } @@ -71,6 +72,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_IPIP | SKB_GSO_GRE | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index b2f091566f88..d54c5744e3db 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -100,6 +100,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_MPLS | SKB_GSO_TCPV6 | 0))) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b261ee8b83fc..79da8b305ced 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -63,6 +63,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_GRE | SKB_GSO_IPIP | SKB_GSO_SIT | @@ -76,7 +77,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, goto out; } - if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) + if (skb->encapsulation && skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features); else { /* Do software UFO. Complete and fill in the UDP checksum as HW cannot -- cgit v1.2.3-71-gd317 From 4749c09c37030ccdc44aecebe0f71b02a377fc14 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:20:23 -0700 Subject: gre: Call gso_make_checksum Call gso_make_checksum. This should have the benefit of using a checksum that may have been previously computed for the packet. This also adds NETIF_F_GSO_GRE_CSUM to differentiate devices that offload GRE GSO with and without the GRE checksum offloaed. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/netdev_features.h | 2 ++ include/linux/skbuff.h | 2 ++ include/net/gre.h | 5 +++-- net/ipv4/af_inet.c | 1 + net/ipv4/gre_demux.c | 3 ++- net/ipv4/gre_offload.c | 10 ++++++++-- net/ipv4/tcp_offload.c | 1 + net/ipv4/udp_offload.c | 3 ++- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 1 + net/mpls/mpls_gso.c | 1 + 11 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index f1338e0f9866..e5a589435e2b 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -42,6 +42,7 @@ enum { NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */ NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ + NETIF_F_GSO_GRE_CSUM_BIT, /* ... GRE with csum with TSO */ NETIF_F_GSO_IPIP_BIT, /* ... IPIP tunnel with TSO */ NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ @@ -112,6 +113,7 @@ enum { #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) +#define NETIF_F_GSO_GRE_CSUM __NETIF_F(GSO_GRE_CSUM) #define NETIF_F_GSO_IPIP __NETIF_F(GSO_IPIP) #define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5a6d10a538f5..c705808bef9c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -347,6 +347,8 @@ enum { SKB_GSO_MPLS = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, + + SKB_GSO_GRE_CSUM = 1 << 12, }; #if BITS_PER_LONG > 32 diff --git a/include/net/gre.h b/include/net/gre.h index 70046a0b0b89..b53182018743 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -37,9 +37,10 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, int hdr_len); static inline struct sk_buff *gre_handle_offloads(struct sk_buff *skb, - bool gre_csum) + bool csum) { - return iptunnel_handle_offloads(skb, gre_csum, SKB_GSO_GRE); + return iptunnel_handle_offloads(skb, csum, + csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0070ab87109b..d5e6836cf772 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1254,6 +1254,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_TCPV6 | diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index fbfd829f4049..4e9619bca732 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -84,7 +84,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, ptr--; } if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { *ptr = 0; *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, skb->len, 0)); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index f1d32280cb54..24deb3928b9e 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -42,6 +42,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP))) goto out; @@ -55,6 +56,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, goto out; csum = !!(greh->flags & GRE_CSUM); + if (csum) + skb->encap_hdr_csum = 1; if (unlikely(!pskb_may_pull(skb, ghl))) goto out; @@ -94,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } } - greh = (struct gre_base_hdr *)(skb->data); + skb_reset_transport_header(skb); + + greh = (struct gre_base_hdr *) + skb_transport_header(skb); pcsum = (__be32 *)(greh + 1); *pcsum = 0; - *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } __skb_push(skb, tnl_hlen - ghl); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index c02f2d2e7bab..4e86c59ec7f7 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -57,6 +57,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_MPLS | diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 5c23f4765af9..7b1840110173 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -74,7 +74,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM | + SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index d54c5744e3db..65eda2a8af48 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -97,6 +97,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 79da8b305ced..0ae3d98f83e0 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -65,6 +65,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_MPLS) || diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 851cd880b0c0..6b38d083e1c9 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -33,6 +33,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | 
SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_MPLS))) goto out; -- cgit v1.2.3-71-gd317 From ca05e3a7551b71891d42d637d3a1e443c6bbd781 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sun, 1 Jun 2014 23:47:24 +0200 Subject: isdn/capi: move capi_info2str to capidrv.c capi_info2str() is apparently meant to be of general utility. It is actually only used in capidrv.c. So move it from capiutil.c to capidrv.c and (obviously) stop exporting it. And, since we're touching this, merge the two versions of this function. Signed-off-by: Paul Bolle Signed-off-by: Tilman Schmidt Signed-off-by: David S. Miller --- drivers/isdn/capi/capidrv.c | 195 ++++++++++++++++++++++++++++++++++++++++ drivers/isdn/capi/capiutil.c | 200 ------------------------------------------ include/linux/isdn/capiutil.h | 5 -- 3 files changed, 195 insertions(+), 205 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c index cc9f1927a322..70e67f6a22ff 100644 --- a/drivers/isdn/capi/capidrv.c +++ b/drivers/isdn/capi/capidrv.c @@ -763,6 +763,201 @@ static inline int new_bchan(capidrv_contr *card) } /* ------------------------------------------------------------------- */ +static char *capi_info2str(u16 reason) +{ +#ifndef CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON + return ".."; +#else + switch (reason) { + +/*-- informative values (corresponding message was processed) -----*/ + case 0x0001: + return "NCPI not supported by current protocol, NCPI ignored"; + case 0x0002: + return "Flags not supported by current protocol, flags ignored"; + case 0x0003: + return "Alert already sent by another application"; + +/*-- error information concerning CAPI_REGISTER -----*/ + case 0x1001: + return "Too many applications"; + case 0x1002: + return "Logical block size too small, must be at least 128 Bytes"; + case 0x1003: + return "Buffer exceeds 64 kByte"; + case 0x1004: + return "Message buffer size too small, must be at least 1024 Bytes"; + case 0x1005: + return "Max. number of logical connections not supported"; + case 0x1006: + return "Reserved"; + case 0x1007: + return "The message could not be accepted because of an internal busy condition"; + case 0x1008: + return "OS resource error (no memory ?)"; + case 0x1009: + return "CAPI not installed"; + case 0x100A: + return "Controller does not support external equipment"; + case 0x100B: + return "Controller does only support external equipment"; + +/*-- error information concerning message exchange functions -----*/ + case 0x1101: + return "Illegal application number"; + case 0x1102: + return "Illegal command or subcommand or message length less than 12 bytes"; + case 0x1103: + return "The message could not be accepted because of a queue full condition !! The error code does not imply that CAPI cannot receive messages directed to another controller, PLCI or NCCI"; + case 0x1104: + return "Queue is empty"; + case 0x1105: + return "Queue overflow, a message was lost !! This indicates a configuration error. 
The only recovery from this error is to perform a CAPI_RELEASE"; + case 0x1106: + return "Unknown notification parameter"; + case 0x1107: + return "The Message could not be accepted because of an internal busy condition"; + case 0x1108: + return "OS Resource error (no memory ?)"; + case 0x1109: + return "CAPI not installed"; + case 0x110A: + return "Controller does not support external equipment"; + case 0x110B: + return "Controller does only support external equipment"; + +/*-- error information concerning resource / coding problems -----*/ + case 0x2001: + return "Message not supported in current state"; + case 0x2002: + return "Illegal Controller / PLCI / NCCI"; + case 0x2003: + return "Out of PLCI"; + case 0x2004: + return "Out of NCCI"; + case 0x2005: + return "Out of LISTEN"; + case 0x2006: + return "Out of FAX resources (protocol T.30)"; + case 0x2007: + return "Illegal message parameter coding"; + +/*-- error information concerning requested services -----*/ + case 0x3001: + return "B1 protocol not supported"; + case 0x3002: + return "B2 protocol not supported"; + case 0x3003: + return "B3 protocol not supported"; + case 0x3004: + return "B1 protocol parameter not supported"; + case 0x3005: + return "B2 protocol parameter not supported"; + case 0x3006: + return "B3 protocol parameter not supported"; + case 0x3007: + return "B protocol combination not supported"; + case 0x3008: + return "NCPI not supported"; + case 0x3009: + return "CIP Value unknown"; + case 0x300A: + return "Flags not supported (reserved bits)"; + case 0x300B: + return "Facility not supported"; + case 0x300C: + return "Data length not supported by current protocol"; + case 0x300D: + return "Reset procedure not supported by current protocol"; + +/*-- informations about the clearing of a physical connection -----*/ + case 0x3301: + return "Protocol error layer 1 (broken line or B-channel removed by signalling protocol)"; + case 0x3302: + return "Protocol error layer 2"; + case 0x3303: + return "Protocol error layer 3"; + case 0x3304: + return "Another application got that call"; +/*-- T.30 specific reasons -----*/ + case 0x3311: + return "Connecting not successful (remote station is no FAX G3 machine)"; + case 0x3312: + return "Connecting not successful (training error)"; + case 0x3313: + return "Disconnected before transfer (remote station does not support transfer mode, e.g. resolution)"; + case 0x3314: + return "Disconnected during transfer (remote abort)"; + case 0x3315: + return "Disconnected during transfer (remote procedure error, e.g. unsuccessful repetition of T.30 commands)"; + case 0x3316: + return "Disconnected during transfer (local tx data underrun)"; + case 0x3317: + return "Disconnected during transfer (local rx data overflow)"; + case 0x3318: + return "Disconnected during transfer (local abort)"; + case 0x3319: + return "Illegal parameter coding (e.g. 
SFF coding error)"; + +/*-- disconnect causes from the network according to ETS 300 102-1/Q.931 -----*/ + case 0x3481: return "Unallocated (unassigned) number"; + case 0x3482: return "No route to specified transit network"; + case 0x3483: return "No route to destination"; + case 0x3486: return "Channel unacceptable"; + case 0x3487: + return "Call awarded and being delivered in an established channel"; + case 0x3490: return "Normal call clearing"; + case 0x3491: return "User busy"; + case 0x3492: return "No user responding"; + case 0x3493: return "No answer from user (user alerted)"; + case 0x3495: return "Call rejected"; + case 0x3496: return "Number changed"; + case 0x349A: return "Non-selected user clearing"; + case 0x349B: return "Destination out of order"; + case 0x349C: return "Invalid number format"; + case 0x349D: return "Facility rejected"; + case 0x349E: return "Response to STATUS ENQUIRY"; + case 0x349F: return "Normal, unspecified"; + case 0x34A2: return "No circuit / channel available"; + case 0x34A6: return "Network out of order"; + case 0x34A9: return "Temporary failure"; + case 0x34AA: return "Switching equipment congestion"; + case 0x34AB: return "Access information discarded"; + case 0x34AC: return "Requested circuit / channel not available"; + case 0x34AF: return "Resources unavailable, unspecified"; + case 0x34B1: return "Quality of service unavailable"; + case 0x34B2: return "Requested facility not subscribed"; + case 0x34B9: return "Bearer capability not authorized"; + case 0x34BA: return "Bearer capability not presently available"; + case 0x34BF: return "Service or option not available, unspecified"; + case 0x34C1: return "Bearer capability not implemented"; + case 0x34C2: return "Channel type not implemented"; + case 0x34C5: return "Requested facility not implemented"; + case 0x34C6: return "Only restricted digital information bearer capability is available"; + case 0x34CF: return "Service or option not implemented, unspecified"; + case 0x34D1: return "Invalid call reference value"; + case 0x34D2: return "Identified channel does not exist"; + case 0x34D3: return "A suspended call exists, but this call identity does not"; + case 0x34D4: return "Call identity in use"; + case 0x34D5: return "No call suspended"; + case 0x34D6: return "Call having the requested call identity has been cleared"; + case 0x34D8: return "Incompatible destination"; + case 0x34DB: return "Invalid transit network selection"; + case 0x34DF: return "Invalid message, unspecified"; + case 0x34E0: return "Mandatory information element is missing"; + case 0x34E1: return "Message type non-existent or not implemented"; + case 0x34E2: return "Message not compatible with call state or message type non-existent or not implemented"; + case 0x34E3: return "Information element non-existent or not implemented"; + case 0x34E4: return "Invalid information element contents"; + case 0x34E5: return "Message not compatible with call state"; + case 0x34E6: return "Recovery on timer expiry"; + case 0x34EF: return "Protocol error, unspecified"; + case 0x34FF: return "Interworking, unspecified"; + + default: return "No additional information"; + } +#endif +} static void handle_controller(_cmsg *cmsg) { diff --git a/drivers/isdn/capi/capiutil.c b/drivers/isdn/capi/capiutil.c index d26f17033b68..6e797e502cfa 100644 --- a/drivers/isdn/capi/capiutil.c +++ b/drivers/isdn/capi/capiutil.c @@ -22,205 +22,6 @@ /* from CAPI2.0 DDK AVM Berlin GmbH */ -#ifndef CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON -char *capi_info2str(u16 reason) -{ 
- return ".."; -} -#else -char *capi_info2str(u16 reason) -{ - switch (reason) { - -/*-- informative values (corresponding message was processed) -----*/ - case 0x0001: - return "NCPI not supported by current protocol, NCPI ignored"; - case 0x0002: - return "Flags not supported by current protocol, flags ignored"; - case 0x0003: - return "Alert already sent by another application"; - -/*-- error information concerning CAPI_REGISTER -----*/ - case 0x1001: - return "Too many applications"; - case 0x1002: - return "Logical block size too small, must be at least 128 Bytes"; - case 0x1003: - return "Buffer exceeds 64 kByte"; - case 0x1004: - return "Message buffer size too small, must be at least 1024 Bytes"; - case 0x1005: - return "Max. number of logical connections not supported"; - case 0x1006: - return "Reserved"; - case 0x1007: - return "The message could not be accepted because of an internal busy condition"; - case 0x1008: - return "OS resource error (no memory ?)"; - case 0x1009: - return "CAPI not installed"; - case 0x100A: - return "Controller does not support external equipment"; - case 0x100B: - return "Controller does only support external equipment"; - -/*-- error information concerning message exchange functions -----*/ - case 0x1101: - return "Illegal application number"; - case 0x1102: - return "Illegal command or subcommand or message length less than 12 bytes"; - case 0x1103: - return "The message could not be accepted because of a queue full condition !! The error code does not imply that CAPI cannot receive messages directed to another controller, PLCI or NCCI"; - case 0x1104: - return "Queue is empty"; - case 0x1105: - return "Queue overflow, a message was lost !! This indicates a configuration error. The only recovery from this error is to perform a CAPI_RELEASE"; - case 0x1106: - return "Unknown notification parameter"; - case 0x1107: - return "The Message could not be accepted because of an internal busy condition"; - case 0x1108: - return "OS Resource error (no memory ?)"; - case 0x1109: - return "CAPI not installed"; - case 0x110A: - return "Controller does not support external equipment"; - case 0x110B: - return "Controller does only support external equipment"; - -/*-- error information concerning resource / coding problems -----*/ - case 0x2001: - return "Message not supported in current state"; - case 0x2002: - return "Illegal Controller / PLCI / NCCI"; - case 0x2003: - return "Out of PLCI"; - case 0x2004: - return "Out of NCCI"; - case 0x2005: - return "Out of LISTEN"; - case 0x2006: - return "Out of FAX resources (protocol T.30)"; - case 0x2007: - return "Illegal message parameter coding"; - -/*-- error information concerning requested services -----*/ - case 0x3001: - return "B1 protocol not supported"; - case 0x3002: - return "B2 protocol not supported"; - case 0x3003: - return "B3 protocol not supported"; - case 0x3004: - return "B1 protocol parameter not supported"; - case 0x3005: - return "B2 protocol parameter not supported"; - case 0x3006: - return "B3 protocol parameter not supported"; - case 0x3007: - return "B protocol combination not supported"; - case 0x3008: - return "NCPI not supported"; - case 0x3009: - return "CIP Value unknown"; - case 0x300A: - return "Flags not supported (reserved bits)"; - case 0x300B: - return "Facility not supported"; - case 0x300C: - return "Data length not supported by current protocol"; - case 0x300D: - return "Reset procedure not supported by current protocol"; - -/*-- informations about the clearing of a physical 
connection -----*/ - case 0x3301: - return "Protocol error layer 1 (broken line or B-channel removed by signalling protocol)"; - case 0x3302: - return "Protocol error layer 2"; - case 0x3303: - return "Protocol error layer 3"; - case 0x3304: - return "Another application got that call"; -/*-- T.30 specific reasons -----*/ - case 0x3311: - return "Connecting not successful (remote station is no FAX G3 machine)"; - case 0x3312: - return "Connecting not successful (training error)"; - case 0x3313: - return "Disconnected before transfer (remote station does not support transfer mode, e.g. resolution)"; - case 0x3314: - return "Disconnected during transfer (remote abort)"; - case 0x3315: - return "Disconnected during transfer (remote procedure error, e.g. unsuccessful repetition of T.30 commands)"; - case 0x3316: - return "Disconnected during transfer (local tx data underrun)"; - case 0x3317: - return "Disconnected during transfer (local rx data overflow)"; - case 0x3318: - return "Disconnected during transfer (local abort)"; - case 0x3319: - return "Illegal parameter coding (e.g. SFF coding error)"; - -/*-- disconnect causes from the network according to ETS 300 102-1/Q.931 -----*/ - case 0x3481: return "Unallocated (unassigned) number"; - case 0x3482: return "No route to specified transit network"; - case 0x3483: return "No route to destination"; - case 0x3486: return "Channel unacceptable"; - case 0x3487: - return "Call awarded and being delivered in an established channel"; - case 0x3490: return "Normal call clearing"; - case 0x3491: return "User busy"; - case 0x3492: return "No user responding"; - case 0x3493: return "No answer from user (user alerted)"; - case 0x3495: return "Call rejected"; - case 0x3496: return "Number changed"; - case 0x349A: return "Non-selected user clearing"; - case 0x349B: return "Destination out of order"; - case 0x349C: return "Invalid number format"; - case 0x349D: return "Facility rejected"; - case 0x349E: return "Response to STATUS ENQUIRY"; - case 0x349F: return "Normal, unspecified"; - case 0x34A2: return "No circuit / channel available"; - case 0x34A6: return "Network out of order"; - case 0x34A9: return "Temporary failure"; - case 0x34AA: return "Switching equipment congestion"; - case 0x34AB: return "Access information discarded"; - case 0x34AC: return "Requested circuit / channel not available"; - case 0x34AF: return "Resources unavailable, unspecified"; - case 0x34B1: return "Quality of service unavailable"; - case 0x34B2: return "Requested facility not subscribed"; - case 0x34B9: return "Bearer capability not authorized"; - case 0x34BA: return "Bearer capability not presently available"; - case 0x34BF: return "Service or option not available, unspecified"; - case 0x34C1: return "Bearer capability not implemented"; - case 0x34C2: return "Channel type not implemented"; - case 0x34C5: return "Requested facility not implemented"; - case 0x34C6: return "Only restricted digital information bearer capability is available"; - case 0x34CF: return "Service or option not implemented, unspecified"; - case 0x34D1: return "Invalid call reference value"; - case 0x34D2: return "Identified channel does not exist"; - case 0x34D3: return "A suspended call exists, but this call identity does not"; - case 0x34D4: return "Call identity in use"; - case 0x34D5: return "No call suspended"; - case 0x34D6: return "Call having the requested call identity has been cleared"; - case 0x34D8: return "Incompatible destination"; - case 0x34DB: return "Invalid transit network selection"; 
- case 0x34DF: return "Invalid message, unspecified"; - case 0x34E0: return "Mandatory information element is missing"; - case 0x34E1: return "Message type non-existent or not implemented"; - case 0x34E2: return "Message not compatible with call state or message type non-existent or not implemented"; - case 0x34E3: return "Information element non-existent or not implemented"; - case 0x34E4: return "Invalid information element contents"; - case 0x34E5: return "Message not compatible with call state"; - case 0x34E6: return "Recovery on timer expiry"; - case 0x34EF: return "Protocol error, unspecified"; - case 0x34FF: return "Interworking, unspecified"; - - default: return "No additional information"; - } -} -#endif - typedef struct { int typ; size_t off; @@ -1073,4 +874,3 @@ EXPORT_SYMBOL(capi_cmsg_header); EXPORT_SYMBOL(capi_cmd2str); EXPORT_SYMBOL(capi_cmsg2str); EXPORT_SYMBOL(capi_message2str); -EXPORT_SYMBOL(capi_info2str); diff --git a/include/linux/isdn/capiutil.h b/include/linux/isdn/capiutil.h index 5a52f2c94f3f..44bd6046e6e2 100644 --- a/include/linux/isdn/capiutil.h +++ b/include/linux/isdn/capiutil.h @@ -164,11 +164,6 @@ unsigned capi_cmsg_header(_cmsg * cmsg, __u16 _ApplId, __u8 _Command, __u8 _Subcommand, __u16 _Messagenumber, __u32 _Controller); -/* - * capi_info2str generated a readable string for Capi2.0 reasons. - */ -char *capi_info2str(__u16 reason); - /*-----------------------------------------------------------------------*/ /* -- cgit v1.2.3-71-gd317 From 4fc828e24cd9c385d3a44e1b499ec7fc70239d8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 2 May 2014 11:24:15 -0700 Subject: locking/rwsem: Support optimistic spinning We have reached the point where our mutexes are quite fine tuned for a number of situations. This includes the use of heuristics and optimistic spinning, based on MCS locking techniques. Exclusive ownership of read-write semaphores is, conceptually, just about the same as with mutexes, making them close cousins. To this end we need to make them both perform similarly, and right now, rwsems are simply not up to it. This was discovered by both reverting commit 4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable) and, similarly, converting some other mutexes (i.e. i_mmap_mutex) to rwsems. This creates a situation where users have to choose between a rwsem and a mutex taking into account this important performance difference. Specifically, the biggest difference between the two locks is that when we fail to acquire a mutex in the fastpath, optimistic spinning comes into play and we can avoid a large amount of unnecessary sleeping and the overhead of moving tasks in and out of the wait queue. Rwsems do not have such logic. This patch, based on the work from Tim Chen and me, adds support for write-side optimistic spinning when the lock is contended. It also includes support for the recently added cancelable MCS locking for adaptive spinning. Note that this is only applicable to the xadd method; the spinlock rwsem variant remains intact. Allowing optimistic spinning before putting the writer on the wait queue reduces wait queue contention and provides a greater chance for the rwsem to get acquired. With these changes, rwsem is on par with mutex. The performance benefits can be seen on a number of workloads; a conceptual sketch of the spinning fast path and concrete numbers follow below.
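Conceptually, a writer now makes a bounded spinning attempt before queueing and sleeping. Below is a minimal user-space sketch of that idea, assuming C11 atomics; struct sketch_rwsem, try_write_lock(), write_lock_optimistic() and the fixed spin budget are illustrative stand-ins, not the kernel API. The actual patch below replaces the spin counter with the cancelable MCS (osq) lock, RCU and the owner->on_cpu check.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct sketch_rwsem {
	atomic_long count;		/* 0 free, -1 write-locked, >0 readers */
	_Atomic(void *) owner;		/* spinning hint: writer holding the lock */
};

static bool try_write_lock(struct sketch_rwsem *sem, void *me)
{
	long free = 0;

	/* single atomic attempt at exclusive ownership */
	if (atomic_compare_exchange_strong(&sem->count, &free, -1)) {
		atomic_store(&sem->owner, me);	/* advertise to other spinners */
		return true;
	}
	return false;
}

static void write_unlock(struct sketch_rwsem *sem)
{
	atomic_store(&sem->owner, NULL);
	atomic_store(&sem->count, 0);
}

/*
 * Returns true if the lock was stolen by spinning; on false the caller
 * falls back to the sleeping slowpath (wait queue + schedule()).
 */
static bool write_lock_optimistic(struct sketch_rwsem *sem, void *me)
{
	for (int spins = 0; spins < 10000; spins++) {
		if (try_write_lock(sem, me))
			return true;
		/*
		 * Spin only while a writer owns the lock: it is likely
		 * still running and will release soon.  If readers own it
		 * (count != 0 but no owner), spinning is futile because
		 * readers are not tracked and may well be asleep.
		 */
		if (atomic_load(&sem->owner) == NULL)
			break;
	}
	return false;
}

In the patch, the failure case proceeds to rwsem_down_write_failed(), which is reworked below to perform exactly this kind of attempt (rwsem_optimistic_spin()) before touching the wait list.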
For instance, on an 8-socket, 80-core, 64-bit Westmere box, aim7 shows the following improvements in throughput: +--------------+---------------------+-----------------+ | Workload | throughput-increase | number of users | +--------------+---------------------+-----------------+ | alltests | 20% | >1000 | | custom | 27%, 60% | 10-100, >1000 | | high_systime | 36%, 30% | >100, >1000 | | shared | 58%, 29% | 10-100, >1000 | +--------------+---------------------+-----------------+ There was also improvement on smaller systems, such as a quad-core x86-64 laptop running a 30 GB PostgreSQL (pgbench) workload, with up to +60% in throughput for over 50 clients. Additionally, benefits were also noticed in exim (mail server) workloads. Furthermore, no performance regressions have been seen at all. Based-on-work-from: Tim Chen Signed-off-by: Davidlohr Bueso [peterz: rej fixup due to comment patches, sched/rt.h header] Signed-off-by: Peter Zijlstra Cc: Alex Shi Cc: Andi Kleen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: "Paul E.McKenney" Cc: Jason Low Cc: Aswin Chandramouleeswaran Cc: Andrew Morton Cc: Linus Torvalds Cc: "Scott J Norton" Cc: Andrea Arcangeli Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1399055055.6275.15.camel@buesod1.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 25 ++++- kernel/locking/rwsem-xadd.c | 225 ++++++++++++++++++++++++++++++++++++++------ kernel/locking/rwsem.c | 31 +++++- 3 files changed, 248 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 03f3b05e8ec1..3e108f154cb6 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,6 +16,7 @@ #include +struct optimistic_spin_queue; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -23,9 +24,17 @@ struct rw_semaphore; #else /* All arch specific implementations share the same struct */ struct rw_semaphore { - long count; - raw_spinlock_t wait_lock; - struct list_head wait_list; + long count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_SMP + /* + * Write owner. Used as a speculative check to see + * if the owner is running on the cpu. + */ + struct task_struct *owner; + struct optimistic_spin_queue *osq; /* spinner MCS lock */ +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif +#ifdef CONFIG_SMP +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list), \ + NULL, /* owner */ \ + NULL /* mcs lock */ \ + __RWSEM_DEP_MAP_INIT(name) } +#else #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, \ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list) \ __RWSEM_DEP_MAP_INIT(name) } +#endif #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b4219ff87b8c..4a75278142cd 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,11 +5,17 @@ * * Writer lock-stealing by Alex Shi * and Michel Lespinasse + * + * Optimistic spinning by Tim Chen + * and Davidlohr Bueso . Based on mutexes. */ #include #include #include #include +#include + +#include "mcs_spinlock.h" /* * Guide to the rw_semaphore's count field for common values.
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_SMP + sem->owner = NULL; + sem->osq = NULL; +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ __visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) @@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_SMP /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, the rwsem owner may have + * just acquired it and not set the owner yet or the rwsem + * has been released. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. 
+ */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock */ __visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + /* + * If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. 
*/ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..42f806de49d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include +#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); -- cgit v1.2.3-71-gd317 From dbb5eafa23fdb7a00d64b04e23a3efc59d6474cb Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 19 May 2014 17:27:57 -0700 Subject: locking/rwsem: Fix warnings for CONFIG_RWSEM_GENERIC_SPINLOCK Optimistic spinning is only used by the xadd variant of rw-semaphores. 
Make sure that we use the old version of the __RWSEM_INITIALIZER macro for systems that rely on the spinlock one, otherwise warnings can be triggered, such as the following, reported on an ARM box: ipc/ipcns_notifier.c:22:8: warning: excess elements in struct initializer [enabled by default] ipc/ipcns_notifier.c:22:8: warning: (near initialization for 'ipcns_chain.rwsem') [enabled by default] ipc/ipcns_notifier.c:22:8: warning: excess elements in struct initializer [enabled by default] ipc/ipcns_notifier.c:22:8: warning: (near initialization for 'ipcns_chain.rwsem') [enabled by default] Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra Cc: Tim Chen Cc: Linus Torvalds Cc: Paul McKenney Cc: Michel Lespinasse Cc: Peter Hurley Cc: Alex Shi Cc: Rik van Riel Cc: Andrew Morton Cc: Andrea Arcangeli Cc: "H. Peter Anvin" Cc: Jason Low Cc: Andi Kleen Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1400545677.6399.10.camel@buesod1.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 3e108f154cb6..8d79708146aa 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -64,7 +64,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK) #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, \ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ -- cgit v1.2.3-71-gd317 From fa93384f40deeb294fd29f2fdcadbd0ebc2dedf1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 May 2014 13:20:42 +0300 Subject: sched: Fix signedness bug in yield_to() yield_to() is supposed to return -ESRCH if there is no task to yield to, but because the return type is bool, the non-zero -ESRCH is silently converted to true. The only place I see that cares is kvm_vcpu_on_spin().
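The conversion hazard is easy to demonstrate in isolation. A minimal sketch follows; find_target() is hypothetical and stands in for the task lookup, purely for illustration:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical lookup: pretend no suitable task was found. */
static int find_target(void)
{
	return 0;
}

/* Buggy variant: the error code is squashed to true. */
static bool yield_to_bool(void)
{
	if (!find_target())
		return -ESRCH;	/* any non-zero value becomes (bool)1 */
	return true;
}

/* Fixed variant: the caller can still test for ret < 0. */
static int yield_to_int(void)
{
	if (!find_target())
		return -ESRCH;	/* preserved; -3 on typical Linux systems */
	return 1;
}

int main(void)
{
	/* prints "bool: 1, int: -3" -- only the int keeps the error code */
	printf("bool: %d, int: %d\n", yield_to_bool(), yield_to_int());
	return 0;
}

Hence the fix below simply widens the return type to int on the whole call chain, leaving the 0/1 success values unchanged.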
Signed-off-by: Dan Carpenter Reviewed-by: Raghavendra Signed-off-by: Peter Zijlstra Cc: Gleb Natapov Cc: Linus Torvalds Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/20140523102042.GA7267@mwanda Signed-off-by: Ingo Molnar --- include/linux/kvm_host.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d21cf9f4380..3c4bcf146159 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -584,7 +584,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -bool kvm_vcpu_yield_to(struct kvm_vcpu *target); +int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f91d00efd87..6790c3b42072 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2180,7 +2180,7 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -extern bool yield_to(struct task_struct *p, bool preempt); +extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 321d800e4baa..afcc84234a3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4195,7 +4195,7 @@ EXPORT_SYMBOL(yield); * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. */ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 56baae8c2f56..86d1c457458d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1708,11 +1708,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ -bool kvm_vcpu_yield_to(struct kvm_vcpu *target) +int kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct pid *pid; struct task_struct *task = NULL; - bool ret = false; + int ret = 0; rcu_read_lock(); pid = rcu_dereference(target->pid); -- cgit v1.2.3-71-gd317 From 63b2ca30bdb3dbf60bc7ac5f46713c0d32308261 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:37 -0400 Subject: sched: Let 'struct sched_group_power' care about CPU capacity It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Since struct sched_group_power is really about compute capacity of sched groups, let's rename it to struct sched_group_capacity. Similarly sgp becomes sgc. Related variables and functions dealing with groups are also adjusted accordingly. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. 
Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-5yeix833vvgf2uyj5o36hpu9@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched/core.c | 81 +++++++++++++++---------------- kernel/sched/fair.c | 131 +++++++++++++++++++++++++------------------------- kernel/sched/sched.h | 16 +++--- 4 files changed, 115 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6790c3b42072..a96f03598c61 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1013,7 +1013,7 @@ typedef const int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; + struct sched_group_capacity **__percpu sgc; }; struct sched_domain_topology_level { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index afcc84234a3e..2e1fb0902200 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5221,14 +5221,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, } /* - * Even though we initialize ->power to something semi-sane, - * we leave power_orig unset. This allows us to detect if + * Even though we initialize ->capacity to something semi-sane, + * we leave capacity_orig unset. This allows us to detect if * domain iteration is still funny without causing /0 traps. */ - if (!group->sgp->power_orig) { + if (!group->sgc->capacity_orig) { printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); break; } @@ -5250,9 +5249,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); + if (group->sgc->capacity != SCHED_POWER_SCALE) { + printk(KERN_CONT " (cpu_capacity = %d)", + group->sgc->capacity); } group = group->next; @@ -5466,7 +5465,7 @@ static struct root_domain *alloc_rootdomain(void) return rd; } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc) { struct sched_group *tmp, *first; @@ -5477,8 +5476,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) do { tmp = sg->next; - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); kfree(sg); sg = tmp; @@ -5496,7 +5495,7 @@ static void free_sched_domain(struct rcu_head *rcu) if (sd->flags & SD_OVERLAP) { free_sched_groups(sd->groups, 1); } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); + kfree(sd->groups->sgc); kfree(sd->groups); } kfree(sd); @@ -5707,17 +5706,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, i); - if (atomic_inc_return(&sg->sgp->ref) == 1) + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) build_group_mask(sd, sg); /* - * Initialize sgp->power such that even if we mess up the + * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. 
*/ - sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); - sg->sgp->power_orig = sg->sgp->power; + sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5755,8 +5754,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ } return cpu; @@ -5819,16 +5818,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) } /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity. * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; @@ -5842,8 +5841,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_balance_cpu(sg)) return; - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); + update_group_capacity(sd, cpu); + atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -5934,8 +5933,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; } #ifdef CONFIG_NUMA @@ -6337,14 +6336,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) return -ENOMEM; for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -6362,12 +6361,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); - if (!sgp) + if (!sgc) return -ENOMEM; - *per_cpu_ptr(sdd->sgp, j) = sgp; + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -6394,15 +6393,15 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgp) - kfree(*per_cpu_ptr(sdd->sgp, j)); + if (sdd->sgc) + 
kfree(*per_cpu_ptr(sdd->sgc, j)); } free_percpu(sdd->sd); sdd->sd = NULL; free_percpu(sdd->sg); sdd->sg = NULL; - free_percpu(sdd->sgp); - sdd->sgp = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; } } @@ -6479,7 +6478,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { claim_allocations(i, sd); - init_sched_groups_power(i, sd); + init_sched_groups_capacity(i, sd); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e401e446e87c..36bd4d23fca8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4369,8 +4369,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5532,7 +5532,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_power; + unsigned long group_capacity; unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity_factor; unsigned int idle_cpus; @@ -5553,7 +5553,7 @@ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -5572,7 +5572,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .busiest = NULL, .local = NULL, .total_load = 0UL, - .total_pwr = 0UL, + .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, }, @@ -5681,7 +5681,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -5697,26 +5697,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + sdg->sgc->capacity = power; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long capacity, capacity_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; + sdg->sgc->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); return; } - power_orig = power = 0; + capacity_orig = capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5725,31 +5725,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* - * build_sched_domains() -> init_sched_groups_power() + * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've 
attached the domains to the * runqueues. * * Use power_of(), which is set irrespective of domains * in update_cpu_power(). * - * This avoids power/power_orig from being 0 and + * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. * - * Runtime updates will correct power_orig. + * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); + capacity_orig += power_of(cpu); + capacity += power_of(cpu); continue; } - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; } } else { /* @@ -5759,14 +5759,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; - power += group->sgp->power; + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -5786,9 +5786,9 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; @@ -5825,7 +5825,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) static inline int sg_imbalanced(struct sched_group *group) { - return group->sgp->imbalance; + return group->sgc->imbalance; } /* @@ -5833,22 +5833,23 @@ static inline int sg_imbalanced(struct sched_group *group) * * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. + * and limit unit capacity with that. 
*/ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { unsigned int capacity_factor, smt, cpus; - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; - power = group->sgp->power; - power_orig = group->sgp->power_orig; + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; cpus = group->group_weight; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ - capacity_factor = min_t(unsigned, capacity_factor, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5892,9 +5893,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU power of the group */ - sgs->group_power = group->sgp->power; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6009,8 +6010,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs = &sds->local_stat; if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); } update_sg_lb_stats(env, sg, load_idx, local_group, sgs); @@ -6040,7 +6041,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; + sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); @@ -6087,7 +6088,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_power, + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, SCHED_POWER_SCALE); return 1; @@ -6103,7 +6104,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; @@ -6118,7 +6119,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; + busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= local->avg_load + (scaled_busy_load_per_task * imbn)) { @@ -6132,34 +6133,34 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) * moving them. 
*/ - pwr_now += busiest->group_power * + capa_now += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load); - pwr_now += local->group_power * + capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { - pwr_move += busiest->group_power * + capa_move += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_power < + if (busiest->avg_load * busiest->group_capacity < busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * busiest->group_power) / - local->group_power; + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; } else { tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - local->group_power; + local->group_capacity; } - pwr_move += local->group_power * + capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) + if (capa_move > capa_now) env->imbalance = busiest->load_per_task; } @@ -6207,7 +6208,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->sum_nr_running - busiest->group_capacity_factor); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= busiest->group_power; + load_above_capacity /= busiest->group_capacity; } /* @@ -6222,8 +6223,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * busiest->group_power, - (sds->avg_load - local->avg_load) * local->group_power + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_POWER_SCALE; /* @@ -6278,7 +6279,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6611,7 +6612,7 @@ more_balance: * We failed to reach balance because of affinity. */ if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; + int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { *group_imbalance = 1; @@ -6998,7 +6999,7 @@ static inline void set_cpu_sd_state_busy(void) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7015,7 +7016,7 @@ void set_cpu_sd_state_idle(void) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7219,7 +7220,7 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. 
*/ @@ -7227,7 +7228,7 @@ static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; if (unlikely(rq->idle_balance)) @@ -7257,8 +7258,8 @@ static inline int nohz_kick_needed(struct rq *rq) sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) goto need_kick_unlock; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 600e2291a75c..a5b957d53c92 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -728,15 +728,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); -struct sched_group_power { +struct sched_group_capacity { atomic_t ref; /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. + * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * for a single CPU. */ - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; unsigned long next_update; - int imbalance; /* XXX unrelated to power but shared group state */ + int imbalance; /* XXX unrelated to capacity but shared group state */ /* * Number of busy cpus in this group. */ @@ -750,7 +750,7 @@ struct sched_group { atomic_t ref; unsigned int group_weight; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; /* * The CPUs this group covers. @@ -773,7 +773,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) */ static inline struct cpumask *sched_group_mask(struct sched_group *sg) { - return to_cpumask(sg->sgp->cpumask); + return to_cpumask(sg->sgc->cpumask); } /** @@ -1167,7 +1167,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP -extern void update_group_power(struct sched_domain *sd, int cpu); +extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -- cgit v1.2.3-71-gd317 From ca8ce3d0b144c318a5a9ce99649053e9029061ea Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:39 -0400 Subject: sched: Final power vs. capacity cleanups It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. This contains the architecture visible changes. Incidentally, only ARM takes advantage of the available pow^H^H^Hcapacity scaling hooks and therefore those changes outside kernel/sched/ are confined to one ARM specific file. The default arch_scale_smt_power() hook is not overridden by anyone. Replacements are as follows: arch_scale_freq_power --> arch_scale_freq_capacity arch_scale_smt_power --> arch_scale_smt_capacity SCHED_POWER_SCALE --> SCHED_CAPACITY_SCALE SCHED_POWER_SHIFT --> SCHED_CAPACITY_SHIFT The local usage of "power" in arch/arm/kernel/topology.c is also changed to "capacity" as appropriate. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. 
Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Arnd Bergmann Cc: Dietmar Eggemann Cc: Grant Likely Cc: Linus Torvalds Cc: Mark Brown Cc: Rob Herring Cc: Russell King Cc: Sudeep KarkadaNagesha Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-48zba9qbznvglwelgq2cfygh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/topology.c | 54 +++++++++++++++++++++--------------------- include/linux/sched.h | 6 ++--- kernel/sched/core.c | 6 ++--- kernel/sched/fair.c | 59 +++++++++++++++++++++++----------------------- 4 files changed, 63 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 71e1fec6d31a..d42a7db22236 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -26,30 +26,30 @@ #include /* - * cpu power scale management + * cpu capacity scale management */ /* - * cpu power table + * cpu capacity table * This per cpu data structure describes the relative capacity of each core. * On a heteregenous system, cores don't have the same computation capacity - * and we reflect that difference in the cpu_power field so the scheduler can - * take this difference into account during load balance. A per cpu structure - * is preferred because each CPU updates its own cpu_power field during the - * load balance except for idle cores. One idle core is selected to run the - * rebalance_domains for all idle cores and the cpu_power can be updated - * during this sequence. + * and we reflect that difference in the cpu_capacity field so the scheduler + * can take this difference into account during load balance. A per cpu + * structure is preferred because each CPU updates its own cpu_capacity field + * during the load balance except for idle cores. One idle core is selected + * to run the rebalance_domains for all idle cores and the cpu_capacity can be + * updated during this sequence. */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); } -static void set_power_scale(unsigned int cpu, unsigned long power) +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) { - per_cpu(cpu_scale, cpu) = power; + per_cpu(cpu_scale, cpu) = capacity; } #ifdef CONFIG_OF @@ -62,11 +62,11 @@ struct cpu_efficiency { * Table of relative efficiency of each processors * The efficiency value must fit in 20bit and the final * cpu_scale value must be in the range - * 0 < cpu_scale < 3*SCHED_POWER_SCALE/2 + * 0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2 * in order to return at most 1 when DIV_ROUND_CLOSEST * is used to compute the capacity of a CPU. * Processors that are not defined in the table, - * use the default SCHED_POWER_SCALE value for cpu_scale. + * use the default SCHED_CAPACITY_SCALE value for cpu_scale. */ static const struct cpu_efficiency table_efficiency[] = { {"arm,cortex-a15", 3891}, @@ -83,9 +83,9 @@ static unsigned long middle_capacity = 1; * Iterate all CPUs' descriptor in DT and compute the efficiency * (as per table_efficiency). Also calculate a middle efficiency * as close as possible to (max{eff_i} - min{eff_i}) / 2 - * This is later used to scale the cpu_power field such that an - * 'average' CPU is of middle power. Also see the comments near - * table_efficiency[] and update_cpu_power(). 
+ * This is later used to scale the cpu_capacity field such that an + * 'average' CPU is of middle capacity. Also see the comments near + * table_efficiency[] and update_cpu_capacity(). */ static void __init parse_dt_topology(void) { @@ -141,15 +141,15 @@ static void __init parse_dt_topology(void) * cpu_scale because all CPUs have the same capacity. Otherwise, we * compute a middle_capacity factor that will ensure that the capacity * of an 'average' CPU of the system will be as close as possible to - * SCHED_POWER_SCALE, which is the default value, but with the + * SCHED_CAPACITY_SCALE, which is the default value, but with the * constraint explained near table_efficiency[]. */ if (4*max_capacity < (3*(max_capacity + min_capacity))) middle_capacity = (min_capacity + max_capacity) - >> (SCHED_POWER_SHIFT+1); + >> (SCHED_CAPACITY_SHIFT+1); else middle_capacity = ((max_capacity / 3) - >> (SCHED_POWER_SHIFT-1)) + 1; + >> (SCHED_CAPACITY_SHIFT-1)) + 1; } @@ -158,20 +158,20 @@ static void __init parse_dt_topology(void) * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the * function returns directly for SMP system. */ -static void update_cpu_power(unsigned int cpu) +static void update_cpu_capacity(unsigned int cpu) { if (!cpu_capacity(cpu)) return; - set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity); + set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); - printk(KERN_INFO "CPU%u: update cpu_power %lu\n", - cpu, arch_scale_freq_power(NULL, cpu)); + printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", + cpu, arch_scale_freq_capacity(NULL, cpu)); } #else static inline void parse_dt_topology(void) {} -static inline void update_cpu_power(unsigned int cpuid) {} +static inline void update_cpu_capacity(unsigned int cpuid) {} #endif /* @@ -267,7 +267,7 @@ void store_cpu_topology(unsigned int cpuid) update_siblings_masks(cpuid); - update_cpu_power(cpuid); + update_cpu_capacity(cpuid); printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n", cpuid, cpu_topology[cpuid].thread_id, @@ -297,7 +297,7 @@ void __init init_cpu_topology(void) { unsigned int cpu; - /* init core mask and power*/ + /* init core mask and capacity */ for_each_possible_cpu(cpu) { struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]); @@ -307,7 +307,7 @@ void __init init_cpu_topology(void) cpumask_clear(&cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); - set_power_scale(cpu, SCHED_POWER_SCALE); + set_capacity_scale(cpu, SCHED_CAPACITY_SCALE); } smp_wmb(); diff --git a/include/linux/sched.h b/include/linux/sched.h index a96f03598c61..322110affe63 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,10 +854,10 @@ enum cpu_idle_type { }; /* - * Increase resolution of cpu_power calculations + * Increase resolution of cpu_capacity calculations */ -#define SCHED_POWER_SHIFT 10 -#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) +#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) /* * sched-domains (multiprocessor balancing) declarations: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 07bc78a50329..7ba4f5413a10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5249,7 +5249,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgc->capacity != SCHED_POWER_SCALE) { + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { printk(KERN_CONT " (cpu_capacity = %d)", 
group->sgc->capacity); } @@ -5715,7 +5715,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->capacity_orig = sg->sgc->capacity; /* @@ -6921,7 +6921,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 58684f684fa8..dc7d6527a282 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,9 +1062,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; + ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -4370,7 +4370,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5609,10 +5609,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - return SCHED_POWER_SCALE; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { return default_scale_capacity(sd, cpu); } @@ -5627,7 +5627,7 @@ static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { return default_scale_smt_capacity(sd, cpu); } @@ -5658,10 +5658,10 @@ static unsigned long scale_rt_capacity(int cpu) available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } @@ -5669,29 +5669,29 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long capacity = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_smt_power(sd, cpu); + capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } sdg->sgc->capacity_orig = capacity; if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_freq_power(sd, cpu); + capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= 
SCHED_CAPACITY_SHIFT; capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; if (!capacity) capacity = 1; @@ -5780,7 +5780,7 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; @@ -5845,11 +5845,11 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro cpus = group->group_weight; /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5895,7 +5895,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Adjust by relative CPU capacity of the group */ sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6089,7 +6089,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) env->imbalance = DIV_ROUND_CLOSEST( sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_POWER_SCALE); + SCHED_CAPACITY_SCALE); return 1; } @@ -6118,7 +6118,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) imbn = 1; scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_POWER_SCALE) / + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= @@ -6137,7 +6137,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) min(busiest->load_per_task, busiest->avg_load); capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - capa_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { @@ -6148,16 +6148,16 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* Amount of load we'd add */ if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_POWER_SCALE) { + busiest->load_per_task * SCHED_CAPACITY_SCALE) { tmp = (busiest->avg_load * busiest->group_capacity) / local->group_capacity; } else { - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / local->group_capacity; } capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ if (capa_move > capa_now) @@ -6207,7 +6207,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); load_above_capacity /= busiest->group_capacity; } @@ -6225,7 +6225,7 @@ 
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = min( max_pull * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity - ) / SCHED_POWER_SCALE; + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -6279,7 +6279,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6378,7 +6379,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); -- cgit v1.2.3-71-gd317 From 5d4dfddd4f02b028d6ddaaa04d75d3b0cad1c9ae Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 May 2014 13:50:41 -0400 Subject: sched: Rename capacity related flags It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Let's rename the following feature flags since they do relate to capacity: SD_SHARE_CPUPOWER -> SD_SHARE_CPUCAPACITY ARCH_POWER -> ARCH_CAPACITY NONTASK_POWER -> NONTASK_CAPACITY Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U Murthy Cc: Rob Herring Cc: Srivatsa S. 
Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/n/tip-e93lpnxb87owfievqatey6b5@git.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/smp.c | 2 +- include/linux/sched.h | 4 ++-- kernel/sched/core.c | 14 +++++++------- kernel/sched/fair.c | 8 ++++---- kernel/sched/features.h | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 10ffffef0414..c51d16379cba 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -770,7 +770,7 @@ int setup_profiling_timer(unsigned int multiplier) /* cpumask of CPUs with asymetric SMT dependancy */ static const int powerpc_smt_flags(void) { - int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); diff --git a/include/linux/sched.h b/include/linux/sched.h index 322110affe63..ce93768a3312 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -869,7 +869,7 @@ enum cpu_idle_type { #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ @@ -881,7 +881,7 @@ enum cpu_idle_type { #ifdef CONFIG_SCHED_SMT static inline const int cpu_smt_flags(void) { - return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7ba4f5413a10..5976ca579d3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -872,7 +872,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) sched_rt_avg_update(rq, irq_delta + steal); #endif } @@ -5309,7 +5309,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5340,7 +5340,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN); @@ -5947,7 +5947,7 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. 
* - * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain @@ -5956,7 +5956,7 @@ static int sched_domains_curr_level; * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUPOWER | \ + (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ @@ -6002,7 +6002,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUPOWER + | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING @@ -6024,7 +6024,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) * Convert topological properties into behaviour. */ - if (sd->flags & SD_SHARE_CPUPOWER) { + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc7d6527a282..d3c731222199 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5672,8 +5672,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); @@ -5683,7 +5683,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); @@ -5782,7 +5782,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929a2e3a..90284d117fe6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) /* - * Use arch dependent cpu power functions + * Use arch dependent cpu capacity functions */ -SCHED_FEAT(ARCH_POWER, true) +SCHED_FEAT(ARCH_CAPACITY, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Decrement CPU power based on time not spent running tasks + * Decrement CPU capacity based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_CAPACITY, true) /* * Queue remote wakeups on the target CPU and process them -- cgit v1.2.3-71-gd317 From 53b25335dd60981ad608da7890420898a34469a6 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 16 May 2014 17:12:12 -0400 Subject: perf: Disable sampled events if no PMU interrupt Add common code to generate -ENOTSUPP at event creation time if an architecture attempts to create a sampled event and PERF_PMU_NO_INTERRUPT is set. This adds a new pmu->capabilities flag. Initially we only support PERF_PMU_NO_INTERRUPT (to indicate a PMU has no support for generating hardware interrupts) but there are other capabilities that can be added later. 
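As a rough sketch of the intended use (not taken from this patch; the PMU name and the elided callbacks are hypothetical), a driver for a PMU that cannot raise overflow interrupts would simply advertise the new capability and let the core veto sampled events:

/* Hypothetical driver sketch -- name and callback set are illustrative. */
static struct pmu no_irq_pmu = {
	.task_ctx_nr  = perf_invalid_context,
	/* core now rejects is_sampling_event() events with -ENOTSUPP */
	.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
	/* .event_init, .add, .del, .start, .stop, .read elided */
};
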
Signed-off-by: Vince Weaver Acked-by: Will Deacon [peterz: rename to PERF_PMU_CAP_* and moved the pmu::capabilities word into a hole] Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1405161708060.11099@vincent-weaver-1.umelst.maine.edu Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++++++++ kernel/events/core.c | 7 +++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index af6dcf1d9e47..267c8f37012c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -166,6 +166,11 @@ struct perf_event; */ #define PERF_EVENT_TXN 0x1 +/** + * pmu::capabilities flags + */ +#define PERF_PMU_CAP_NO_INTERRUPT 0x01 + /** * struct pmu - generic performance monitoring unit */ @@ -178,6 +183,11 @@ struct pmu { const char *name; int type; + /* + * various common per-pmu feature flags + */ + int capabilities; + int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; int task_ctx_nr; diff --git a/kernel/events/core.c b/kernel/events/core.c index a62d142ad498..e9ef0c6646af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7120,6 +7120,13 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (is_sampling_event(event)) { + if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { + err = -ENOTSUPP; + goto err_alloc; + } + } + account_event(event); /* -- cgit v1.2.3-71-gd317 From 762380ad9322951cea4ce9d24864265f9c66a916 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2014 13:38:39 -0600 Subject: block: add notion of a chunk size for request merging Some drivers have different limits on what size a request should optimally be, depending on the offset of the request. Similar to dividing a device into chunks. Add a setting that allows the driver to inform the block layer of such a chunk size. The block layer will then prevent merging across the chunks. This is needed to optimally support NVMe with a non-zero stripe size. 
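To make the power-of-two arithmetic concrete, here is what blk_max_size_offset() computes for a request, using a made-up 128-sector chunk size and offset (values are illustrative only):

unsigned int chunk_sectors = 128;	/* must be a power of 2 */
sector_t offset = 100;			/* request start, in sectors */

/* sectors left in the current chunk before the boundary */
unsigned int max = chunk_sectors - (offset & (chunk_sectors - 1));
/* max == 28: merging stops at 28 sectors, so the request never
 * straddles the chunk boundary at sector 128 */
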
Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- block/blk-settings.c | 18 ++++++++++++++++++ include/linux/blkdev.h | 22 +++++++++++++++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 96d28eee8a1e..97e832cc9b9c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -849,7 +849,8 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); + + return __bio_add_page(q, bio, page, len, offset, blk_max_size_offset(q, bio->bi_iter.bi_sector)); } EXPORT_SYMBOL(bio_add_page); diff --git a/block/blk-settings.c b/block/blk-settings.c index 5d21239bc859..a2b9cb195e70 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -113,6 +113,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; lim->discard_granularity = 0; @@ -276,6 +277,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto } EXPORT_SYMBOL(blk_queue_max_hw_sectors); +/** + * blk_queue_chunk_sectors - set size of the chunk for this queue + * @q: the request queue for the device + * @chunk_sectors: chunk sectors in the usual 512b unit + * + * Description: + * If a driver doesn't want IOs to cross a given chunk size, it can set + * this limit and prevent merging across chunks. Note that the chunk size + * must currently be a power-of-2 in sectors. + **/ +void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) +{ + BUG_ON(!is_power_of_2(chunk_sectors)); + q->limits.chunk_sectors = chunk_sectors; +} +EXPORT_SYMBOL(blk_queue_chunk_sectors); + /** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3cd426e971db..dc2c703f05fd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -280,6 +280,7 @@ struct queue_limits { unsigned long seg_boundary_mask; unsigned int max_hw_sectors; + unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_segment_size; unsigned int physical_block_size; @@ -910,6 +911,20 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, return q->limits.max_sectors; } +/* + * Return maximum size of a request at given offset. Only valid for + * file system requests. 
+ */ +static inline unsigned int blk_max_size_offset(struct request_queue *q, + sector_t offset) +{ + if (!q->limits.chunk_sectors) + return q->limits.max_hw_sectors; + + return q->limits.chunk_sectors - + (offset & (q->limits.chunk_sectors - 1)); +} + static inline unsigned int blk_rq_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; @@ -917,7 +932,11 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) return q->limits.max_hw_sectors; - return blk_queue_get_max_sectors(q, rq->cmd_flags); + if (!q->limits.chunk_sectors) + return blk_queue_get_max_sectors(q, rq->cmd_flags); + + return min(blk_max_size_offset(q, blk_rq_pos(rq)), + blk_queue_get_max_sectors(q, rq->cmd_flags)); } static inline unsigned int blk_rq_count_bios(struct request *rq) @@ -983,6 +1002,7 @@ extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); +extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, -- cgit v1.2.3-71-gd317 From 1c03a2d04d7ab6d27c1fef8614f08187d974bd21 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 2 Jun 2014 22:49:28 +0530 Subject: cpufreq: add support for intermediate (stable) frequencies Douglas Anderson, recently pointed out an interesting problem due to which udelay() was expiring earlier than it should. While transitioning between frequencies few platforms may temporarily switch to a stable frequency, waiting for the main PLL to stabilize. For example: When we transition between very low frequencies on exynos, like between 200MHz and 300MHz, we may temporarily switch to a PLL running at 800MHz. No CPUFREQ notification is sent for that. That means there's a period of time when we're running at 800MHz but loops_per_jiffy is calibrated at between 200MHz and 300MHz. And so udelay behaves badly. To get this fixed in a generic way, introduce another set of callbacks get_intermediate() and target_intermediate(), only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset. get_intermediate() should return a stable intermediate frequency platform wants to switch to, and target_intermediate() should set CPU to that frequency, before jumping to the frequency corresponding to 'index'. Core will take care of sending notifications and driver doesn't have to handle them in target_intermediate() or target_index(). NOTE: ->target_index() should restore to policy->restore_freq in case of failures as core would send notifications for that. Tested-by: Stephen Warren Signed-off-by: Viresh Kumar Reviewed-by: Doug Anderson Signed-off-by: Rafael J. 
Wysocki --- Documentation/cpu-freq/cpu-drivers.txt | 29 ++++++++++++++- drivers/cpufreq/cpufreq.c | 67 ++++++++++++++++++++++++++++++---- include/linux/cpufreq.h | 25 +++++++++++++ 3 files changed, 112 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt index b045fe54986a..14f4e6336d88 100644 --- a/Documentation/cpu-freq/cpu-drivers.txt +++ b/Documentation/cpu-freq/cpu-drivers.txt @@ -26,6 +26,7 @@ Contents: 1.4 target/target_index or setpolicy? 1.5 target/target_index 1.6 setpolicy +1.7 get_intermediate and target_intermediate 2. Frequency Table Helpers @@ -79,6 +80,10 @@ cpufreq_driver.attr - A pointer to a NULL-terminated list of "struct freq_attr" which allow to export values to sysfs. +cpufreq_driver.get_intermediate +and target_intermediate Used to switch to stable frequency while + changing CPU frequency. + 1.2 Per-CPU Initialization -------------------------- @@ -151,7 +156,7 @@ Some cpufreq-capable processors switch the frequency between certain limits on their own. These shall use the ->setpolicy call -1.4. target/target_index +1.5. target/target_index ------------- The target_index call has two arguments: struct cpufreq_policy *policy, @@ -160,6 +165,9 @@ and unsigned int index (into the exposed frequency table). The CPUfreq driver must set the new frequency when called here. The actual frequency must be determined by freq_table[index].frequency. +It should always restore to earlier frequency (i.e. policy->restore_freq) in +case of errors, even if we switched to intermediate frequency earlier. + Deprecated: ---------- The target call has three arguments: struct cpufreq_policy *policy, @@ -179,7 +187,7 @@ Here again the frequency table helper might assist you - see section 2 for details. -1.5 setpolicy +1.6 setpolicy --------------- The setpolicy call only takes a struct cpufreq_policy *policy as @@ -190,6 +198,23 @@ setting when policy->policy is CPUFREQ_POLICY_PERFORMANCE, and a powersaving-oriented setting when CPUFREQ_POLICY_POWERSAVE. Also check the reference implementation in drivers/cpufreq/longrun.c +1.7 get_intermediate and target_intermediate +-------------------------------------------- + +Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset. + +get_intermediate should return a stable intermediate frequency platform wants to +switch to, and target_intermediate() should set CPU to to that frequency, before +jumping to the frequency corresponding to 'index'. Core will take care of +sending notifications and driver doesn't have to handle them in +target_intermediate() or target_index(). + +Drivers can return '0' from get_intermediate() in case they don't wish to switch +to intermediate frequency for some target frequency. In that case core will +directly call ->target_index(). + +NOTE: ->target_index() should restore to policy->restore_freq in case of +failures as core would send notifications for that. 2. 
Frequency Table Helpers diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ae11dd51f81d..aed2b0cb83dc 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1816,20 +1816,55 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); * GOVERNORS * *********************************************************************/ +/* Must set freqs->new to intermediate frequency */ +static int __target_intermediate(struct cpufreq_policy *policy, + struct cpufreq_freqs *freqs, int index) +{ + int ret; + + freqs->new = cpufreq_driver->get_intermediate(policy, index); + + /* We don't need to switch to intermediate freq */ + if (!freqs->new) + return 0; + + pr_debug("%s: cpu: %d, switching to intermediate freq: oldfreq: %u, intermediate freq: %u\n", + __func__, policy->cpu, freqs->old, freqs->new); + + cpufreq_freq_transition_begin(policy, freqs); + ret = cpufreq_driver->target_intermediate(policy, index); + cpufreq_freq_transition_end(policy, freqs, ret); + + if (ret) + pr_err("%s: Failed to change to intermediate frequency: %d\n", + __func__, ret); + + return ret; +} + static int __target_index(struct cpufreq_policy *policy, struct cpufreq_frequency_table *freq_table, int index) { - struct cpufreq_freqs freqs; + struct cpufreq_freqs freqs = {.old = policy->cur, .flags = 0}; + unsigned int intermediate_freq = 0; int retval = -EINVAL; bool notify; notify = !(cpufreq_driver->flags & CPUFREQ_ASYNC_NOTIFICATION); - if (notify) { - freqs.old = policy->cur; - freqs.new = freq_table[index].frequency; - freqs.flags = 0; + /* Handle switching to intermediate frequency */ + if (cpufreq_driver->get_intermediate) { + retval = __target_intermediate(policy, &freqs, index); + if (retval) + return retval; + + intermediate_freq = freqs.new; + /* Set old freq to intermediate */ + if (intermediate_freq) + freqs.old = freqs.new; + } + freqs.new = freq_table[index].frequency; pr_debug("%s: cpu: %d, oldfreq: %u, new freq: %u\n", __func__, policy->cpu, freqs.old, freqs.new); @@ -1841,9 +1876,23 @@ static int __target_index(struct cpufreq_policy *policy, pr_err("%s: Failed to change cpu frequency: %d\n", __func__, retval); - if (notify) + if (notify) { cpufreq_freq_transition_end(policy, &freqs, retval); + /* + * Failed after setting to intermediate freq? Driver should have + * reverted back to initial frequency and so should we. Check + * here for intermediate_freq instead of get_intermediate, in + * case we have't switched to intermediate freq at all. 
+ */ + if (unlikely(retval && intermediate_freq)) { + freqs.old = intermediate_freq; + freqs.new = policy->restore_freq; + cpufreq_freq_transition_begin(policy, &freqs); + cpufreq_freq_transition_end(policy, &freqs, 0); + } + } + return retval; } @@ -1875,6 +1924,9 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, if (target_freq == policy->cur) return 0; + /* Save last value to restore later on errors */ + policy->restore_freq = policy->cur; + if (cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); else if (cpufreq_driver->target_index) { @@ -2361,7 +2413,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) !(driver_data->setpolicy || driver_data->target_index || driver_data->target) || (driver_data->setpolicy && (driver_data->target_index || - driver_data->target))) + driver_data->target)) || + (!!driver_data->get_intermediate != !!driver_data->target_intermediate)) return -EINVAL; pr_debug("trying to register driver %s\n", driver_data->name); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 3f458896d45c..ec4112d257bc 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -75,6 +75,7 @@ struct cpufreq_policy { unsigned int max; /* in kHz */ unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ + unsigned int restore_freq; /* = policy->cur before transition */ unsigned int suspend_freq; /* freq to set during suspend */ unsigned int policy; /* see above */ @@ -221,11 +222,35 @@ struct cpufreq_driver { /* define one out of two */ int (*setpolicy) (struct cpufreq_policy *policy); + + /* + * On failure, should always restore frequency to policy->restore_freq + * (i.e. old freq). + */ int (*target) (struct cpufreq_policy *policy, /* Deprecated */ unsigned int target_freq, unsigned int relation); int (*target_index) (struct cpufreq_policy *policy, unsigned int index); + /* + * Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION + * unset. + * + * get_intermediate should return a stable intermediate frequency + * platform wants to switch to and target_intermediate() should set CPU + * to to that frequency, before jumping to the frequency corresponding + * to 'index'. Core will take care of sending notifications and driver + * doesn't have to handle them in target_intermediate() or + * target_index(). + * + * Drivers can return '0' from get_intermediate() in case they don't + * wish to switch to intermediate frequency for some target frequency. + * In that case core will directly call ->target_index(). + */ + unsigned int (*get_intermediate)(struct cpufreq_policy *policy, + unsigned int index); + int (*target_intermediate)(struct cpufreq_policy *policy, + unsigned int index); /* should be defined, if possible */ unsigned int (*get) (unsigned int cpu); -- cgit v1.2.3-71-gd317 From 46cfd6ea23b0a207c87269d86457727dc4485708 Mon Sep 17 00:00:00 2001 From: Konrad Zapalowicz Date: Thu, 5 Jun 2014 20:27:42 +0200 Subject: net: phy: fix sparse warning in fixed.c This commit fixes the following sparse warning: drivers/net/phy/fixed.c:207 - warning: symbol 'fixed_phy_del' was not declared. Should it be static? by adding symbol definition to the phy_fixed.h API file. It is ok to do because the function in question is an exported symbol. Signed-off-by: Konrad Zapalowicz Signed-off-by: David S. 
Miller --- include/linux/phy_fixed.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 4f2478b47136..ae612acebb53 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,6 +17,7 @@ extern int fixed_phy_add(unsigned int irq, int phy_id, extern int fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); +extern void fixed_phy_del(int phy_addr); #else static inline int fixed_phy_add(unsigned int irq, int phy_id, struct fixed_phy_status *status) @@ -29,6 +30,10 @@ static inline int fixed_phy_register(unsigned int irq, { return -ENODEV; } +static inline int fixed_phy_del(int phy_addr) +{ + return -ENODEV; +} #endif /* CONFIG_FIXED_PHY */ /* -- cgit v1.2.3-71-gd317 From f98a128a55ff85d0087de89f304f10bd75e792aa Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 17 Apr 2014 08:55:50 +0800 Subject: ceph: update inode fields according to issued caps Cap message and request reply from non-auth MDS may carry stale information (corresponding locks are in LOCK states) even they have the newest inode version. So client should update inode fields according to issued caps. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 58 ++++++++++++++++++++---------------- fs/ceph/inode.c | 70 ++++++++++++++++++++++++-------------------- include/linux/ceph/ceph_fs.h | 2 ++ 3 files changed, 73 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index de39a03f5b71..5f6d24ede794 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2476,7 +2476,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, __check_cap_issue(ci, cap, newcaps); - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); @@ -2485,7 +2486,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0 && (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) @@ -2512,31 +2514,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) queue_revalidate = 1; - /* size/ctime/mtime/atime? */ - queue_trunc = ceph_fill_file_size(inode, issued, - le32_to_cpu(grant->truncate_seq), - le64_to_cpu(grant->truncate_size), - size); - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, - &atime); - - - /* file layout may have changed */ - ci->i_layout = grant->layout; - - /* max size increase? */ - if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); - ci->i_max_size = max_size; - if (max_size >= ci->i_wanted_max_size) { - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; + if (newcaps & CEPH_CAP_ANY_RD) { + /* ctime/mtime/atime? 
*/ + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + /* max size increase? */ + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; } - wake = 1; } /* check cap bits */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 233c6f96910a..f9e7399877d6 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -585,14 +585,15 @@ static int fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int i; - int issued = 0, implemented; + int issued = 0, implemented, new_issued; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_inode_frag *frag; struct rb_node *rb_node; struct ceph_buffer *xattr_blob = NULL; int err = 0; - int queue_trunc = 0; + bool queue_trunc = false; + bool new_version = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), @@ -623,19 +624,23 @@ static int fill_inode(struct inode *inode, * 3 2 skip * 3 3 update */ - if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) >= le64_to_cpu(info->version)) - goto no_change; - + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; + issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); + new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ ci->i_version = le64_to_cpu(info->version); inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); @@ -644,23 +649,35 @@ static int fill_inode(struct inode *inode, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); - /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); - queue_trunc = ceph_fill_file_size(inode, issued, - le32_to_cpu(info->truncate_seq), - le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); - ceph_fill_file_time(inode, issued, - le32_to_cpu(info->time_warp_seq), - &ctime, &mtime, &atime); - - ci->i_layout = info->layout; - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size 
*/ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ @@ -745,15 +762,6 @@ static int fill_inode(struct inode *inode, dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); } -no_change: - /* only update max_size on auth cap */ - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && - ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); - ci->i_max_size = le64_to_cpu(info->max_size); - } - spin_unlock(&ci->i_ceph_lock); /* queue truncate if we saw i_size decrease */ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 5f6db18d72e8..3c97d5e9b951 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -625,6 +625,8 @@ int ceph_flags_to_mode(int flags); CEPH_CAP_LINK_EXCL | \ CEPH_CAP_XATTR_EXCL | \ CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \ + CEPH_CAP_FILE_SHARED) #define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ CEPH_CAP_FILE_EXCL) #define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) -- cgit v1.2.3-71-gd317 From 513a8243d67f8e8d27f2883bd2f18bc87c7ca376 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 13 May 2014 11:19:26 +0400 Subject: libceph: mon_get_version request infrastructure Add support for mon_get_version requests to libceph. This reuses much of the ceph_mon_generic_request infrastructure, with one exception. Older OSDs don't set mon_get_version reply hdr->tid even if the original request had a non-zero tid, which makes it impossible to lookup ceph_mon_generic_request contexts by tid in get_generic_reply() for such replies. As a workaround, we allocate a reply message on the reply path. This can probably interfere with revoke, but I don't see a better way. 
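A minimal caller sketch, assuming a connected struct ceph_mon_client (monc) and abbreviated error handling:

u64 newest = 0;
int err;

err = ceph_monc_do_get_version(monc, "osdmap", &newest);
if (err)
	return err;	/* allocation failure or interrupted wait */
/* newest now holds the monitors' latest osdmap version */
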
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/mon_client.h | 9 ++- net/ceph/ceph_common.c | 2 + net/ceph/debugfs.c | 2 + net/ceph/mon_client.c | 123 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 128 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index a486f390dfbe..585ef9450e9d 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -40,9 +40,9 @@ struct ceph_mon_request { }; /* - * ceph_mon_generic_request is being used for the statfs and poolop requests - * which are bening done a bit differently because we need to get data back - * to the caller + * ceph_mon_generic_request is being used for the statfs, poolop and + * mon_get_version requests which are being done a bit differently + * because we need to get data back to the caller */ struct ceph_mon_generic_request { struct kref kref; @@ -108,6 +108,9 @@ extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf); +extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, + const char *what, u64 *newest); + extern int ceph_monc_open_session(struct ceph_mon_client *monc); extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 67d7721d237e..1675021d8c12 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -72,6 +72,8 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; case CEPH_MSG_STATFS: return "statfs"; case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; + case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; + case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply"; case CEPH_MSG_MDS_MAP: return "mds_map"; case CEPH_MSG_CLIENT_SESSION: return "client_session"; case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 8903dcee8d8e..d1a62c69a9f4 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -129,6 +129,8 @@ static int monc_show(struct seq_file *s, void *p) seq_printf(s, "%llu statfs\n", req->tid); else if (op == CEPH_MSG_POOLOP) seq_printf(s, "%llu poolop\n", req->tid); + else if (op == CEPH_MSG_MON_GET_VERSION) + seq_printf(s, "%llu mon_get_version", req->tid); else seq_printf(s, "%llu unknown\n", req->tid); } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 2ac9ef35110b..11d8d2f2708a 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -477,14 +477,13 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, return m; } -static int do_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *req) +static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, + struct ceph_mon_generic_request *req) { int err; /* register request */ - mutex_lock(&monc->mutex); - req->tid = ++monc->last_tid; + req->tid = tid != 0 ? 
tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); monc->num_generic_requests++; @@ -496,13 +495,24 @@ static int do_generic_request(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); rb_erase(&req->node, &monc->generic_request_tree); monc->num_generic_requests--; - mutex_unlock(&monc->mutex); if (!err) err = req->result; return err; } +static int do_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + int err; + + mutex_lock(&monc->mutex); + err = __do_generic_request(monc, 0, req); + mutex_unlock(&monc->mutex); + + return err; +} + /* * statfs */ @@ -579,6 +589,96 @@ out: } EXPORT_SYMBOL(ceph_monc_do_statfs); +static void handle_get_version_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + void *p = msg->front.iov_base; + void *end = p + msg->front_alloc_len; + u64 handle; + + dout("%s %p tid %llu\n", __func__, msg, tid); + + ceph_decode_need(&p, end, 2*sizeof(u64), bad); + handle = ceph_decode_64(&p); + if (tid != 0 && tid != handle) + goto bad; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, handle); + if (req) { + *(u64 *)req->buf = ceph_decode_64(&p); + req->result = 0; + get_generic_request(req); + } + mutex_unlock(&monc->mutex); + if (req) { + complete_all(&req->completion); + put_generic_request(req); + } + + return; +bad: + pr_err("corrupt mon_get_version reply\n"); + ceph_msg_dump(msg); +} + +/* + * Send MMonGetVersion and wait for the reply. + * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest) +{ + struct ceph_mon_generic_request *req; + void *p, *end; + u64 tid; + int err; + + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = newest; + req->buf_len = sizeof(*newest); + init_completion(&req->completion); + + req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, + sizeof(u64) + sizeof(u32) + strlen(what), + GFP_NOFS, true); + if (!req->request) { + err = -ENOMEM; + goto out; + } + + req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, + GFP_NOFS, true); + if (!req->reply) { + err = -ENOMEM; + goto out; + } + + p = req->request->front.iov_base; + end = p + req->request->front_alloc_len; + + /* fill out request */ + mutex_lock(&monc->mutex); + tid = ++monc->last_tid; + ceph_encode_64(&p, tid); /* handle */ + ceph_encode_string(&p, end, what, strlen(what)); + + err = __do_generic_request(monc, tid, req); + + mutex_unlock(&monc->mutex); +out: + kref_put(&req->kref, release_generic_request); + return err; +} +EXPORT_SYMBOL(ceph_monc_do_get_version); + /* * pool ops */ @@ -981,6 +1081,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_statfs_reply(monc, msg); break; + case CEPH_MSG_MON_GET_VERSION_REPLY: + handle_get_version_reply(monc, msg); + break; + case CEPH_MSG_POOLOP_REPLY: handle_poolop_reply(monc, msg); break; @@ -1029,6 +1133,15 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_AUTH_REPLY: m = ceph_msg_get(monc->m_auth_reply); break; + case CEPH_MSG_MON_GET_VERSION_REPLY: + if (le64_to_cpu(hdr->tid) != 0) + return get_generic_reply(con, hdr, skip); + + /* + * Older OSDs don't set reply tid even if the orignal + * request had a non-zero tid. Workaround this weirdness + * by falling through to the allocate case. 
+ */ case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: -- cgit v1.2.3-71-gd317 From 6044cde6f2a94d88142d4401624152a741866338 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 13 May 2014 11:19:27 +0400 Subject: libceph: add ceph_monc_wait_osdmap() Add ceph_monc_wait_osdmap(), which will block until the osdmap with the specified epoch is received or timeout occurs. Export both of these as they are going to be needed by rbd. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/mon_client.h | 2 ++ net/ceph/mon_client.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 585ef9450e9d..deb47e45ac7c 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -104,6 +104,8 @@ extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); +extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, + unsigned long timeout); extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 11d8d2f2708a..067d3af2eaf6 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -296,6 +296,33 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) __send_subscribe(monc); mutex_unlock(&monc->mutex); } +EXPORT_SYMBOL(ceph_monc_request_next_osdmap); + +int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, + unsigned long timeout) +{ + unsigned long started = jiffies; + int ret; + + mutex_lock(&monc->mutex); + while (monc->have_osdmap < epoch) { + mutex_unlock(&monc->mutex); + + if (timeout != 0 && time_after_eq(jiffies, started + timeout)) + return -ETIMEDOUT; + + ret = wait_event_interruptible_timeout(monc->client->auth_wq, + monc->have_osdmap >= epoch, timeout); + if (ret < 0) + return ret; + + mutex_lock(&monc->mutex); + } + + mutex_unlock(&monc->mutex); + return 0; +} +EXPORT_SYMBOL(ceph_monc_wait_osdmap); /* * -- cgit v1.2.3-71-gd317 From e041e328c4b41e1db79bfe5ba9992c2ed771ad19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 May 2014 17:32:19 +0200 Subject: perf: Fix perf_event_comm() vs. exec() assumption perf_event_comm() assumes that set_task_comm() is only called on exec(), and in particular that its only called on current. Neither are true, as Dave reported a WARN triggered by set_task_comm() being called on !current. Separate the exec() hook from the comm hook. Reported-by: Dave Jones Signed-off-by: Peter Zijlstra Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20140521153219.GH5226@laptop.programming.kicks-ass.net [ Build fix. ] Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_event.h | 4 +++- kernel/events/core.c | 28 ++++++++++++++++------------ 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 238b7aa26f68..a038a41a3677 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1110,6 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + perf_event_exec(); set_task_comm(current, kbasename(bprm->filename)); /* Set the new mm task size. 
We have to do that late because it may diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3ef6ea12806a..9b5cd1992a88 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -695,6 +695,7 @@ extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); @@ -772,7 +773,7 @@ extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); -#else +#else /* !CONFIG_PERF_EVENTS: */ static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } @@ -802,6 +803,7 @@ static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index 440eefc67397..647698f91988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2970,6 +2970,22 @@ out: local_irq_restore(flags); } +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + rcu_read_unlock(); +} + /* * Cross CPU call to read the hardware event */ @@ -5057,18 +5073,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; -- cgit v1.2.3-71-gd317 From 82b897782d10fcc4930c9d4a15b175348fdd2871 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 28 May 2014 11:45:04 +0300 Subject: perf: Differentiate exec() and non-exec() comm events perf tools like 'perf report' can aggregate samples by comm strings, which generally works. However, there are other potential use-cases. For example, to pair up 'calls' with 'returns' accurately (from branch events like Intel BTS) it is necessary to identify whether the process has exec'd. Although a comm event is generated when an 'exec' happens it is also generated whenever the comm string is changed on a whim (e.g. by prctl PR_SET_NAME). This patch adds a flag to the comm event to differentiate one case from the other. In order to determine whether the kernel supports the new flag, a selection bit named 'exec' is added to struct perf_event_attr. The bit does nothing but will cause perf_event_open() to fail if the bit is set on kernels that do not have it defined. 
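A userspace sketch of both halves of the new interface; the event config and the parsing helper around it are illustrative, not part of this patch:

#include <linux/perf_event.h>

struct perf_event_attr attr = {
	.size      = sizeof(attr),
	.type      = PERF_TYPE_SOFTWARE,
	.config    = PERF_COUNT_SW_DUMMY,
	.comm      = 1,		/* deliver PERF_RECORD_COMM events */
	.comm_exec = 1,		/* perf_event_open() fails on old kernels */
};

/* hypothetical helper for walking the ring buffer */
static int comm_is_exec(const struct perf_event_header *eh)
{
	return eh->type == PERF_RECORD_COMM &&
	       (eh->misc & PERF_RECORD_MISC_COMM_EXEC);
}
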
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/537D9EBE.7030806@intel.com Cc: Paul Mackerras Cc: Dave Jones Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Alexander Viro Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/exec.c | 6 +++--- include/linux/perf_event.h | 4 ++-- include/linux/sched.h | 6 +++++- include/uapi/linux/perf_event.h | 9 +++++++-- kernel/events/core.c | 4 ++-- 5 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index a038a41a3677..a3d33fe592d6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1046,13 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, const char *buf) +void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_event_comm(tsk); + perf_event_comm(tsk, exec); } int flush_old_exec(struct linux_binprm * bprm) @@ -1111,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); perf_event_exec(); - set_task_comm(current, kbasename(bprm->filename)); + __set_task_comm(current, kbasename(bprm->filename), true); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4c1d4685bf0..707617a8c0f6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -707,7 +707,7 @@ extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks * extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern void perf_event_exec(void); -extern void perf_event_comm(struct task_struct *tsk); +extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ @@ -815,7 +815,7 @@ static inline int perf_unregister_guest_info_callbacks static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_exec(void) { } -static inline void perf_event_comm(struct task_struct *tsk) { } +static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 221b2bde3723..ad86e1d7dbc2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2379,7 +2379,11 @@ extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, i struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern void set_task_comm(struct task_struct *tsk, const char *from); +extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +static inline void set_task_comm(struct task_struct *tsk, const char *from) +{ + __set_task_comm(tsk, from, false); +} extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d9cd853818ad..5312fae47218 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h 
@@ -302,8 +302,8 @@ struct perf_event_attr { exclude_callchain_kernel : 1, /* exclude kernel callchains */ exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ - - __reserved_1 : 40; + comm_exec : 1, /* flag comm events that are due to an exec */ + __reserved_1 : 39; union { __u32 wakeup_events; /* wakeup every n events */ @@ -502,7 +502,12 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) #define PERF_RECORD_MISC_GUEST_USER (5 << 0) +/* + * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on + * different events so can reuse the same bit position. + */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) +#define PERF_RECORD_MISC_COMM_EXEC (1 << 13) /* * Indicates that the content of PERF_SAMPLE_IP points to * the actual instruction that triggered the event. See also diff --git a/kernel/events/core.c b/kernel/events/core.c index 8fac2056d51e..7da5e561e89a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) NULL); } -void perf_event_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; @@ -5104,7 +5104,7 @@ void perf_event_comm(struct task_struct *task) .event_id = { .header = { .type = PERF_RECORD_COMM, - .misc = 0, + .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ -- cgit v1.2.3-71-gd317 From f27b087b81b70513b8c61ec20596c868f7b93474 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2014 07:57:37 -0600 Subject: block: add blk_rq_set_block_pc() With the optimizations around not clearing the full request at alloc time, we are leaving some of the needed init for REQ_TYPE_BLOCK_PC up to the user allocating the request. Add a blk_rq_set_block_pc() that sets the command type to REQ_TYPE_BLOCK_PC, and properly initializes the members associated with this type of request. Update callers to use this function instead of manipulating rq->cmd_type directly. Includes fixes from Christoph Hellwig for my half-assed attempt. 
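As a hedged illustration of the caller pattern this introduces, a BLOCK_PC submitter now looks roughly like this (the queue, direction, and timeout below are placeholders, not taken from the patch):

	struct request *rq;

	/* allocate, then let the helper do the BLOCK_PC-specific init */
	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;
	blk_rq_set_block_pc(rq);	/* replaces open-coded rq->cmd_type = REQ_TYPE_BLOCK_PC */
	rq->timeout = 60 * HZ;		/* caller-specific setup stays with the caller */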
Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ++++++++++++++++++ block/bsg.c | 3 ++- block/scsi_ioctl.c | 6 +++--- drivers/block/pktcdvd.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/scsi/device_handler/scsi_dh_alua.c | 2 +- drivers/scsi/device_handler/scsi_dh_emc.c | 2 +- drivers/scsi/device_handler/scsi_dh_hp_sw.c | 4 ++-- drivers/scsi/device_handler/scsi_dh_rdac.c | 2 +- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/osst.c | 2 +- drivers/scsi/scsi_error.c | 3 ++- drivers/scsi/scsi_lib.c | 2 +- drivers/scsi/sg.c | 3 +-- drivers/scsi/st.c | 2 +- drivers/target/target_core_pscsi.c | 3 ++- include/linux/blkdev.h | 1 + 17 files changed, 41 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 40d654861c33..9aca8c71e70b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1218,6 +1218,8 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, if (unlikely(!rq)) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + for_each_bio(bio) { struct bio *bounce_bio = bio; int ret; @@ -1234,6 +1236,22 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL(blk_make_request); +/** + * blk_rq_set_block_pc - initialize a request to type BLOCK_PC + * @rq: request to be initialized + * + */ +void blk_rq_set_block_pc(struct request *rq) +{ + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); + rq->cmd = rq->__cmd; +} +EXPORT_SYMBOL(blk_rq_set_block_pc); + /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted diff --git a/block/bsg.c b/block/bsg.c index e5214c148096..ff46addde5d8 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -196,7 +196,6 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->request_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -273,6 +272,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9c28a5b38042..14695c6221c8 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -229,7 +229,6 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->cmd_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -311,6 +310,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, writing ?
WRITE : READ, GFP_KERNEL); if (!rq) return -ENOMEM; + blk_rq_set_block_pc(rq); if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { blk_put_request(rq); @@ -491,7 +491,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); blk_execute_rq(q, disk, rq, 0); @@ -524,7 +524,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, int err; rq = blk_get_request(q, WRITE, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; rq->cmd[4] = data; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ef166ad2dbad..758ac442c5b5 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,6 +704,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ, __GFP_WAIT); + blk_rq_set_block_pc(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, @@ -716,7 +717,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); rq->timeout = 60*HZ; - rq->cmd_type = REQ_TYPE_BLOCK_PC; if (cgc->quiet) rq->cmd_flags |= REQ_QUIET; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 49ac5662585b..0f40c95049c0 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2184,6 +2184,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, ret = -ENOMEM; break; } + blk_rq_set_block_pc(rq); ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL); if (ret) { @@ -2203,7 +2204,6 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, rq->cmd[9] = 0xf8; rq->cmd_len = 12; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = 60 * HZ; bio = rq->bio; diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 5248c888552b..7bcf67eec921 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -120,6 +120,7 @@ static struct request *get_alua_req(struct scsi_device *sdev, "%s: blk_get_request failed\n", __func__); return NULL; } + blk_rq_set_block_pc(rq); if (buflen && blk_rq_map_kern(q, rq, buffer, buflen, GFP_NOIO)) { blk_put_request(rq); @@ -128,7 +129,6 @@ static struct request *get_alua_req(struct scsi_device *sdev, return NULL; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; rq->retries = ALUA_FAILOVER_RETRIES; diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c index e1c8be06de9d..6f07f7fe3aa1 100644 --- a/drivers/scsi/device_handler/scsi_dh_emc.c +++ b/drivers/scsi/device_handler/scsi_dh_emc.c @@ -280,6 +280,7 @@ static struct request *get_req(struct scsi_device *sdev, int cmd, return NULL; } + blk_rq_set_block_pc(rq); rq->cmd_len = COMMAND_SIZE(cmd); rq->cmd[0] = cmd; @@ -304,7 +305,6 @@ static struct request *get_req(struct scsi_device *sdev, int cmd, break; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; rq->timeout = CLARIION_TIMEOUT; diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c index 084062bb8ee9..e9d9fea9e272 100644 --- 
a/drivers/scsi/device_handler/scsi_dh_hp_sw.c +++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c @@ -120,7 +120,7 @@ retry: if (!req) return SCSI_DH_RES_TEMP_UNAVAIL; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; req->cmd_len = COMMAND_SIZE(TEST_UNIT_READY); @@ -250,7 +250,7 @@ static int hp_sw_start_stop(struct hp_sw_dh_data *h) if (!req) return SCSI_DH_RES_TEMP_UNAVAIL; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; req->cmd_len = COMMAND_SIZE(START_STOP); diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index 4b9cf93f3fb6..826069db9848 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -279,6 +279,7 @@ static struct request *get_rdac_req(struct scsi_device *sdev, "get_rdac_req: blk_get_request failed.\n"); return NULL; } + blk_rq_set_block_pc(rq); if (buflen && blk_rq_map_kern(q, rq, buffer, buflen, GFP_NOIO)) { blk_put_request(rq); @@ -287,7 +288,6 @@ static struct request *get_rdac_req(struct scsi_device *sdev, return NULL; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER; rq->retries = RDAC_RETRIES; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index bac04c2335aa..5f4cbf0c4759 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1570,6 +1570,7 @@ static struct request *_make_request(struct request_queue *q, bool has_write, if (unlikely(!req)) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(req); return req; } } @@ -1590,7 +1591,6 @@ static int _init_blk_request(struct osd_request *or, } or->request = req; - req->cmd_type = REQ_TYPE_BLOCK_PC; req->cmd_flags |= REQ_QUIET; req->timeout = or->timeout; @@ -1608,7 +1608,7 @@ static int _init_blk_request(struct osd_request *or, ret = PTR_ERR(req); goto out; } - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); or->in.req = or->request->next_rq = req; } } else if (has_in) diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 21883a2d6324..0727ea7cc387 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -365,7 +365,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, if (!req) return DRIVER_ERROR << 24; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_QUIET; SRpnt->bio = NULL; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f17aa7aa7879..af624619d547 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1951,6 +1951,8 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) */ req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL); + blk_rq_set_block_pc(req); + req->cmd[0] = ALLOW_MEDIUM_REMOVAL; req->cmd[1] = 0; req->cmd[2] = 0; @@ -1960,7 +1962,6 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->cmd_len = COMMAND_SIZE(req->cmd[0]); - req->cmd_type = REQ_TYPE_BLOCK_PC; req->cmd_flags |= REQ_QUIET; req->timeout = 10 * HZ; req->retries = 5; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index a0c95cac91f0..c3c1697b143e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -195,6 +195,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, req = blk_get_request(sdev->request_queue, write, 
__GFP_WAIT); if (!req) return ret; + blk_rq_set_block_pc(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, buffer, bufflen, __GFP_WAIT)) @@ -206,7 +207,6 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, req->sense_len = 0; req->retries = retries; req->timeout = timeout; - req->cmd_type = REQ_TYPE_BLOCK_PC; req->cmd_flags |= flags | REQ_QUIET | REQ_PREEMPT; /* diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index df5e961484e1..53268aaba559 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1653,10 +1653,9 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd) if (!rq) return -ENOMEM; + blk_rq_set_block_pc(rq); memcpy(rq->cmd, cmd, hp->cmd_len); - rq->cmd_len = hp->cmd_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; srp->rq = rq; rq->end_io_data = srp; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index afc834e172c6..14eb4b256a03 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -484,7 +484,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, if (!req) return DRIVER_ERROR << 24; - req->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(req); req->cmd_flags |= REQ_QUIET; mdata->null_mapped = 1; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 0f199f6a0738..94d00df28f39 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1055,6 +1055,8 @@ pscsi_execute_cmd(struct se_cmd *cmd) ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto fail; } + + blk_rq_set_block_pc(req); } else { BUG_ON(!cmd->data_length); @@ -1071,7 +1073,6 @@ pscsi_execute_cmd(struct se_cmd *cmd) } } - req->cmd_type = REQ_TYPE_BLOCK_PC; req->end_io = pscsi_req_done; req->end_io_data = cmd; req->cmd_len = scsi_command_size(pt->pscsi_cdb); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dc2c703f05fd..31e11051f1ba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -796,6 +796,7 @@ extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern struct request *blk_make_request(struct request_queue *, struct bio *, gfp_t); +extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); -- cgit v1.2.3-71-gd317 From a4391c6465d9c978fd4bded12e34bdde3f5458f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2014 15:21:56 -0600 Subject: blk-mq: bump max tag depth to 10K tags For some scsi-mq cases, the tag map can be huge. So increase the max number of tags we support. Additionally, don't fail with EINVAL if a user requests too many tags. Warn that the tag depth has been adjusted down, and store the new value inside the tag_set passed in. Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++++++++- include/linux/blk-mq.h | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e4cd6208052..a6ee74e27957 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1967,13 +1967,19 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Alloc a tag set to be associated with one or more request queues. + * May fail with EINVAL for various error conditions. May adjust the + * requested depth down, if it is too large.
In that case, the set + * value will be stored in set->queue_depth. + */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i; if (!set->nr_hw_queues) return -EINVAL; - if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH) + if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; @@ -1981,6 +1987,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { + pr_info("blk-mq: reduced tag depth to %u\n", + BLK_MQ_MAX_DEPTH); + set->queue_depth = BLK_MQ_MAX_DEPTH; + } set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *), diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0feedebfde48..a002cf191427 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -135,7 +135,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, - BLK_MQ_MAX_DEPTH = 2048, + BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_CPU_WORK_BATCH = 8, }; -- cgit v1.2.3-71-gd317 From ce369a545aac3da653dd95d8117093a862bf94d3 Mon Sep 17 00:00:00 2001 From: Andrii Tseglytskyi Date: Fri, 16 May 2014 05:45:58 -0500 Subject: ARM: OMAP5+: dpll: support Duty Cycle Correction (DCC) Duty Cycle Correction (DCC) needs to be enabled if the MPU is to run at frequencies beyond 1.4GHz for OMAP5, DRA75x, DRA72x. MPU DPLL has a limitation on the maximum frequency it can be locked at. Duty Cycle Correction circuit is used to recover a correct duty cycle for achieving higher frequencies (hardware internally switches output to M3 output (CLKOUTHIF) from M2 output (CLKOUT)). For further information, see the note in the OMAP5432 Technical Reference Manual (SWPU282U), chapter 3.6.3.3.1 "DPLLs Output Clocks Parameters", and also the "OMAP543x ES2.0 DM Operating Conditions Addendum v0.5" chapter 2.1 "Micro Processor Unit (MPU)". Equivalent information is present in relevant DRA75x, 72x documentation (SPRUHP2E, SPRUHI2P).
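As a hedged sketch of how a platform would use the new fields, an MPU DPLL description might carry something like the following (the mask macro name is assumed for illustration, not taken from this patch):

	static struct dpll_data mpu_dpll_dd = {
		/* ... existing mult/div masks and registers ... */
		.dcc_mask	= OMAP54XX_DCC_EN_MASK,	/* assumed name of the DCC enable bit */
		.dcc_rate	= 1400000000,		/* enable DCC at/above 1.4 GHz */
	};

With that in place, omap3_noncore_dpll_program() in the diff below sets the DCC bit whenever the last rounded rate is at or above dcc_rate, and clears it otherwise.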
Signed-off-by: Andrii Tseglytskyi Signed-off-by: Taras Kondratiuk Signed-off-by: J Keerthy Signed-off-by: Nishanth Menon [t-kristo@ti.com: added TRM / DM references for DCC clock rate] Signed-off-by: Tero Kristo --- arch/arm/mach-omap2/dpll3xxx.c | 9 +++++++++ include/linux/clk/ti.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/dpll3xxx.c b/arch/arm/mach-omap2/dpll3xxx.c index fcd8036af910..6d7ba37e2257 100644 --- a/arch/arm/mach-omap2/dpll3xxx.c +++ b/arch/arm/mach-omap2/dpll3xxx.c @@ -319,6 +319,15 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel) /* Set DPLL multiplier, divider */ v = omap2_clk_readl(clk, dd->mult_div1_reg); + + /* Handle Duty Cycle Correction */ + if (dd->dcc_mask) { + if (dd->last_rounded_rate >= dd->dcc_rate) + v |= dd->dcc_mask; /* Enable DCC */ + else + v &= ~dd->dcc_mask; /* Disable DCC */ + } + v &= ~(dd->mult_mask | dd->div1_mask); v |= dd->last_rounded_m << __ffs(dd->mult_mask); v |= (dd->last_rounded_n - 1) << __ffs(dd->div1_mask); diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 4231c41bed51..e8d8a35034a5 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -41,6 +41,8 @@ * @idlest_reg: register containing the DPLL idle status bitfield * @autoidle_mask: mask of the DPLL autoidle mode bitfield in @autoidle_reg * @freqsel_mask: mask of the DPLL jitter correction bitfield in @control_reg + * @dcc_mask: mask of the DPLL DCC correction bitfield in @mult_div1_reg + * @dcc_rate: rate at or above which the DCC @dcc_mask must be set * @idlest_mask: mask of the DPLL idle status bitfield in @idlest_reg * @lpmode_mask: mask of the DPLL low-power mode bitfield in @control_reg * @m4xen_mask: mask of the DPLL M4X multiplier bitfield in @control_reg @@ -86,6 +88,8 @@ struct dpll_data { u32 idlest_mask; u32 dco_mask; u32 sddiv_mask; + u32 dcc_mask; + unsigned long dcc_rate; u32 lpmode_mask; u32 m4xen_mask; u8 auto_recal_bit; -- cgit v1.2.3-71-gd317 From 999e568354b8c797f344a2154761dd94cc84e4ac Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 3 Jun 2014 17:33:35 -0400 Subject: nfs4: remove unused CHANGE_SECURITY_LABEL This constant has the wrong value. And we don't use it. And it's been removed from the 4.2 spec anyway. Signed-off-by: J.
Bruce Fields --- fs/nfs/nfs4proc.c | 2 +- include/linux/nfs4.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 397be39c6dc8..7f55fed8dc64 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2750,7 +2750,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 12c2cb947df5..a1e3064a8d99 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -399,8 +399,6 @@ enum lock_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) -#define FATTR4_WORD2_CHANGE_SECURITY_LABEL \ - (1UL << 17) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) -- cgit v1.2.3-71-gd317 From 0bf4828983dff062cd502f27ab8644b32774e72e Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 28 May 2014 15:12:01 -0500 Subject: svcrdma: refactor marshalling logic This patch refactors the NFSRDMA server marshalling logic to remove the intermediary map structures. It also fixes an existing bug where the NFSRDMA server was not minding the device fast register page list length limitations. Signed-off-by: Tom Tucker Signed-off-by: Steve Wise --- include/linux/sunrpc/svc_rdma.h | 3 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 643 +++++++++++++------------------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 230 ++--------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 62 +-- 4 files changed, 332 insertions(+), 606 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 0b8e3e6bdacf..5cf99a016368 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -115,14 +115,13 @@ struct svc_rdma_fastreg_mr { struct list_head frmr_list; }; struct svc_rdma_req_map { - struct svc_rdma_fastreg_mr *frmr; unsigned long count; union { struct kvec sge[RPCSVC_MAXPAGES]; struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; + unsigned long lkey[RPCSVC_MAXPAGES]; }; }; -#define RDMACTXT_F_FAST_UNREG 1 #define RDMACTXT_F_LAST_CTXT 2 #define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 8d904e4eef15..52d9f2ce20b0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* Set up the XDR head */ rqstp->rq_arg.head[0].iov_base = page_address(page); - rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); + rqstp->rq_arg.head[0].iov_len = + min_t(size_t, byte_count, ctxt->sge[0].length); rqstp->rq_arg.len = byte_count; rqstp->rq_arg.buflen = byte_count; @@ -85,7 +87,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, page = ctxt->pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; - bc -= min(bc, ctxt->sge[sge_no].length); + bc -= min_t(u32, bc, ctxt->sge[sge_no].length); rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; sge_no++; } @@ -113,291 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -/* Encode a read-chunk-list as an array of IB SGE - * - * Assumptions: - * - chunk[0]->position points to pages[0] at an offset of 0 - * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * - sge array pointing into pages[] array. - * - chunk_sge array specifying sge index and count for each - * chunk in the read list - * - */ -static int map_read_chunks(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) { - int sge_no; - int sge_bytes; - int page_off; - int page_no; - int ch_bytes; - int ch_no; - struct rpcrdma_read_chunk *ch; + if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == + RDMA_TRANSPORT_IWARP) + return 1; + else + return min_t(int, sge_count, xprt->sc_max_sge); +} - sge_no = 0; - page_no = 0; - page_off = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch_no = 0; - ch_bytes = ntohl(ch->rc_target.rs_length); - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = ch_bytes; - head->arg.len = rqstp->rq_arg.len + ch_bytes; - head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; - head->count++; - chl_map->ch[0].start = 0; - while (byte_count) { - rpl_map->sge[sge_no].iov_base = - page_address(rqstp->rq_arg.pages[page_no]) + page_off; - sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); - rpl_map->sge[sge_no].iov_len = sge_bytes; - /* - * Don't bump head->count here because the same page - * may be used by multiple SGE. 
- */ - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; +typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last); + +/* Issue an RDMA_READ using the local lkey to map the data sink */ +static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) +{ + struct ib_send_wr read_wr; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; + + ctxt->direction = DMA_FROM_DEVICE; + ctxt->read_hdr = head; + pages_needed = + min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; rqstp->rq_next_page = rqstp->rq_respages + 1; + ctxt->sge[pno].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], pg_off, + PAGE_SIZE - pg_off, + DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[pno].addr); + if (ret) + goto err; + atomic_inc(&xprt->sc_dma_used); - byte_count -= sge_bytes; - ch_bytes -= sge_bytes; - sge_no++; - /* - * If all bytes for this chunk have been mapped to an - * SGE, move to the next SGE - */ - if (ch_bytes == 0) { - chl_map->ch[ch_no].count = - sge_no - chl_map->ch[ch_no].start; - ch_no++; - ch++; - chl_map->ch[ch_no].start = sge_no; - ch_bytes = ntohl(ch->rc_target.rs_length); - /* If bytes remaining account for next chunk */ - if (byte_count) { - head->arg.page_len += ch_bytes; - head->arg.len += ch_bytes; - head->arg.buflen += ch_bytes; - } + /* The lkey here is either a local dma lkey or a dma_mr lkey */ + ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].length = len; + ctxt->count++; + + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; } - /* - * If this SGE consumed all of the page, move to the - * next page - */ - if ((sge_bytes + page_off) == PAGE_SIZE) { - page_no++; - page_off = 0; - /* - * If there are still bytes left to map, bump - * the page count - */ - if (byte_count) - head->count++; - } else - page_off += sge_bytes; + rs_length -= len; } - BUG_ON(byte_count != 0); - return sge_no; + + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = pages_needed; + + ret = svc_rdma_send(xprt, &read_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; + } + + /* return current location in page array */ + 
*page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + return ret; } -/* Map a read-chunk-list to an XDR and fast register the page-list. - * - * Assumptions: - * - chunk[0] position points to pages[0] at an offset of 0 - * - pages[] will be made physically contiguous by creating a one-off memory - * region using the fastreg verb. - * - byte_count is # of bytes in read-chunk-list - * - ch_count is # of chunks in read-chunk-list - * - * Output: - * - sge array pointing into pages[] array. - * - chunk_sge array specifying sge index and count for each - * chunk in the read list - */ -static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, +/* Issue an RDMA_READ using an FRMR to map the data sink */ +static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) { - int page_no; - int ch_no; - u32 offset; - struct rpcrdma_read_chunk *ch; - struct svc_rdma_fastreg_mr *frmr; - int ret = 0; + struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; + struct ib_send_wr fastreg_wr; + u8 key; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; - frmr = svc_rdma_get_frmr(xprt); if (IS_ERR(frmr)) return -ENOMEM; - head->frmr = frmr; - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = byte_count; - head->arg.len = rqstp->rq_arg.len + byte_count; - head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = frmr; + pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); - /* Fast register the page list */ - frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); frmr->direction = DMA_FROM_DEVICE; frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); - frmr->map_len = byte_count; - frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; - for (page_no = 0; page_no < frmr->page_list_len; page_no++) { - frmr->page_list->page_list[page_no] = + frmr->map_len = pages_needed << PAGE_SHIFT; + frmr->page_list_len = pages_needed; + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + frmr->page_list->page_list[pno] = ib_dma_map_page(xprt->sc_cm_id->device, - rqstp->rq_arg.pages[page_no], 0, + head->arg.pages[pg_no], 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + 
frmr->page_list->page_list[pno]); + if (ret) + goto err; atomic_inc(&xprt->sc_dma_used); - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - } - head->count += page_no; - - /* rq_respages points one past arg pages */ - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Create the reply and chunk maps */ - offset = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - for (ch_no = 0; ch_no < ch_count; ch_no++) { - int len = ntohl(ch->rc_target.rs_length); - rpl_map->sge[ch_no].iov_base = frmr->kva + offset; - rpl_map->sge[ch_no].iov_len = len; - chl_map->ch[ch_no].count = 1; - chl_map->ch[ch_no].start = ch_no; - offset += len; - ch++; + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; + } + rs_length -= len; } - ret = svc_rdma_fastreg(xprt, frmr); - if (ret) - goto fatal_err; - - return ch_no; - - fatal_err: - printk("svcrdma: error fast registering xdr for xprt %p", xprt); - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - -static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, - struct svc_rdma_op_ctxt *ctxt, - struct svc_rdma_fastreg_mr *frmr, - struct kvec *vec, - u64 *sgl_offset, - int count) -{ - int i; - unsigned long off; + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - ctxt->count = count; - ctxt->direction = DMA_FROM_DEVICE; - for (i = 0; i < count; i++) { - ctxt->sge[i].length = 0; /* in case map fails */ - if (!frmr) { - BUG_ON(!virt_to_page(vec[i].iov_base)); - off = (unsigned long)vec[i].iov_base & ~PAGE_MASK; - ctxt->sge[i].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(vec[i].iov_base), - off, - vec[i].iov_len, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - ctxt->sge[i].addr)) - return -EINVAL; - ctxt->sge[i].lkey = xprt->sc_dma_lkey; - atomic_inc(&xprt->sc_dma_used); - } else { - ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; - ctxt->sge[i].lkey = frmr->mr->lkey; - } - ctxt->sge[i].length = vec[i].iov_len; - *sgl_offset = *sgl_offset + vec[i].iov_len; + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset; + ctxt->sge[0].lkey = frmr->mr->lkey; + ctxt->sge[0].length = read; + ctxt->count = 1; + ctxt->read_hdr = head; + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + fastreg_wr.next = &read_wr; + + /* Prepare RDMA_READ */ + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = 1; + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + read_wr.wr_id = (unsigned long)ctxt; + read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; + } else { + read_wr.opcode = IB_WR_RDMA_READ; + read_wr.next = 
&inv_wr; + /* Prepare invalidate */ + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = (unsigned long)ctxt; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = frmr->mr->lkey; + } + ctxt->wr_op = read_wr.opcode; + + /* Post the chain */ + ret = svc_rdma_send(xprt, &fastreg_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; } - return 0; -} -static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) -{ - if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == - RDMA_TRANSPORT_IWARP) && - sge_count > 1) - return 1; - else - return min_t(int, sge_count, xprt->sc_max_sge); + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + svc_rdma_put_frmr(xprt, frmr); + return ret; } -/* - * Use RDMA_READ to read data from the advertised client buffer into the - * XDR stream starting at rq_arg.head[0].iov_base. - * Each chunk in the array - * contains the following fields: - * discrim - '1', This isn't used for data placement - * position - The xdr stream offset (the same for every chunk) - * handle - RMR for client memory region - * length - data transfer length - * offset - 64 bit tagged offset in remote memory region - * - * On our side, we need to read into a pagelist. The first page immediately - * follows the RPC header. - * - * This function returns: - * 0 - No error and no read-list found. - * - * 1 - Successful read-list processing. The data is not yet in - * the pagelist and therefore the RPC request must be deferred. The - * I/O completion will enqueue the transport again and - * svc_rdma_recvfrom will complete the request. - * - * <0 - Error processing/posting read-list. - * - * NOTE: The ctxt must not be touched after the last WR has been posted - * because the I/O completion processing may occur on another - * processor and free / modify the context. Ne touche pas! - */ -static int rdma_read_xdr(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *hdr_ctxt) +static int rdma_read_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) { - struct ib_send_wr read_wr; - struct ib_send_wr inv_wr; - int err = 0; - int ch_no; - int ch_count; - int byte_count; - int sge_count; - u64 sgl_offset; + int page_no, ch_count, ret; struct rpcrdma_read_chunk *ch; - struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_req_map *rpl_map; - struct svc_rdma_req_map *chl_map; + u32 page_offset, byte_count; + u64 rs_offset; + rdma_reader_fn reader; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); @@ -408,122 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; - /* Allocate temporary reply and chunk maps */ - rpl_map = svc_rdma_get_req_map(); - chl_map = svc_rdma_get_req_map(); + /* The request is completed when the RDMA_READs complete. The + * head context keeps all the pages that comprise the + * request. 
+ */ + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; - if (!xprt->sc_frmr_pg_list_len) - sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); + /* Use FRMR if supported */ + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) + reader = rdma_read_chunk_frmr; else - sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); - if (sge_count < 0) { - err = -EIO; - goto out; - } - - sgl_offset = 0; - ch_no = 0; + reader = rdma_read_chunk_lcl; + page_no = 0; page_offset = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch->rc_discrim != 0; ch++, ch_no++) { - u64 rs_offset; -next_sge: - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_FROM_DEVICE; - ctxt->frmr = hdr_ctxt->frmr; - ctxt->read_hdr = NULL; - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + ch->rc_discrim != 0; ch++) { - /* Prepare READ WR */ - memset(&read_wr, 0, sizeof read_wr); - read_wr.wr_id = (unsigned long)ctxt; - read_wr.opcode = IB_WR_RDMA_READ; - ctxt->wr_op = read_wr.opcode; - read_wr.send_flags = IB_SEND_SIGNALED; - read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle); xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, &rs_offset); - read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset; - read_wr.sg_list = ctxt->sge; - read_wr.num_sge = - rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); - err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, - &rpl_map->sge[chl_map->ch[ch_no].start], - &sgl_offset, - read_wr.num_sge); - if (err) { - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; - } - if (((ch+1)->rc_discrim == 0) && - (read_wr.num_sge == chl_map->ch[ch_no].count)) { - /* - * Mark the last RDMA_READ with a bit to - * indicate all RPC data has been fetched from - * the client and the RPC needs to be enqueued. - */ - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - if (hdr_ctxt->frmr) { - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - /* - * Invalidate the local MR used to map the data - * sink. 
- */ - if (xprt->sc_dev_caps & - SVCRDMA_DEVCAP_READ_W_INV) { - read_wr.opcode = - IB_WR_RDMA_READ_WITH_INV; - ctxt->wr_op = read_wr.opcode; - read_wr.ex.invalidate_rkey = - ctxt->frmr->mr->lkey; - } else { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - hdr_ctxt->frmr->mr->lkey; - read_wr.next = &inv_wr; - } - } - ctxt->read_hdr = hdr_ctxt; - } - /* Post the read */ - err = svc_rdma_send(xprt, &read_wr); - if (err) { - printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", - err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; + byte_count = ntohl(ch->rc_target.rs_length); + + while (byte_count > 0) { + ret = reader(xprt, rqstp, head, + &page_no, &page_offset, + ntohl(ch->rc_target.rs_handle), + byte_count, rs_offset, + ((ch+1)->rc_discrim == 0) /* last */ + ); + if (ret < 0) + goto err; + byte_count -= ret; + rs_offset += ret; + head->arg.buflen += ret; } - atomic_inc(&rdma_stat_read); - - if (read_wr.num_sge < chl_map->ch[ch_no].count) { - chl_map->ch[ch_no].count -= read_wr.num_sge; - chl_map->ch[ch_no].start += read_wr.num_sge; - goto next_sge; - } - sgl_offset = 0; - err = 1; } - - out: - svc_rdma_put_req_map(rpl_map); - svc_rdma_put_req_map(chl_map); - + ret = 1; + err: /* Detach arg pages. svc_recv will replenish them */ - for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) - rqstp->rq_pages[ch_no] = NULL; + for (page_no = 0; + &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) + rqstp->rq_pages[page_no] = NULL; - return err; + return ret; } static int rdma_read_complete(struct svc_rqst *rqstp, @@ -595,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_op_ctxt, dto_q); list_del_init(&ctxt->dto_q); - } - if (ctxt) { spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); return rdma_read_complete(rqstp, ctxt); - } - - if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, struct svc_rdma_op_ctxt, dto_q); @@ -621,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) goto close_out; - BUG_ON(ret); goto out; } dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", @@ -644,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } /* Read read-list data. */ - ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); + ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { /* read-list posted, defer until data received from client. */ goto defer; - } - if (ret < 0) { + } else if (ret < 0) { /* Post of read-list failed, free context. */ svc_rdma_put_context(ctxt, 1); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 7e024a51617e..49fd21a5c215 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -49,152 +50,6 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* Encode an XDR as an array of IB SGE - * - * Assumptions: - * - head[0] is physically contiguous. - * - tail[0] is physically contiguous. 
- * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * SGE[0] reserved for RCPRDMA header - * SGE[1] data from xdr->head[] - * SGE[2..sge_count-2] data from xdr->pages[] - * SGE[sge_count-1] data from xdr->tail. - * - * The max SGE we need is the length of the XDR / pagesize + one for - * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES - * reserves a page for both the request and the reply header, and this - * array is only concerned with the reply we are assured that we have - * on extra page for the RPCRMDA header. - */ -static int fast_reg_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) -{ - int sge_no; - u32 sge_bytes; - u32 page_bytes; - u32 page_off; - int page_no = 0; - u8 *frva; - struct svc_rdma_fastreg_mr *frmr; - - frmr = svc_rdma_get_frmr(xprt); - if (IS_ERR(frmr)) - return -ENOMEM; - vec->frmr = frmr; - - /* Skip the RPCRDMA header */ - sge_no = 1; - - /* Map the head. */ - frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - vec->count = 2; - sge_no++; - - /* Map the XDR head */ - frmr->kva = frva; - frmr->direction = DMA_TO_DEVICE; - frmr->access_flags = 0; - frmr->map_len = PAGE_SIZE; - frmr->page_list_len = 1; - page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(xdr->head[0].iov_base), - page_off, - PAGE_SIZE - page_off, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - - /* Map the XDR page list */ - page_off = xdr->page_base; - page_bytes = xdr->page_len + page_off; - if (!page_bytes) - goto encode_tail; - - /* Map the pages */ - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - vec->sge[sge_no].iov_len = page_bytes; - sge_no++; - while (page_bytes) { - struct page *page; - - page = xdr->pages[page_no++]; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - page, page_off, - sge_bytes, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - - atomic_inc(&xprt->sc_dma_used); - page_off = 0; /* reset for next time through loop */ - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - vec->count++; - - encode_tail: - /* Map tail */ - if (0 == xdr->tail[0].iov_len) - goto done; - - vec->count++; - vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; - - if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == - ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { - /* - * If head and tail use the same page, we don't need - * to map it again. 
- */ - vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; - } else { - void *va; - - /* Map another page for the tail */ - page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; - va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va), - page_off, - PAGE_SIZE, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - - done: - if (svc_rdma_fastreg(xprt, frmr)) - goto fatal_err; - - return 0; - - fatal_err: - printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); - vec->frmr = NULL; - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - static int map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) @@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt, BUG_ON(xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); - if (xprt->sc_frmr_pg_list_len) - return fast_reg_xdr(xprt, xdr, vec); - /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; @@ -282,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, } /* Assumptions: - * - We are using FRMR - * - or - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -327,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_bytes = min_t(size_t, bc, vec->sge[xdr_sge_no].iov_len-sge_off); sge[sge_no].length = sge_bytes; - if (!vec->frmr) { - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; - } else { - sge[sge_no].addr = (unsigned long) - vec->sge[xdr_sge_no].iov_base + sge_off; - sge[sge_no].lkey = vec->frmr->mr->lkey; - } + sge[sge_no].addr = + dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; ctxt->count++; - ctxt->frmr = vec->frmr; sge_off = 0; sge_no++; xdr_sge_no++; @@ -369,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, return 0; err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(xprt, vec->frmr); svc_rdma_put_context(ctxt, 0); /* Fatal error, close transport */ return -EIO; @@ -397,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; @@ -472,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* xdr offset starts at RPC message */ nchunks = 
ntohl(arg_ary->wc_nchunks); @@ -545,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; - struct ib_send_wr inv_wr; int sge_no; int sge_bytes; int page_no; @@ -559,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma, "svcrdma: could not post a receive buffer, err=%d." "Closing transport %p.\n", ret, rdma); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 0); return -ENOTCONN; } @@ -567,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; - ctxt->frmr = vec->frmr; - if (vec->frmr) - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - else - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_dma_lkey; @@ -590,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma, int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; - if (!vec->frmr) { - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) - goto err; - atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; - } else { - ctxt->sge[sge_no].addr = (unsigned long) - vec->sge[sge_no].iov_base; - ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; - } + ctxt->sge[sge_no].addr = + dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } BUG_ON(byte_count != 0); @@ -627,6 +450,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[page_no+1].length = 0; } rqstp->rq_next_page = rqstp->rq_respages + 1; + BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; @@ -635,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - if (vec->frmr) { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - vec->frmr->mr->lkey; - send_wr.next = &inv_wr; - } ret = svc_rdma_send(rdma, &send_wr); if (ret) @@ -653,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma, err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 1); return -EIO; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 02db8d9cc994..e7323fbbd348 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -162,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } map->count = 0; - map->frmr = NULL; return map; } @@ -338,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt, switch (ctxt->wr_op) { case IB_WR_SEND: - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 1); break; case IB_WR_RDMA_WRITE: + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: + svc_rdma_put_frmr(xprt, ctxt->frmr); if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; BUG_ON(!read_hdr); - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); spin_lock_bh(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); list_add_tail(&read_hdr->dto_q, @@ -365,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt, break; default: + BUG_ON(1); printk(KERN_ERR "svcrdma: unexpected completion type, " "opcode=%d\n", ctxt->wr_op); @@ -380,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt, static void sq_cq_reap(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - struct ib_wc wc; + struct ib_wc wc_a[6]; + struct ib_wc *wc; struct ib_cq *cq = xprt->sc_sq_cq; int ret; + memset(wc_a, 0, sizeof(wc_a)); + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - if (wc.status != IB_WC_SUCCESS) - /* Close the transport */ - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) { + int i; - /* Decrement used SQ WR count */ - atomic_dec(&xprt->sc_sq_count); - wake_up(&xprt->sc_send_wait); + for (i = 0; i < ret; i++) { + wc = &wc_a[i]; + if (wc->status != IB_WC_SUCCESS) { + dprintk("svcrdma: sq wc err status %d\n", + wc->status); - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - if (ctxt) - process_context(xprt, ctxt); + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + } - svc_xprt_put(&xprt->sc_xprt); + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + ctxt = (struct svc_rdma_op_ctxt *) + (unsigned long)wc->wr_id; + if (ctxt) + process_context(xprt, ctxt); + + svc_xprt_put(&xprt->sc_xprt); + } } if (ctxt) @@ -995,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) need_dma_mr = 0; break; case RDMA_TRANSPORT_IB: - if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else if (!(devattr.device_cap_flags & + IB_DEVICE_LOCAL_DMA_LKEY)) { need_dma_mr = 1; dma_mr_acc = IB_ACCESS_LOCAL_WRITE; } else @@ -1192,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) container_of(xprt, struct svcxprt_rdma, sc_xprt); /* - * If there are fewer SQ WR available than required to send a - * simple response, return false. - */ - if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) - return 0; - - /* - * ...or there are already waiters on the SQ, + * If there are already waiters on the SQ, * return false. 
*/ if (waitqueue_active(&rdma->sc_send_wait)) -- cgit v1.2.3-71-gd317 From 8b8b36834d0fff67fc8668093f4312dd04dcf21d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Jun 2014 09:46:00 -0400 Subject: ring-buffer: Check if buffer exists before polling The per_cpu buffers are created for each possible CPU, but that does not mean those CPUs are online, or even that they exist. The ring buffer polling code, however, assumes that the caller polls on an existing buffer. That is not the case if the user reads trace_pipe from a CPU that does not exist, and this causes the kernel to crash. The simple fix is to check the cpu against the buffer bitmask to see whether the buffer was allocated, and to return -ENODEV if it was not. Further updates pass the -ENODEV back up to userspace. Link: http://lkml.kernel.org/r/5393DB61.6060707@oracle.com Reported-by: Sasha Levin Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 5 ++++- kernel/trace/trace.c | 22 ++++++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d69cf637a15a..49a4d6f59108 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) -void ring_buffer_wait(struct ring_buffer *buffer, int cpu); +int ring_buffer_wait(struct ring_buffer *buffer, int cpu); int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c634868c2921..7c56c3d06943 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer.
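 * (Editor's sketch, not part of the patch: with this change a caller of
 * ring_buffer_wait() must check the return value rather than assume the
 * buffer exists:
 *
 *	ret = ring_buffer_wait(buffer, cpu);
 *	if (ret)
 *		return ret;
 *
 * ret is -ENODEV when the CPU has no allocated buffer; the trace.c hunks
 * below propagate it to userspace via wait_on_pipe().)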
*/ -void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); @@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } @@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) schedule(); finish_wait(&work->waiters, &wait); + return 0; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 16f7038d1f4d..56422f1decba 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1085,13 +1085,13 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static void wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return; + return 0; - ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -4378,6 +4378,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; + int ret; while (trace_empty(iter)) { @@ -4399,10 +4400,13 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&iter->mutex); + if (ret) + return ret; + if (signal_pending(current)) return -EINTR; } @@ -5327,8 +5331,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) { + size = ret; + goto out_unlock; + } if (signal_pending(current)) { size = -EINTR; goto out_unlock; @@ -5538,8 +5546,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) + goto out; if (signal_pending(current)) { ret = -EINTR; goto out; -- cgit v1.2.3-71-gd317 From 962bd40bc30e412828e091bfda041b7547e779c8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2014 12:24:40 -0400 Subject: locks: add missing memory barrier in break_deleg break_deleg is subject to the same potential race as break_lease. Add a memory barrier to prevent it. Signed-off-by: Jeff Layton --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c3f46e499dd0..22ae79650b82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1914,6 +1914,12 @@ static inline int break_lease(struct inode *inode, unsigned int mode) static inline int break_deleg(struct inode *inode, unsigned int mode) { + /* + * Since this check is lockless, we must ensure that any refcounts + * taken are done before checking inode->i_flock. Otherwise, we could + * end up racing with tasks trying to set a new lease on this file. 
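 * (Editor's illustration, not part of the original comment: the barrier
 * pairing this depends on, shown schematically. The writer side is
 * hypothetical shorthand for the lease-setting path; only the barrier
 * placement is the point.
 *
 *	reader (break_deleg)		writer (lease setter)
 *	--------------------		---------------------
 *	take reference			inode->i_flock = lease
 *	smp_mb()			smp_mb()
 *	load inode->i_flock		check for conflicting refs
 *
 * With both barriers, at least one side is guaranteed to observe the
 * other's store, so a new lease and a new reference cannot be missed
 * simultaneously.)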
+ */ + smp_mb(); if (inode->i_flock) return __break_lease(inode, mode, FL_DELEG); return 0; -- cgit v1.2.3-71-gd317 From b5097e956a4d2919ee248d6481e4204c5568ed5c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 10 Jun 2014 20:04:50 +0200 Subject: block: add __init to elv_register elv_register is only called by elevator init functions: __init cfq_init __init deadline_init __init noop_init Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- include/linux/elevator.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/elevator.c b/block/elevator.c index 1e01b66a0b92..f35edddfe9b5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -845,7 +845,7 @@ void elv_unregister_queue(struct request_queue *q) } EXPORT_SYMBOL(elv_unregister_queue); -int elv_register(struct elevator_type *e) +int __init elv_register(struct elevator_type *e) { char *def = ""; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index df63bd3a8cf1..4ff262e2bf37 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,7 +144,7 @@ extern void elv_drain_elevator(struct request_queue *); * io scheduler registration */ extern void __init load_default_elevator_module(void); -extern int elv_register(struct elevator_type *); +extern int __init elv_register(struct elevator_type *); extern void elv_unregister(struct elevator_type *); /* -- cgit v1.2.3-71-gd317 From 87a1ef8058d9ab26bc289ea4f27bc3c11ce9acb8 Mon Sep 17 00:00:00 2001 From: David Cohen Date: Tue, 15 Apr 2014 13:06:05 -0700 Subject: watchdog: add Intel MID watchdog driver support Add initial Intel MID watchdog driver support. This driver is an initial implementation of a generic Intel MID watchdog driver; currently it supports only the Intel Merrifield platform. Signed-off-by: Eric Ernst Signed-off-by: David Cohen Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 13 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/intel-mid_wdt.c | 184 ++++++++++++++++++++++++++++ include/linux/platform_data/intel-mid_wdt.h | 22 ++++ 4 files changed, 220 insertions(+) create mode 100644 drivers/watchdog/intel-mid_wdt.c create mode 100644 include/linux/platform_data/intel-mid_wdt.h (limited to 'include/linux') diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index cbd5ac7b8832..c845527b503a 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -665,6 +665,19 @@ config INTEL_SCU_WATCHDOG To compile this driver as a module, choose M here. +config INTEL_MID_WATCHDOG + tristate "Intel MID Watchdog Timer" + depends on X86_INTEL_MID + select WATCHDOG_CORE + ---help--- + Watchdog timer driver built into the Intel SCU for Intel MID + Platforms. + + This driver currently supports only the watchdog evolution + implementation in SCU, available for Merrifield generation. + + To compile this driver as a module, choose M here.
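/*
 * Editor's sketch, not part of the patch: how board code might describe
 * the watchdog to this driver through the new platform data. The device
 * name "intel_mid_wdt" and the pdata layout come from the patch below;
 * the IRQ number and the function names here are hypothetical.
 */
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/platform_data/intel-mid_wdt.h>

static struct intel_mid_wdt_pdata mrfld_wdt_pdata = {
	.irq	= 36,		/* hypothetical SCU watchdog warning IRQ */
	.probe	= NULL,		/* optional platform-specific setup hook */
};

static struct platform_device mrfld_wdt_dev = {
	.name	= "intel_mid_wdt",
	.id	= -1,
	.dev	= {
		.platform_data = &mrfld_wdt_pdata,
	},
};

static int __init mrfld_wdt_init(void)
{
	return platform_device_register(&mrfld_wdt_dev);
}
device_initcall(mrfld_wdt_init);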
+ config ITCO_WDT tristate "Intel TCO Timer/Watchdog" depends on (X86 || IA64) && PCI diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 1384531eaa45..7b8a91ed20e7 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -112,6 +112,7 @@ obj-$(CONFIG_W83977F_WDT) += w83977f_wdt.o obj-$(CONFIG_MACHZ_WDT) += machzwd.o obj-$(CONFIG_SBC_EPX_C3_WATCHDOG) += sbc_epx_c3.o obj-$(CONFIG_INTEL_SCU_WATCHDOG) += intel_scu_watchdog.o +obj-$(CONFIG_INTEL_MID_WATCHDOG) += intel-mid_wdt.o # M32R Architecture diff --git a/drivers/watchdog/intel-mid_wdt.c b/drivers/watchdog/intel-mid_wdt.c new file mode 100644 index 000000000000..ca66e8e74635 --- /dev/null +++ b/drivers/watchdog/intel-mid_wdt.c @@ -0,0 +1,184 @@ +/* + * intel-mid_wdt: generic Intel MID SCU watchdog driver + * + * Platforms supported so far: + * - Merrifield only + * + * Copyright (C) 2014 Intel Corporation. All rights reserved. + * Contact: David Cohen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General + * Public License as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define IPC_WATCHDOG 0xf8 + +#define MID_WDT_PRETIMEOUT 15 +#define MID_WDT_TIMEOUT_MIN (1 + MID_WDT_PRETIMEOUT) +#define MID_WDT_TIMEOUT_MAX 170 +#define MID_WDT_DEFAULT_TIMEOUT 90 + +/* SCU watchdog messages */ +enum { + SCU_WATCHDOG_START = 0, + SCU_WATCHDOG_STOP, + SCU_WATCHDOG_KEEPALIVE, +}; + +static inline int wdt_command(int sub, u32 *in, int inlen) +{ + return intel_scu_ipc_command(IPC_WATCHDOG, sub, in, inlen, NULL, 0); +} + +static int wdt_start(struct watchdog_device *wd) +{ + int ret, in_size; + int timeout = wd->timeout; + struct ipc_wd_start { + u32 pretimeout; + u32 timeout; + } ipc_wd_start = { timeout - MID_WDT_PRETIMEOUT, timeout }; + + /* + * SCU expects the input size for watchdog IPC to + * be based on 4 bytes + */ + in_size = DIV_ROUND_UP(sizeof(ipc_wd_start), 4); + + ret = wdt_command(SCU_WATCHDOG_START, (u32 *)&ipc_wd_start, in_size); + if (ret) { + struct device *dev = watchdog_get_drvdata(wd); + dev_crit(dev, "error starting watchdog: %d\n", ret); + } + + return ret; +} + +static int wdt_ping(struct watchdog_device *wd) +{ + int ret; + + ret = wdt_command(SCU_WATCHDOG_KEEPALIVE, NULL, 0); + if (ret) { + struct device *dev = watchdog_get_drvdata(wd); + dev_crit(dev, "Error executing keepalive: 0x%x\n", ret); + } + + return ret; +} + +static int wdt_stop(struct watchdog_device *wd) +{ + int ret; + + ret = wdt_command(SCU_WATCHDOG_STOP, NULL, 0); + if (ret) { + struct device *dev = watchdog_get_drvdata(wd); + dev_crit(dev, "Error stopping watchdog: 0x%x\n", ret); + } + + return ret; +} + +static irqreturn_t mid_wdt_irq(int irq, void *dev_id) +{ + panic("Kernel Watchdog"); + + /* This code should not be reached */ + return IRQ_HANDLED; +} + +static const struct watchdog_info mid_wdt_info = { + .identity = "Intel MID SCU watchdog", + .options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT, +}; + +static const struct watchdog_ops mid_wdt_ops = { + .owner = THIS_MODULE, + .start = wdt_start, + .stop = wdt_stop, + .ping = wdt_ping, +}; + +static int mid_wdt_probe(struct platform_device *pdev) +{ + struct watchdog_device *wdt_dev; + struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; + int ret; + + if (!pdata) { + dev_err(&pdev->dev, "missing platform data\n"); + return -EINVAL; + } + + if (pdata->probe) { + ret = pdata->probe(pdev); + if (ret) + 
return ret; + } + + wdt_dev = devm_kzalloc(&pdev->dev, sizeof(*wdt_dev), GFP_KERNEL); + if (!wdt_dev) + return -ENOMEM; + + wdt_dev->info = &mid_wdt_info; + wdt_dev->ops = &mid_wdt_ops; + wdt_dev->min_timeout = MID_WDT_TIMEOUT_MIN; + wdt_dev->max_timeout = MID_WDT_TIMEOUT_MAX; + wdt_dev->timeout = MID_WDT_DEFAULT_TIMEOUT; + + watchdog_set_drvdata(wdt_dev, &pdev->dev); + platform_set_drvdata(pdev, wdt_dev); + + ret = devm_request_irq(&pdev->dev, pdata->irq, mid_wdt_irq, + IRQF_SHARED | IRQF_NO_SUSPEND, "watchdog", + wdt_dev); + if (ret) { + dev_err(&pdev->dev, "error requesting warning irq %d\n", + pdata->irq); + return ret; + } + + ret = watchdog_register_device(wdt_dev); + if (ret) { + dev_err(&pdev->dev, "error registering watchdog device\n"); + return ret; + } + + dev_info(&pdev->dev, "Intel MID watchdog device probed\n"); + + return 0; +} + +static int mid_wdt_remove(struct platform_device *pdev) +{ + struct watchdog_device *wd = platform_get_drvdata(pdev); + watchdog_unregister_device(wd); + return 0; +} + +static struct platform_driver mid_wdt_driver = { + .probe = mid_wdt_probe, + .remove = mid_wdt_remove, + .driver = { + .owner = THIS_MODULE, + .name = "intel_mid_wdt", + }, +}; + +module_platform_driver(mid_wdt_driver); + +MODULE_AUTHOR("David Cohen "); +MODULE_DESCRIPTION("Watchdog Driver for Intel MID platform"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/platform_data/intel-mid_wdt.h b/include/linux/platform_data/intel-mid_wdt.h new file mode 100644 index 000000000000..b98253466ace --- /dev/null +++ b/include/linux/platform_data/intel-mid_wdt.h @@ -0,0 +1,22 @@ +/* + * intel-mid_wdt: generic Intel MID SCU watchdog driver + * + * Copyright (C) 2014 Intel Corporation. All rights reserved. + * Contact: David Cohen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General + * Public License as published by the Free Software Foundation. + */ + +#ifndef __INTEL_MID_WDT_H__ +#define __INTEL_MID_WDT_H__ + +#include + +struct intel_mid_wdt_pdata { + int irq; + int (*probe)(struct platform_device *pdev); +}; + +#endif /*__INTEL_MID_WDT_H__*/ -- cgit v1.2.3-71-gd317 From 23adbe12ef7d3d4195e80800ab36b37bee28cd03 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Jun 2014 12:45:42 -0700 Subject: fs,userns: Change inode_capable to capable_wrt_inode_uidgid The kernel has no concept of capabilities with respect to inodes; inodes exist independently of namespaces. For example, inode_capable(inode, CAP_LINUX_IMMUTABLE) would be nonsense. This patch changes inode_capable to check for uid and gid mappings and renames it to capable_wrt_inode_uidgid, which should make it more obvious what it does. Fixes CVE-2014-4014. Cc: Theodore Ts'o Cc: Serge Hallyn Cc: "Eric W. 
Biederman" Cc: Dave Chinner Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- fs/attr.c | 8 ++++---- fs/inode.c | 10 +++++++--- fs/namei.c | 11 ++++++----- fs/xfs/xfs_ioctl.c | 2 +- include/linux/capability.h | 2 +- kernel/capability.c | 20 ++++++++------------ 6 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/attr.c b/fs/attr.c index 5d4e59d56e85..6530ced19697 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -50,14 +50,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || !uid_eq(attr->ia_uid, inode->i_uid)) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -67,7 +67,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -160,7 +160,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) umode_t mode = attr->ia_mode; if (!in_group_p(inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/fs/inode.c b/fs/inode.c index 2feb9b69f1be..6eecb7ff0b9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1839,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner); * inode_owner_or_capable - check current task permissions to inode * @inode: inode being checked * - * Return true if current either has CAP_FOWNER to the inode, or - * owns the file. + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. */ bool inode_owner_or_capable(const struct inode *inode) { + struct user_namespace *ns; + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (inode_capable(inode, CAP_FOWNER)) + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) return true; return false; } diff --git a/fs/namei.c b/fs/namei.c index 80168273396b..985c6f368485 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -332,10 +332,11 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, + CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -345,7 +346,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. 
*/ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -353,7 +354,7 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -2379,7 +2380,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; - return !inode_capable(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0b18776b075e..6152cbe353e8 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1215,7 +1215,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !inode_capable(VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* diff --git a/include/linux/capability.h b/include/linux/capability.h index a6ee1f9a5018..84b13ad67c1c 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -210,7 +210,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); -extern bool inode_capable(const struct inode *inode, int cap); +extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); /* audit system wants to get cap info from files as well */ diff --git a/kernel/capability.c b/kernel/capability.c index 84b2bbf443e7..a5cf13c018ce 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -424,23 +424,19 @@ bool capable(int cap) EXPORT_SYMBOL(capable); /** - * inode_capable - Check superior capability over inode + * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question * - * Return true if the current task has the given superior capability - * targeted at it's own user namespace and that the given inode is owned - * by the current user namespace or a child namespace. - * - * Currently we check to see if an inode is owned by the current - * user namespace by seeing if the inode's owner maps into the - * current user namespace. - * + * Return true if the current task has the given capability targeted at + * its own user namespace and that the given inode's uid and gid are + * mapped into the current user namespace. */ -bool inode_capable(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); } -EXPORT_SYMBOL(inode_capable); +EXPORT_SYMBOL(capable_wrt_inode_uidgid); -- cgit v1.2.3-71-gd317 From 602cb5bbae9868fe48989efa78aca62415309fcf Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 10 Jun 2014 15:18:40 -0700 Subject: mfd/rtc: sec/s5m: rename SEC* symbols to S5M Prepare for adding support for S2MPS14 RTC device to the rtc-s5m driver: 1. Rename SEC* symbols to S5M. 2. 
Add S5M prefix to some of defines which are different between S5M876X and S2MPS14. This is only a rename-like patch, new code is not added. Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Cc: Kyungmin Park Cc: Alessandro Zummo Cc: Sangbeom Kim Cc: Samuel Ortiz Cc: Marek Szyprowski Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-s5m.c | 66 +++++++++++++++++------------------ include/linux/mfd/samsung/rtc.h | 76 ++++++++++++++++++++--------------------- 2 files changed, 71 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c index 8ec2d6a1dbe1..b37df8c790f2 100644 --- a/drivers/rtc/rtc-s5m.c +++ b/drivers/rtc/rtc-s5m.c @@ -30,10 +30,10 @@ /* * Maximum number of retries for checking changes in UDR field - * of SEC_RTC_UDR_CON register (to limit possible endless loop). + * of S5M_RTC_UDR_CON register (to limit possible endless loop). * * After writing to RTC registers (setting time or alarm) read the UDR field - * in SEC_RTC_UDR_CON register. UDR is auto-cleared when data have + * in S5M_RTC_UDR_CON register. UDR is auto-cleared when data have * been transferred. */ #define UDR_READ_RETRY_CNT 5 @@ -54,7 +54,7 @@ static const struct regmap_config s5m_rtc_regmap_config = { .reg_bits = 8, .val_bits = 8, - .max_register = SEC_RTC_REG_MAX, + .max_register = S5M_RTC_REG_MAX, }; static const struct regmap_config s2mps14_rtc_regmap_config = { @@ -119,8 +119,8 @@ static inline int s5m8767_wait_for_udr_update(struct s5m_rtc_info *info) unsigned int data; do { - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &data); - } while (--retry && (data & RTC_UDR_MASK) && !ret); + ret = regmap_read(info->regmap, S5M_RTC_UDR_CON, &data); + } while (--retry && (data & S5M_RTC_UDR_MASK) && !ret); if (!retry) dev_err(info->dev, "waiting for UDR update, reached max number of retries\n"); @@ -133,16 +133,16 @@ static inline int s5m8767_rtc_set_time_reg(struct s5m_rtc_info *info) int ret; unsigned int data; - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &data); + ret = regmap_read(info->regmap, S5M_RTC_UDR_CON, &data); if (ret < 0) { dev_err(info->dev, "failed to read update reg(%d)\n", ret); return ret; } - data |= RTC_TIME_EN_MASK; - data |= RTC_UDR_MASK; + data |= S5M_RTC_TIME_EN_MASK; + data |= S5M_RTC_UDR_MASK; - ret = regmap_write(info->regmap, SEC_RTC_UDR_CON, data); + ret = regmap_write(info->regmap, S5M_RTC_UDR_CON, data); if (ret < 0) { dev_err(info->dev, "failed to write update reg(%d)\n", ret); return ret; @@ -158,17 +158,17 @@ static inline int s5m8767_rtc_set_alarm_reg(struct s5m_rtc_info *info) int ret; unsigned int data; - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &data); + ret = regmap_read(info->regmap, S5M_RTC_UDR_CON, &data); if (ret < 0) { dev_err(info->dev, "%s: fail to read update reg(%d)\n", __func__, ret); return ret; } - data &= ~RTC_TIME_EN_MASK; - data |= RTC_UDR_MASK; + data &= ~S5M_RTC_TIME_EN_MASK; + data |= S5M_RTC_UDR_MASK; - ret = regmap_write(info->regmap, SEC_RTC_UDR_CON, data); + ret = regmap_write(info->regmap, S5M_RTC_UDR_CON, data); if (ret < 0) { dev_err(info->dev, "%s: fail to write update reg(%d)\n", __func__, ret); @@ -218,7 +218,7 @@ static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm) u8 data[8]; int ret; - ret = regmap_bulk_read(info->regmap, SEC_RTC_SEC, data, 8); + ret = regmap_bulk_read(info->regmap, S5M_RTC_SEC, data, 8); if (ret < 0) return ret; @@ -266,7 +266,7 @@ static int s5m_rtc_set_time(struct device 
*dev, struct rtc_time *tm) 1900 + tm->tm_year, 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec, tm->tm_wday); - ret = regmap_raw_write(info->regmap, SEC_RTC_SEC, data, 8); + ret = regmap_raw_write(info->regmap, S5M_RTC_SEC, data, 8); if (ret < 0) return ret; @@ -282,20 +282,20 @@ static int s5m_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) unsigned int val; int ret, i; - ret = regmap_bulk_read(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_bulk_read(info->regmap, S5M_ALARM0_SEC, data, 8); if (ret < 0) return ret; switch (info->device_type) { case S5M8763X: s5m8763_data_to_tm(data, &alrm->time); - ret = regmap_read(info->regmap, SEC_ALARM0_CONF, &val); + ret = regmap_read(info->regmap, S5M_ALARM0_CONF, &val); if (ret < 0) return ret; alrm->enabled = !!val; - ret = regmap_read(info->regmap, SEC_RTC_STATUS, &val); + ret = regmap_read(info->regmap, S5M_RTC_STATUS, &val); if (ret < 0) return ret; @@ -318,7 +318,7 @@ static int s5m_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) } alrm->pending = 0; - ret = regmap_read(info->regmap, SEC_RTC_STATUS, &val); + ret = regmap_read(info->regmap, S5M_RTC_STATUS, &val); if (ret < 0) return ret; break; @@ -327,7 +327,7 @@ static int s5m_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) return -EINVAL; } - if (val & ALARM0_STATUS) + if (val & S5M_ALARM0_STATUS) alrm->pending = 1; else alrm->pending = 0; @@ -341,7 +341,7 @@ static int s5m_rtc_stop_alarm(struct s5m_rtc_info *info) int ret, i; struct rtc_time tm; - ret = regmap_bulk_read(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_bulk_read(info->regmap, S5M_ALARM0_SEC, data, 8); if (ret < 0) return ret; @@ -352,14 +352,14 @@ static int s5m_rtc_stop_alarm(struct s5m_rtc_info *info) switch (info->device_type) { case S5M8763X: - ret = regmap_write(info->regmap, SEC_ALARM0_CONF, 0); + ret = regmap_write(info->regmap, S5M_ALARM0_CONF, 0); break; case S5M8767X: for (i = 0; i < 7; i++) data[i] &= ~ALARM_ENABLE_MASK; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_raw_write(info->regmap, S5M_ALARM0_SEC, data, 8); if (ret < 0) return ret; @@ -381,7 +381,7 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) u8 alarm0_conf; struct rtc_time tm; - ret = regmap_bulk_read(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_bulk_read(info->regmap, S5M_ALARM0_SEC, data, 8); if (ret < 0) return ret; @@ -393,7 +393,7 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) switch (info->device_type) { case S5M8763X: alarm0_conf = 0x77; - ret = regmap_write(info->regmap, SEC_ALARM0_CONF, alarm0_conf); + ret = regmap_write(info->regmap, S5M_ALARM0_CONF, alarm0_conf); break; case S5M8767X: @@ -408,7 +408,7 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) if (data[RTC_YEAR1] & 0x7f) data[RTC_YEAR1] |= ALARM_ENABLE_MASK; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_raw_write(info->regmap, S5M_ALARM0_SEC, data, 8); if (ret < 0) return ret; ret = s5m8767_rtc_set_alarm_reg(info); @@ -450,7 +450,7 @@ static int s5m_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) if (ret < 0) return ret; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_raw_write(info->regmap, S5M_ALARM0_SEC, data, 8); if (ret < 0) return ret; @@ -495,7 +495,7 @@ static const struct rtc_class_ops s5m_rtc_ops = { static void s5m_rtc_enable_wtsr(struct s5m_rtc_info *info, bool enable) { int ret; - ret = regmap_update_bits(info->regmap, SEC_WTSR_SMPL_CNTL, + ret = 
regmap_update_bits(info->regmap, S5M_WTSR_SMPL_CNTL, WTSR_ENABLE_MASK, enable ? WTSR_ENABLE_MASK : 0); if (ret < 0) @@ -506,7 +506,7 @@ static void s5m_rtc_enable_wtsr(struct s5m_rtc_info *info, bool enable) static void s5m_rtc_enable_smpl(struct s5m_rtc_info *info, bool enable) { int ret; - ret = regmap_update_bits(info->regmap, SEC_WTSR_SMPL_CNTL, + ret = regmap_update_bits(info->regmap, S5M_WTSR_SMPL_CNTL, SMPL_ENABLE_MASK, enable ? SMPL_ENABLE_MASK : 0); if (ret < 0) @@ -521,7 +521,7 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info) int ret; struct rtc_time tm; - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &tp_read); + ret = regmap_read(info->regmap, S5M_RTC_UDR_CON, &tp_read); if (ret < 0) { dev_err(info->dev, "%s: fail to read control reg(%d)\n", __func__, ret); @@ -533,7 +533,7 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info) data[1] = (0 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); info->rtc_24hr_mode = 1; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_CONF, data, 2); + ret = regmap_raw_write(info->regmap, S5M_ALARM0_CONF, data, 2); if (ret < 0) { dev_err(info->dev, "%s: fail to write controlm reg(%d)\n", __func__, ret); @@ -555,7 +555,7 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info) ret = s5m_rtc_set_time(info->dev, &tm); } - ret = regmap_update_bits(info->regmap, SEC_RTC_UDR_CON, + ret = regmap_update_bits(info->regmap, S5M_RTC_UDR_CON, RTC_TCON_MASK, tp_read | RTC_TCON_MASK); if (ret < 0) dev_err(info->dev, "%s: fail to update TCON reg(%d)\n", @@ -676,7 +676,7 @@ static void s5m_rtc_shutdown(struct platform_device *pdev) if (info->wtsr_smpl) { for (i = 0; i < 3; i++) { s5m_rtc_enable_wtsr(info, false); - regmap_read(info->regmap, SEC_WTSR_SMPL_CNTL, &val); + regmap_read(info->regmap, S5M_WTSR_SMPL_CNTL, &val); pr_debug("%s: WTSR_SMPL reg(0x%02x)\n", __func__, val); if (val & WTSR_ENABLE_MASK) pr_emerg("%s: fail to disable WTSR\n", diff --git a/include/linux/mfd/samsung/rtc.h b/include/linux/mfd/samsung/rtc.h index 3e02b768d537..207fcfbde82e 100644 --- a/include/linux/mfd/samsung/rtc.h +++ b/include/linux/mfd/samsung/rtc.h @@ -18,38 +18,38 @@ #ifndef __LINUX_MFD_SEC_RTC_H #define __LINUX_MFD_SEC_RTC_H -enum sec_rtc_reg { - SEC_RTC_SEC, - SEC_RTC_MIN, - SEC_RTC_HOUR, - SEC_RTC_WEEKDAY, - SEC_RTC_DATE, - SEC_RTC_MONTH, - SEC_RTC_YEAR1, - SEC_RTC_YEAR2, - SEC_ALARM0_SEC, - SEC_ALARM0_MIN, - SEC_ALARM0_HOUR, - SEC_ALARM0_WEEKDAY, - SEC_ALARM0_DATE, - SEC_ALARM0_MONTH, - SEC_ALARM0_YEAR1, - SEC_ALARM0_YEAR2, - SEC_ALARM1_SEC, - SEC_ALARM1_MIN, - SEC_ALARM1_HOUR, - SEC_ALARM1_WEEKDAY, - SEC_ALARM1_DATE, - SEC_ALARM1_MONTH, - SEC_ALARM1_YEAR1, - SEC_ALARM1_YEAR2, - SEC_ALARM0_CONF, - SEC_ALARM1_CONF, - SEC_RTC_STATUS, - SEC_WTSR_SMPL_CNTL, - SEC_RTC_UDR_CON, +enum s5m_rtc_reg { + S5M_RTC_SEC, + S5M_RTC_MIN, + S5M_RTC_HOUR, + S5M_RTC_WEEKDAY, + S5M_RTC_DATE, + S5M_RTC_MONTH, + S5M_RTC_YEAR1, + S5M_RTC_YEAR2, + S5M_ALARM0_SEC, + S5M_ALARM0_MIN, + S5M_ALARM0_HOUR, + S5M_ALARM0_WEEKDAY, + S5M_ALARM0_DATE, + S5M_ALARM0_MONTH, + S5M_ALARM0_YEAR1, + S5M_ALARM0_YEAR2, + S5M_ALARM1_SEC, + S5M_ALARM1_MIN, + S5M_ALARM1_HOUR, + S5M_ALARM1_WEEKDAY, + S5M_ALARM1_DATE, + S5M_ALARM1_MONTH, + S5M_ALARM1_YEAR1, + S5M_ALARM1_YEAR2, + S5M_ALARM0_CONF, + S5M_ALARM1_CONF, + S5M_RTC_STATUS, + S5M_WTSR_SMPL_CNTL, + S5M_RTC_UDR_CON, - SEC_RTC_REG_MAX, + S5M_RTC_REG_MAX, }; enum s2mps_rtc_reg { @@ -88,9 +88,9 @@ enum s2mps_rtc_reg { #define HOUR_12 (1 << 7) #define HOUR_AMPM (1 << 6) #define HOUR_PM (1 << 5) -#define ALARM0_STATUS (1 << 1) -#define ALARM1_STATUS (1 << 
2) -#define UPDATE_AD (1 << 0) +#define S5M_ALARM0_STATUS (1 << 1) +#define S5M_ALARM1_STATUS (1 << 2) +#define S5M_UPDATE_AD (1 << 0) #define S2MPS_ALARM0_STATUS (1 << 2) #define S2MPS_ALARM1_STATUS (1 << 1) @@ -101,16 +101,16 @@ enum s2mps_rtc_reg { #define MODEL24_SHIFT 1 #define MODEL24_MASK (1 << MODEL24_SHIFT) /* RTC Update Register1 */ -#define RTC_UDR_SHIFT 0 -#define RTC_UDR_MASK (1 << RTC_UDR_SHIFT) +#define S5M_RTC_UDR_SHIFT 0 +#define S5M_RTC_UDR_MASK (1 << S5M_RTC_UDR_SHIFT) #define S2MPS_RTC_WUDR_SHIFT 4 #define S2MPS_RTC_WUDR_MASK (1 << S2MPS_RTC_WUDR_SHIFT) #define S2MPS_RTC_RUDR_SHIFT 0 #define S2MPS_RTC_RUDR_MASK (1 << S2MPS_RTC_RUDR_SHIFT) #define RTC_TCON_SHIFT 1 #define RTC_TCON_MASK (1 << RTC_TCON_SHIFT) -#define RTC_TIME_EN_SHIFT 3 -#define RTC_TIME_EN_MASK (1 << RTC_TIME_EN_SHIFT) +#define S5M_RTC_TIME_EN_SHIFT 3 +#define S5M_RTC_TIME_EN_MASK (1 << S5M_RTC_TIME_EN_SHIFT) /* RTC Hour register */ #define HOUR_PM_SHIFT 6 -- cgit v1.2.3-71-gd317 From 0c5f5d9af311013aabc519b68df19533d0d51cda Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 10 Jun 2014 15:18:43 -0700 Subject: rtc: s5m: use shorter time of register update Set the time needed for updating alarm and time registers to 0.45 ms. The default is 7.32 ms which is too long and leads to warnings when setting alarm or time: s5m-rtc: waiting for UDR update, reached max number of retries Signed-off-by: Krzysztof Kozlowski Cc: Kyungmin Park Cc: Lee Jones Cc: Alessandro Zummo Cc: Sangbeom Kim Cc: Samuel Ortiz Cc: Marek Szyprowski Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-s5m.c | 7 +++++++ include/linux/mfd/samsung/rtc.h | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c index 22137d4dbadf..3751ef90f93c 100644 --- a/drivers/rtc/rtc-s5m.c +++ b/drivers/rtc/rtc-s5m.c @@ -519,6 +519,13 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info) u8 data[2]; int ret; + /* UDR update time. Default of 7.32 ms is too long. */ + ret = regmap_update_bits(info->regmap, S5M_RTC_UDR_CON, + S5M_RTC_UDR_T_MASK, S5M_RTC_UDR_T_450_US); + if (ret < 0) + dev_err(info->dev, "%s: fail to change UDR time: %d\n", + __func__, ret); + /* Set RTC control register : Binary mode, 24hour mode */ data[0] = (1 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); data[1] = (0 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); diff --git a/include/linux/mfd/samsung/rtc.h b/include/linux/mfd/samsung/rtc.h index 207fcfbde82e..b6401e7661c7 100644 --- a/include/linux/mfd/samsung/rtc.h +++ b/include/linux/mfd/samsung/rtc.h @@ -111,6 +111,16 @@ enum s2mps_rtc_reg { #define RTC_TCON_MASK (1 << RTC_TCON_SHIFT) #define S5M_RTC_TIME_EN_SHIFT 3 #define S5M_RTC_TIME_EN_MASK (1 << S5M_RTC_TIME_EN_SHIFT) +/* + * UDR_T field in S5M_RTC_UDR_CON register determines the time needed + * for updating alarm and time registers. Default is 7.32 ms. 
+ */ +#define S5M_RTC_UDR_T_SHIFT 6 +#define S5M_RTC_UDR_T_MASK (0x3 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_7320_US (0x0 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_1830_US (0x1 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_3660_US (0x2 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_450_US (0x3 << S5M_RTC_UDR_T_SHIFT) /* RTC Hour register */ #define HOUR_PM_SHIFT 6 -- cgit v1.2.3-71-gd317 From 07f8ac4a1e26e8283542cdaf658a6e2a12fd6980 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 7 Jun 2014 18:26:28 +0200 Subject: bridge: add export of multicast database adjacent to net_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this new, exported function br_multicast_list_adjacent(net_dev) a list of IPv4/6 addresses is returned. This list contains all multicast addresses sensed by the bridge multicast snooping feature on all bridge ports of the bridge interface of net_dev, excluding addresses from the specified net_device itself. Adding bridge support to the batman-adv multicast optimization requires batman-adv knowing about the existence of bridged-in multicast listeners to be able to reliably serve them with multicast packets. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 18 +++++++++++++++ net/bridge/br_multicast.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 12 ---------- 3 files changed, 76 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 1085ffeef956..44d6eb0eb852 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -16,9 +16,27 @@ #include #include +struct br_ip { + union { + __be32 ip4; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr ip6; +#endif + } u; + __be16 proto; + __u16 vid; +}; + +struct br_ip_list { + struct list_head list; + struct br_ip addr; +}; + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); typedef int br_should_route_hook_t(struct sk_buff *skb); extern br_should_route_hook_t __rcu *br_should_route_hook; +int br_multicast_list_adjacent(struct net_device *dev, + struct list_head *br_ip_list); #endif diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b3f17c9b4d06..772476b7c4b7 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -2141,3 +2142,60 @@ unlock: return err; } + +/** + * br_multicast_list_adjacent - Returns snooped multicast addresses + * @dev: The bridge port adjacent to which to retrieve addresses + * @br_ip_list: The list to store found, snooped multicast IP addresses in + * + * Creates a list of IP addresses (struct br_ip_list) sensed by the multicast + * snooping feature on all bridge ports of dev's bridge device, excluding + * the addresses from dev itself. + * + * Returns the number of items added to br_ip_list. 
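 * (Editor's usage sketch, not part of the patch; the ownership rules it
 * follows are spelled out in the notes just below:
 *
 *	LIST_HEAD(mc_list);
 *	struct br_ip_list *entry, *tmp;
 *	int count = br_multicast_list_adjacent(dev, &mc_list);
 *
 *	list_for_each_entry_safe(entry, tmp, &mc_list, list) {
 *		use_addr(&entry->addr);
 *		list_del(&entry->list);
 *		kfree(entry);
 *	}
 *
 * use_addr() is a hypothetical consumer; count may include duplicates,
 * and every entry must be freed by the caller, as shown.)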
+ * + * Notes: + * - br_ip_list needs to be initialized by caller + * - br_ip_list might contain duplicates in the end + * (needs to be taken care of by caller) + * - br_ip_list needs to be freed by caller + */ +int br_multicast_list_adjacent(struct net_device *dev, + struct list_head *br_ip_list) +{ + struct net_bridge *br; + struct net_bridge_port *port; + struct net_bridge_port_group *group; + struct br_ip_list *entry; + int count = 0; + + rcu_read_lock(); + if (!br_ip_list || !br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + list_for_each_entry_rcu(port, &br->port_list, list) { + if (!port->dev || port->dev == dev) + continue; + + hlist_for_each_entry_rcu(group, &port->mglist, mglist) { + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + goto unlock; + + entry->addr = group->addr; + list_add(&entry->list, br_ip_list); + count++; + } + } + +unlock: + rcu_read_unlock(); + return count; +} +EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 97c5e46dde72..50e2ab021484 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -54,18 +54,6 @@ struct mac_addr unsigned char addr[ETH_ALEN]; }; -struct br_ip -{ - union { - __be32 ip4; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr ip6; -#endif - } u; - __be16 proto; - __u16 vid; -}; - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING /* our own querier */ struct bridge_mcast_own_query { -- cgit v1.2.3-71-gd317 From 2cd4143192e8c60f66cb32c3a30c76d0470a372d Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 7 Jun 2014 18:26:29 +0200 Subject: bridge: memorize and export selected IGMP/MLD querier port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding bridge support to the batman-adv multicast optimization requires batman-adv knowing about the existence of bridged-in IGMP/MLD queriers to be able to reliably serve any multicast listener behind this same bridge. Signed-off-by: Linus Lüssing Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 1 + net/bridge/br_multicast.c | 72 +++++++++++++++++++++++++++++++++++++++++++---- net/bridge/br_private.h | 1 + 3 files changed, 68 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 44d6eb0eb852..fd22789d7b2e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -38,5 +38,6 @@ typedef int br_should_route_hook_t(struct sk_buff *skb); extern br_should_route_hook_t __rcu *br_should_route_hook; int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); +bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); #endif diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 772476b7c4b7..cd3cf394c477 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1081,6 +1081,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, #endif static bool br_ip4_multicast_select_querier(struct net_bridge *br, + struct net_bridge_port *port, __be32 saddr) { if (!timer_pending(&br->ip4_own_query.timer) && @@ -1098,11 +1099,15 @@ static bool br_ip4_multicast_select_querier(struct net_bridge *br, update: br->ip4_querier.addr.u.ip4 = saddr; + /* update protected by general multicast_lock by caller */ + rcu_assign_pointer(br->ip4_querier.port, port); + return true; } #if IS_ENABLED(CONFIG_IPV6) static bool br_ip6_multicast_select_querier(struct net_bridge *br, + struct net_bridge_port *port, struct in6_addr *saddr) { if (!timer_pending(&br->ip6_own_query.timer) && @@ -1117,19 +1122,23 @@ static bool br_ip6_multicast_select_querier(struct net_bridge *br, update: br->ip6_querier.addr.u.ip6 = *saddr; + /* update protected by general multicast_lock by caller */ + rcu_assign_pointer(br->ip6_querier.port, port); + return true; } #endif static bool br_multicast_select_querier(struct net_bridge *br, + struct net_bridge_port *port, struct br_ip *saddr) { switch (saddr->proto) { case htons(ETH_P_IP): - return br_ip4_multicast_select_querier(br, saddr->u.ip4); + return br_ip4_multicast_select_querier(br, port, saddr->u.ip4); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return br_ip6_multicast_select_querier(br, &saddr->u.ip6); + return br_ip6_multicast_select_querier(br, port, &saddr->u.ip6); #endif } @@ -1201,7 +1210,7 @@ static void br_multicast_query_received(struct net_bridge *br, struct br_ip *saddr, unsigned long max_delay) { - if (!br_multicast_select_querier(br, saddr)) + if (!br_multicast_select_querier(br, port, saddr)) return; br_multicast_update_query_timer(br, query, max_delay); @@ -1804,12 +1813,14 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, } static void br_multicast_query_expired(struct net_bridge *br, - struct bridge_mcast_own_query *query) + struct bridge_mcast_own_query *query, + struct bridge_mcast_querier *querier) { spin_lock(&br->multicast_lock); if (query->startup_sent < br->multicast_startup_query_count) query->startup_sent++; + rcu_assign_pointer(querier, NULL); br_multicast_send_query(br, NULL, query); spin_unlock(&br->multicast_lock); } @@ -1818,7 +1829,7 @@ static void br_ip4_multicast_query_expired(unsigned long data) { struct net_bridge *br = (void *)data; - br_multicast_query_expired(br, &br->ip4_own_query); + br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) @@ -1826,7 +1837,7 @@ static void br_ip6_multicast_query_expired(unsigned long data) { struct net_bridge *br = (void 
*)data; - br_multicast_query_expired(br, &br->ip6_own_query); + br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier); } #endif @@ -1849,8 +1860,10 @@ void br_multicast_init(struct net_bridge *br) br->multicast_membership_interval = 260 * HZ; br->ip4_other_query.delay_time = 0; + br->ip4_querier.port = NULL; #if IS_ENABLED(CONFIG_IPV6) br->ip6_other_query.delay_time = 0; + br->ip6_querier.port = NULL; #endif spin_lock_init(&br->multicast_lock); @@ -2199,3 +2212,50 @@ unlock: return count; } EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); + +/** + * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port + * @dev: The bridge port adjacent to which to check for a querier + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a selected querier is behind one of the other ports of this + * bridge. Otherwise returns false. + */ +bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) +{ + struct net_bridge *br; + struct net_bridge_port *port; + bool ret = false; + + rcu_read_lock(); + if (!br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + switch (proto) { + case ETH_P_IP: + if (!timer_pending(&br->ip4_other_query.timer) || + rcu_dereference(br->ip4_querier.port) == port) + goto unlock; + break; + case ETH_P_IPV6: + if (!timer_pending(&br->ip6_other_query.timer) || + rcu_dereference(br->ip6_querier.port) == port) + goto unlock; + break; + default: + goto unlock; + } + + ret = true; +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 50e2ab021484..8346e9504cdb 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -70,6 +70,7 @@ struct bridge_mcast_other_query { /* selected querier */ struct bridge_mcast_querier { struct br_ip addr; + struct net_bridge_port __rcu *port; }; #endif -- cgit v1.2.3-71-gd317 From e430f34ee5192c84bcabd3c79ab7e2388b5eec74 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 6 Jun 2014 14:46:06 -0700 Subject: net: filter: cleanup A/X name usage The macro 'A' used in internal BPF interpreter: #define A regs[insn->a_reg] was easily confused with the name of classic BPF register 'A', since 'A' would mean two different things depending on context. 
This patch is trying to clean up the naming and clarify its usage in the following way: - A and X are names of two classic BPF registers - BPF_REG_A denotes internal BPF register R0 used to map classic register A in internal BPF programs generated from classic - BPF_REG_X denotes internal BPF register R7 used to map classic register X in internal BPF programs generated from classic - internal BPF instruction format: struct sock_filter_int { __u8 code; /* opcode */ __u8 dst_reg:4; /* dest register */ __u8 src_reg:4; /* source register */ __s16 off; /* signed offset */ __s32 imm; /* signed immediate constant */ }; - BPF_X/BPF_K is 1 bit used to encode source operand of instruction In classic: BPF_X - means use register X as source operand BPF_K - means use 32-bit immediate as source operand In internal: BPF_X - means use 'src_reg' register as source operand BPF_K - means use 32-bit immediate as source operand Suggested-by: Chema Gonzalez Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Chema Gonzalez Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 2 +- arch/x86/net/bpf_jit_comp.c | 260 ++++++++++++++++++------------------ include/linux/filter.h | 156 ++++++++++++---------- net/core/filter.c | 198 +++++++++++++-------------- 4 files changed, 314 insertions(+), 302 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 58c443926647..9f49b8690500 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -805,7 +805,7 @@ to seccomp_data, for converted BPF filters R1 points to a skb. A program, that is translated internally consists of the following elements: - op:16, jt:8, jf:8, k:32 ==> op:8, a_reg:4, x_reg:4, off:16, imm:32 + op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 So far 87 internal BPF instructions were implemented. 8-bit 'op' opcode field has room for new instructions. Some of them may use 16/24/32 byte encoding. 
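/*
 * Editor's sketch, not from the patch: one internal-BPF instruction
 * written with the new field names. This encodes the internal form of
 * classic "A += X", using the BPF_REG_A/BPF_REG_X aliases (R0/R7) that
 * the commit message describes.
 */
struct sock_filter_int insn = {
	.code	 = BPF_ALU64 | BPF_ADD | BPF_X,	/* 64-bit add, register source */
	.dst_reg = BPF_REG_A,			/* R0, maps classic register A */
	.src_reg = BPF_REG_X,			/* R7, maps classic register X */
	.off	 = 0,
	.imm	 = 0,				/* ignored when source is BPF_X */
};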
New diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 080f3f071bb0..99bef86ed6df 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -64,10 +64,10 @@ static inline bool is_simm32(s64 value) return value == (s64) (s32) value; } -/* mov A, X */ -#define EMIT_mov(A, X) \ - do {if (A != X) \ - EMIT3(add_2mod(0x48, A, X), 0x89, add_2reg(0xC0, A, X)); \ +/* mov dst, src */ +#define EMIT_mov(DST, SRC) \ + do {if (DST != SRC) \ + EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \ } while (0) static int bpf_size_to_x86_bytes(int bpf_size) @@ -194,16 +194,16 @@ static inline u8 add_2mod(u8 byte, u32 r1, u32 r2) return byte; } -/* encode dest register 'a_reg' into x64 opcode 'byte' */ -static inline u8 add_1reg(u8 byte, u32 a_reg) +/* encode 'dst_reg' register into x64 opcode 'byte' */ +static inline u8 add_1reg(u8 byte, u32 dst_reg) { - return byte + reg2hex[a_reg]; + return byte + reg2hex[dst_reg]; } -/* encode dest 'a_reg' and src 'x_reg' registers into x64 opcode 'byte' */ -static inline u8 add_2reg(u8 byte, u32 a_reg, u32 x_reg) +/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */ +static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) { - return byte + reg2hex[a_reg] + (reg2hex[x_reg] << 3); + return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } struct jit_context { @@ -286,9 +286,9 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, } for (i = 0; i < insn_cnt; i++, insn++) { - const s32 K = insn->imm; - u32 a_reg = insn->a_reg; - u32 x_reg = insn->x_reg; + const s32 imm32 = insn->imm; + u32 dst_reg = insn->dst_reg; + u32 src_reg = insn->src_reg; u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; @@ -315,32 +315,32 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, case BPF_XOR: b2 = 0x31; break; } if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_2mod(0x48, a_reg, x_reg)); - else if (is_ereg(a_reg) || is_ereg(x_reg)) - EMIT1(add_2mod(0x40, a_reg, x_reg)); - EMIT2(b2, add_2reg(0xC0, a_reg, x_reg)); + EMIT1(add_2mod(0x48, dst_reg, src_reg)); + else if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; - /* mov A, X */ + /* mov dst, src */ case BPF_ALU64 | BPF_MOV | BPF_X: - EMIT_mov(a_reg, x_reg); + EMIT_mov(dst_reg, src_reg); break; - /* mov32 A, X */ + /* mov32 dst, src */ case BPF_ALU | BPF_MOV | BPF_X: - if (is_ereg(a_reg) || is_ereg(x_reg)) - EMIT1(add_2mod(0x40, a_reg, x_reg)); - EMIT2(0x89, add_2reg(0xC0, a_reg, x_reg)); + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); break; - /* neg A */ + /* neg dst */ case BPF_ALU | BPF_NEG: case BPF_ALU64 | BPF_NEG: if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_1mod(0x48, a_reg)); - else if (is_ereg(a_reg)) - EMIT1(add_1mod(0x40, a_reg)); - EMIT2(0xF7, add_1reg(0xD8, a_reg)); + EMIT1(add_1mod(0x48, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + EMIT2(0xF7, add_1reg(0xD8, dst_reg)); break; case BPF_ALU | BPF_ADD | BPF_K: @@ -354,9 +354,9 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_OR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_1mod(0x48, a_reg)); - else if (is_ereg(a_reg)) - EMIT1(add_1mod(0x40, a_reg)); + EMIT1(add_1mod(0x48, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); 
switch (BPF_OP(insn->code)) { case BPF_ADD: b3 = 0xC0; break; @@ -366,10 +366,10 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, case BPF_XOR: b3 = 0xF0; break; } - if (is_imm8(K)) - EMIT3(0x83, add_1reg(b3, a_reg), K); + if (is_imm8(imm32)) + EMIT3(0x83, add_1reg(b3, dst_reg), imm32); else - EMIT2_off32(0x81, add_1reg(b3, a_reg), K); + EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32); break; case BPF_ALU64 | BPF_MOV | BPF_K: @@ -377,23 +377,23 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, * use 'mov eax, imm32' (which zero-extends imm32) * to save 2 bytes */ - if (K < 0) { + if (imm32 < 0) { /* 'mov rax, imm32' sign extends imm32 */ - b1 = add_1mod(0x48, a_reg); + b1 = add_1mod(0x48, dst_reg); b2 = 0xC7; b3 = 0xC0; - EMIT3_off32(b1, b2, add_1reg(b3, a_reg), K); + EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); break; } case BPF_ALU | BPF_MOV | BPF_K: /* mov %eax, imm32 */ - if (is_ereg(a_reg)) - EMIT1(add_1mod(0x40, a_reg)); - EMIT1_off32(add_1reg(0xB8, a_reg), K); + if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); break; - /* A %= X, A /= X, A %= K, A /= K */ + /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */ case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: @@ -406,14 +406,14 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT1(0x52); /* push rdx */ if (BPF_SRC(insn->code) == BPF_X) - /* mov r11, X */ - EMIT_mov(AUX_REG, x_reg); + /* mov r11, src_reg */ + EMIT_mov(AUX_REG, src_reg); else - /* mov r11, K */ - EMIT3_off32(0x49, 0xC7, 0xC3, K); + /* mov r11, imm32 */ + EMIT3_off32(0x49, 0xC7, 0xC3, imm32); - /* mov rax, A */ - EMIT_mov(BPF_REG_0, a_reg); + /* mov rax, dst_reg */ + EMIT_mov(BPF_REG_0, dst_reg); /* xor edx, edx * equivalent to 'xor rdx, rdx', but one byte less @@ -421,7 +421,7 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT2(0x31, 0xd2); if (BPF_SRC(insn->code) == BPF_X) { - /* if (X == 0) return 0 */ + /* if (src_reg == 0) return 0 */ /* cmp r11, 0 */ EMIT4(0x49, 0x83, 0xFB, 0x00); @@ -457,8 +457,8 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT1(0x5A); /* pop rdx */ EMIT1(0x58); /* pop rax */ - /* mov A, r11 */ - EMIT_mov(a_reg, AUX_REG); + /* mov dst_reg, r11 */ + EMIT_mov(dst_reg, AUX_REG); break; case BPF_ALU | BPF_MUL | BPF_K: @@ -468,15 +468,15 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT1(0x50); /* push rax */ EMIT1(0x52); /* push rdx */ - /* mov r11, A */ - EMIT_mov(AUX_REG, a_reg); + /* mov r11, dst_reg */ + EMIT_mov(AUX_REG, dst_reg); if (BPF_SRC(insn->code) == BPF_X) - /* mov rax, X */ - EMIT_mov(BPF_REG_0, x_reg); + /* mov rax, src_reg */ + EMIT_mov(BPF_REG_0, src_reg); else - /* mov rax, K */ - EMIT3_off32(0x48, 0xC7, 0xC0, K); + /* mov rax, imm32 */ + EMIT3_off32(0x48, 0xC7, 0xC0, imm32); if (BPF_CLASS(insn->code) == BPF_ALU64) EMIT1(add_1mod(0x48, AUX_REG)); @@ -491,8 +491,8 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT1(0x5A); /* pop rdx */ EMIT1(0x58); /* pop rax */ - /* mov A, r11 */ - EMIT_mov(a_reg, AUX_REG); + /* mov dst_reg, r11 */ + EMIT_mov(dst_reg, AUX_REG); break; /* shifts */ @@ -503,39 +503,39 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_RSH | BPF_K: case BPF_ALU64 | BPF_ARSH | BPF_K: if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_1mod(0x48, a_reg)); - else if (is_ereg(a_reg)) - 
EMIT1(add_1mod(0x40, a_reg)); + EMIT1(add_1mod(0x48, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); switch (BPF_OP(insn->code)) { case BPF_LSH: b3 = 0xE0; break; case BPF_RSH: b3 = 0xE8; break; case BPF_ARSH: b3 = 0xF8; break; } - EMIT3(0xC1, add_1reg(b3, a_reg), K); + EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); break; case BPF_ALU | BPF_END | BPF_FROM_BE: - switch (K) { + switch (imm32) { case 16: /* emit 'ror %ax, 8' to swap lower 2 bytes */ EMIT1(0x66); - if (is_ereg(a_reg)) + if (is_ereg(dst_reg)) EMIT1(0x41); - EMIT3(0xC1, add_1reg(0xC8, a_reg), 8); + EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8); break; case 32: /* emit 'bswap eax' to swap lower 4 bytes */ - if (is_ereg(a_reg)) + if (is_ereg(dst_reg)) EMIT2(0x41, 0x0F); else EMIT1(0x0F); - EMIT1(add_1reg(0xC8, a_reg)); + EMIT1(add_1reg(0xC8, dst_reg)); break; case 64: /* emit 'bswap rax' to swap 8 bytes */ - EMIT3(add_1mod(0x48, a_reg), 0x0F, - add_1reg(0xC8, a_reg)); + EMIT3(add_1mod(0x48, dst_reg), 0x0F, + add_1reg(0xC8, dst_reg)); break; } break; @@ -543,117 +543,117 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_END | BPF_FROM_LE: break; - /* ST: *(u8*)(a_reg + off) = imm */ + /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: - if (is_ereg(a_reg)) + if (is_ereg(dst_reg)) EMIT2(0x41, 0xC6); else EMIT1(0xC6); goto st; case BPF_ST | BPF_MEM | BPF_H: - if (is_ereg(a_reg)) + if (is_ereg(dst_reg)) EMIT3(0x66, 0x41, 0xC7); else EMIT2(0x66, 0xC7); goto st; case BPF_ST | BPF_MEM | BPF_W: - if (is_ereg(a_reg)) + if (is_ereg(dst_reg)) EMIT2(0x41, 0xC7); else EMIT1(0xC7); goto st; case BPF_ST | BPF_MEM | BPF_DW: - EMIT2(add_1mod(0x48, a_reg), 0xC7); + EMIT2(add_1mod(0x48, dst_reg), 0xC7); st: if (is_imm8(insn->off)) - EMIT2(add_1reg(0x40, a_reg), insn->off); + EMIT2(add_1reg(0x40, dst_reg), insn->off); else - EMIT1_off32(add_1reg(0x80, a_reg), insn->off); + EMIT1_off32(add_1reg(0x80, dst_reg), insn->off); - EMIT(K, bpf_size_to_x86_bytes(BPF_SIZE(insn->code))); + EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code))); break; - /* STX: *(u8*)(a_reg + off) = x_reg */ + /* STX: *(u8*)(dst_reg + off) = src_reg */ case BPF_STX | BPF_MEM | BPF_B: /* emit 'mov byte ptr [rax + off], al' */ - if (is_ereg(a_reg) || is_ereg(x_reg) || + if (is_ereg(dst_reg) || is_ereg(src_reg) || /* have to add extra byte for x86 SIL, DIL regs */ - x_reg == BPF_REG_1 || x_reg == BPF_REG_2) - EMIT2(add_2mod(0x40, a_reg, x_reg), 0x88); + src_reg == BPF_REG_1 || src_reg == BPF_REG_2) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); else EMIT1(0x88); goto stx; case BPF_STX | BPF_MEM | BPF_H: - if (is_ereg(a_reg) || is_ereg(x_reg)) - EMIT3(0x66, add_2mod(0x40, a_reg, x_reg), 0x89); + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89); else EMIT2(0x66, 0x89); goto stx; case BPF_STX | BPF_MEM | BPF_W: - if (is_ereg(a_reg) || is_ereg(x_reg)) - EMIT2(add_2mod(0x40, a_reg, x_reg), 0x89); + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89); else EMIT1(0x89); goto stx; case BPF_STX | BPF_MEM | BPF_DW: - EMIT2(add_2mod(0x48, a_reg, x_reg), 0x89); + EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); stx: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off); + EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); else - EMIT1_off32(add_2reg(0x80, a_reg, x_reg), + EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), insn->off); break; - /* LDX: a_reg = *(u8*)(x_reg + off) */ + /* LDX: dst_reg = *(u8*)(src_reg + 
off) */ case BPF_LDX | BPF_MEM | BPF_B: /* emit 'movzx rax, byte ptr [rax + off]' */ - EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB6); + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); goto ldx; case BPF_LDX | BPF_MEM | BPF_H: /* emit 'movzx rax, word ptr [rax + off]' */ - EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB7); + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); goto ldx; case BPF_LDX | BPF_MEM | BPF_W: /* emit 'mov eax, dword ptr [rax+0x14]' */ - if (is_ereg(a_reg) || is_ereg(x_reg)) - EMIT2(add_2mod(0x40, x_reg, a_reg), 0x8B); + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); else EMIT1(0x8B); goto ldx; case BPF_LDX | BPF_MEM | BPF_DW: /* emit 'mov rax, qword ptr [rax+0x14]' */ - EMIT2(add_2mod(0x48, x_reg, a_reg), 0x8B); + EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); ldx: /* if insn->off == 0 we can save one extra byte, but * special case of x86 r13 which always needs an offset * is not worth the hassle */ if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, x_reg, a_reg), insn->off); + EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off); else - EMIT1_off32(add_2reg(0x80, x_reg, a_reg), + EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), insn->off); break; - /* STX XADD: lock *(u32*)(a_reg + off) += x_reg */ + /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ case BPF_STX | BPF_XADD | BPF_W: /* emit 'lock add dword ptr [rax + off], eax' */ - if (is_ereg(a_reg) || is_ereg(x_reg)) - EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x01); + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); else EMIT2(0xF0, 0x01); goto xadd; case BPF_STX | BPF_XADD | BPF_DW: - EMIT3(0xF0, add_2mod(0x48, a_reg, x_reg), 0x01); + EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); xadd: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off); + EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); else - EMIT1_off32(add_2reg(0x80, a_reg, x_reg), + EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), insn->off); break; /* call */ case BPF_JMP | BPF_CALL: - func = (u8 *) __bpf_call_base + K; + func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (ctx->seen_ld_abs) { EMIT2(0x41, 0x52); /* push %r10 */ @@ -663,9 +663,9 @@ xadd: if (is_imm8(insn->off)) */ jmp_offset += 4; } - if (!K || !is_simm32(jmp_offset)) { + if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", - K, func, image); + imm32, func, image); return -EINVAL; } EMIT1_off32(0xE8, jmp_offset); @@ -682,21 +682,21 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: - /* cmp a_reg, x_reg */ - EMIT3(add_2mod(0x48, a_reg, x_reg), 0x39, - add_2reg(0xC0, a_reg, x_reg)); + /* cmp dst_reg, src_reg */ + EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39, + add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_X: - /* test a_reg, x_reg */ - EMIT3(add_2mod(0x48, a_reg, x_reg), 0x85, - add_2reg(0xC0, a_reg, x_reg)); + /* test dst_reg, src_reg */ + EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x85, + add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_K: - /* test a_reg, imm32 */ - EMIT1(add_1mod(0x48, a_reg)); - EMIT2_off32(0xF7, add_1reg(0xC0, a_reg), K); + /* test dst_reg, imm32 */ + EMIT1(add_1mod(0x48, dst_reg)); + EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32); goto emit_cond_jmp; case BPF_JMP | BPF_JEQ | BPF_K: @@ -705,13 +705,13 @@ xadd: if 
(is_imm8(insn->off)) case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: - /* cmp a_reg, imm8/32 */ - EMIT1(add_1mod(0x48, a_reg)); + /* cmp dst_reg, imm8/32 */ + EMIT1(add_1mod(0x48, dst_reg)); - if (is_imm8(K)) - EMIT3(0x83, add_1reg(0xF8, a_reg), K); + if (is_imm8(imm32)) + EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32); else - EMIT2_off32(0x81, add_1reg(0xF8, a_reg), K); + EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32); emit_cond_jmp: /* convert BPF opcode to x86 */ switch (BPF_OP(insn->code)) { @@ -773,27 +773,27 @@ emit_jmp: func = sk_load_word; goto common_load; case BPF_LD | BPF_ABS | BPF_W: - func = CHOOSE_LOAD_FUNC(K, sk_load_word); + func = CHOOSE_LOAD_FUNC(imm32, sk_load_word); common_load: ctx->seen_ld_abs = true; jmp_offset = func - (image + addrs[i]); if (!func || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", - K, func, image); + imm32, func, image); return -EINVAL; } if (BPF_MODE(insn->code) == BPF_ABS) { /* mov %esi, imm32 */ - EMIT1_off32(0xBE, K); + EMIT1_off32(0xBE, imm32); } else { - /* mov %rsi, x_reg */ - EMIT_mov(BPF_REG_2, x_reg); - if (K) { - if (is_imm8(K)) + /* mov %rsi, src_reg */ + EMIT_mov(BPF_REG_2, src_reg); + if (imm32) { + if (is_imm8(imm32)) /* add %esi, imm8 */ - EMIT3(0x83, 0xC6, K); + EMIT3(0x83, 0xC6, imm32); else /* add %esi, imm32 */ - EMIT2_off32(0x81, 0xC6, K); + EMIT2_off32(0x81, 0xC6, imm32); } } /* skb pointer is in R6 (%rbx), it will be copied into @@ -808,13 +808,13 @@ common_load: ctx->seen_ld_abs = true; func = sk_load_half; goto common_load; case BPF_LD | BPF_ABS | BPF_H: - func = CHOOSE_LOAD_FUNC(K, sk_load_half); + func = CHOOSE_LOAD_FUNC(imm32, sk_load_half); goto common_load; case BPF_LD | BPF_IND | BPF_B: func = sk_load_byte; goto common_load; case BPF_LD | BPF_ABS | BPF_B: - func = CHOOSE_LOAD_FUNC(K, sk_load_byte); + func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte); goto common_load; case BPF_JMP | BPF_EXIT: diff --git a/include/linux/filter.h b/include/linux/filter.h index f0c2ad43b4af..a7e3c48d73a7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -78,161 +78,173 @@ enum { /* Helper macros for filter block array initializers. 
*/ -/* ALU ops on registers, bpf_add|sub|...: A += X */ +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ -#define BPF_ALU64_REG(OP, A, X) \ +#define BPF_ALU64_REG(OP, DST, SRC) \ ((struct sock_filter_int) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = 0, \ .imm = 0 }) -#define BPF_ALU32_REG(OP, A, X) \ +#define BPF_ALU32_REG(OP, DST, SRC) \ ((struct sock_filter_int) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = 0, \ .imm = 0 }) -/* ALU ops on immediates, bpf_add|sub|...: A += IMM */ +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ -#define BPF_ALU64_IMM(OP, A, IMM) \ +#define BPF_ALU64_IMM(OP, DST, IMM) \ ((struct sock_filter_int) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ - .a_reg = A, \ - .x_reg = 0, \ + .dst_reg = DST, \ + .src_reg = 0, \ .off = 0, \ .imm = IMM }) -#define BPF_ALU32_IMM(OP, A, IMM) \ +#define BPF_ALU32_IMM(OP, DST, IMM) \ ((struct sock_filter_int) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ - .a_reg = A, \ - .x_reg = 0, \ + .dst_reg = DST, \ + .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ -#define BPF_ENDIAN(TYPE, A, LEN) \ +#define BPF_ENDIAN(TYPE, DST, LEN) \ ((struct sock_filter_int) { \ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ - .a_reg = A, \ - .x_reg = 0, \ + .dst_reg = DST, \ + .src_reg = 0, \ .off = 0, \ .imm = LEN }) -/* Short form of mov, A = X */ +/* Short form of mov, dst_reg = src_reg */ -#define BPF_MOV64_REG(A, X) \ +#define BPF_MOV64_REG(DST, SRC) \ ((struct sock_filter_int) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = 0, \ .imm = 0 }) -#define BPF_MOV32_REG(A, X) \ +#define BPF_MOV32_REG(DST, SRC) \ ((struct sock_filter_int) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = 0, \ .imm = 0 }) -/* Short form of mov, A = IMM */ +/* Short form of mov, dst_reg = imm32 */ -#define BPF_MOV64_IMM(A, IMM) \ +#define BPF_MOV64_IMM(DST, IMM) \ ((struct sock_filter_int) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ - .a_reg = A, \ - .x_reg = 0, \ + .dst_reg = DST, \ + .src_reg = 0, \ .off = 0, \ .imm = IMM }) -#define BPF_MOV32_IMM(A, IMM) \ +#define BPF_MOV32_IMM(DST, IMM) \ ((struct sock_filter_int) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ - .a_reg = A, \ - .x_reg = 0, \ + .dst_reg = DST, \ + .src_reg = 0, \ .off = 0, \ .imm = IMM }) -/* Short form of mov based on type, BPF_X: A = X, BPF_K: A = IMM */ +/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ -#define BPF_MOV64_RAW(TYPE, A, X, IMM) \ +#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ ((struct sock_filter_int) { \ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = 0, \ .imm = IMM }) -#define BPF_MOV32_RAW(TYPE, A, X, IMM) \ +#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \ ((struct sock_filter_int) { \ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = 0, \ .imm = IMM }) -/* Direct packet access, R0 = *(uint *) (skb->data + OFF) */ +/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ -#define BPF_LD_ABS(SIZE, OFF) \ +#define BPF_LD_ABS(SIZE, IMM) \ ((struct sock_filter_int) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ - .a_reg = 0, 
\ - .x_reg = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ .off = 0, \ - .imm = OFF }) + .imm = IMM }) -/* Indirect packet access, R0 = *(uint *) (skb->data + X + OFF) */ +/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */ -#define BPF_LD_IND(SIZE, X, OFF) \ +#define BPF_LD_IND(SIZE, SRC, IMM) \ ((struct sock_filter_int) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ - .a_reg = 0, \ - .x_reg = X, \ + .dst_reg = 0, \ + .src_reg = SRC, \ .off = 0, \ - .imm = OFF }) + .imm = IMM }) -/* Memory store, A = *(uint *) (X + OFF), and vice versa */ +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ -#define BPF_LDX_MEM(SIZE, A, X, OFF) \ +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct sock_filter_int) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) -#define BPF_STX_MEM(SIZE, A, X, OFF) \ +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct sock_filter_int) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) -/* Conditional jumps against registers, if (A 'op' X) goto pc + OFF */ +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct sock_filter_int) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ -#define BPF_JMP_REG(OP, A, X, OFF) \ +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ ((struct sock_filter_int) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) -/* Conditional jumps against immediates, if (A 'op' IMM) goto pc + OFF */ +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ -#define BPF_JMP_IMM(OP, A, IMM, OFF) \ +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct sock_filter_int) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ - .a_reg = A, \ - .x_reg = 0, \ + .dst_reg = DST, \ + .src_reg = 0, \ .off = OFF, \ .imm = IMM }) @@ -241,18 +253,18 @@ enum { #define BPF_EMIT_CALL(FUNC) \ ((struct sock_filter_int) { \ .code = BPF_JMP | BPF_CALL, \ - .a_reg = 0, \ - .x_reg = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ .off = 0, \ .imm = ((FUNC) - __bpf_call_base) }) /* Raw code statement block */ -#define BPF_RAW_INSN(CODE, A, X, OFF, IMM) \ +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct sock_filter_int) { \ .code = CODE, \ - .a_reg = A, \ - .x_reg = X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) @@ -261,8 +273,8 @@ enum { #define BPF_EXIT_INSN() \ ((struct sock_filter_int) { \ .code = BPF_JMP | BPF_EXIT, \ - .a_reg = 0, \ - .x_reg = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ .off = 0, \ .imm = 0 }) @@ -287,8 +299,8 @@ enum { struct sock_filter_int { __u8 code; /* opcode */ - __u8 a_reg:4; /* dest register */ - __u8 x_reg:4; /* source register */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ __s16 off; /* signed offset */ __s32 imm; /* signed immediate constant */ }; diff --git a/net/core/filter.c b/net/core/filter.c index 6bd2e350e751..b3f21751b238 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -59,12 +59,12 @@ #define BPF_R10 regs[BPF_REG_10] /* Named registers */ -#define A regs[insn->a_reg] -#define X 
regs[insn->x_reg] +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] -#define K insn->imm +#define IMM insn->imm /* No hurry in this branch * @@ -264,7 +264,7 @@ static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *ins FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Register for user BPF programs need to be reset first. */ + /* Registers used in classic BPF programs need to be reset first. */ regs[BPF_REG_A] = 0; regs[BPF_REG_X] = 0; @@ -274,16 +274,16 @@ select_insn: /* ALU */ #define ALU(OPCODE, OP) \ ALU64_##OPCODE##_X: \ - A = A OP X; \ + DST = DST OP SRC; \ CONT; \ ALU_##OPCODE##_X: \ - A = (u32) A OP (u32) X; \ + DST = (u32) DST OP (u32) SRC; \ CONT; \ ALU64_##OPCODE##_K: \ - A = A OP K; \ + DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ - A = (u32) A OP (u32) K; \ + DST = (u32) DST OP (u32) IMM; \ CONT; ALU(ADD, +) @@ -296,92 +296,92 @@ select_insn: ALU(MUL, *) #undef ALU ALU_NEG: - A = (u32) -A; + DST = (u32) -DST; CONT; ALU64_NEG: - A = -A; + DST = -DST; CONT; ALU_MOV_X: - A = (u32) X; + DST = (u32) SRC; CONT; ALU_MOV_K: - A = (u32) K; + DST = (u32) IMM; CONT; ALU64_MOV_X: - A = X; + DST = SRC; CONT; ALU64_MOV_K: - A = K; + DST = IMM; CONT; ALU64_ARSH_X: - (*(s64 *) &A) >>= X; + (*(s64 *) &DST) >>= SRC; CONT; ALU64_ARSH_K: - (*(s64 *) &A) >>= K; + (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(X == 0)) + if (unlikely(SRC == 0)) return 0; - tmp = A; - A = do_div(tmp, X); + tmp = DST; + DST = do_div(tmp, SRC); CONT; ALU_MOD_X: - if (unlikely(X == 0)) + if (unlikely(SRC == 0)) return 0; - tmp = (u32) A; - A = do_div(tmp, (u32) X); + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); CONT; ALU64_MOD_K: - tmp = A; - A = do_div(tmp, K); + tmp = DST; + DST = do_div(tmp, IMM); CONT; ALU_MOD_K: - tmp = (u32) A; - A = do_div(tmp, (u32) K); + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(X == 0)) + if (unlikely(SRC == 0)) return 0; - do_div(A, X); + do_div(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(X == 0)) + if (unlikely(SRC == 0)) return 0; - tmp = (u32) A; - do_div(tmp, (u32) X); - A = (u32) tmp; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; CONT; ALU64_DIV_K: - do_div(A, K); + do_div(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) A; - do_div(tmp, (u32) K); - A = (u32) tmp; + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST = (u32) tmp; CONT; ALU_END_TO_BE: - switch (K) { + switch (IMM) { case 16: - A = (__force u16) cpu_to_be16(A); + DST = (__force u16) cpu_to_be16(DST); break; case 32: - A = (__force u32) cpu_to_be32(A); + DST = (__force u32) cpu_to_be32(DST); break; case 64: - A = (__force u64) cpu_to_be64(A); + DST = (__force u64) cpu_to_be64(DST); break; } CONT; ALU_END_TO_LE: - switch (K) { + switch (IMM) { case 16: - A = (__force u16) cpu_to_le16(A); + DST = (__force u16) cpu_to_le16(DST); break; case 32: - A = (__force u32) cpu_to_le32(A); + DST = (__force u32) cpu_to_le32(DST); break; case 64: - A = (__force u64) cpu_to_le64(A); + DST = (__force u64) cpu_to_le64(DST); break; } CONT; @@ -401,85 +401,85 @@ select_insn: insn += insn->off; CONT; JMP_JEQ_X: - if (A == X) { + if (DST == SRC) { insn += insn->off; CONT_JMP; } CONT; JMP_JEQ_K: - if (A == K) { + if (DST == IMM) { insn += insn->off; CONT_JMP; } CONT; JMP_JNE_X: - if (A != X) { + if (DST != SRC) { insn += insn->off; CONT_JMP; } CONT; JMP_JNE_K: - if (A != K) { + if (DST != IMM) { insn += 
insn->off; CONT_JMP; } CONT; JMP_JGT_X: - if (A > X) { + if (DST > SRC) { insn += insn->off; CONT_JMP; } CONT; JMP_JGT_K: - if (A > K) { + if (DST > IMM) { insn += insn->off; CONT_JMP; } CONT; JMP_JGE_X: - if (A >= X) { + if (DST >= SRC) { insn += insn->off; CONT_JMP; } CONT; JMP_JGE_K: - if (A >= K) { + if (DST >= IMM) { insn += insn->off; CONT_JMP; } CONT; JMP_JSGT_X: - if (((s64) A) > ((s64) X)) { + if (((s64) DST) > ((s64) SRC)) { insn += insn->off; CONT_JMP; } CONT; JMP_JSGT_K: - if (((s64) A) > ((s64) K)) { + if (((s64) DST) > ((s64) IMM)) { insn += insn->off; CONT_JMP; } CONT; JMP_JSGE_X: - if (((s64) A) >= ((s64) X)) { + if (((s64) DST) >= ((s64) SRC)) { insn += insn->off; CONT_JMP; } CONT; JMP_JSGE_K: - if (((s64) A) >= ((s64) K)) { + if (((s64) DST) >= ((s64) IMM)) { insn += insn->off; CONT_JMP; } CONT; JMP_JSET_X: - if (A & X) { + if (DST & SRC) { insn += insn->off; CONT_JMP; } CONT; JMP_JSET_K: - if (A & K) { + if (DST & IMM) { insn += insn->off; CONT_JMP; } @@ -488,15 +488,15 @@ select_insn: return BPF_R0; /* STX and ST and LDX*/ -#define LDST(SIZEOP, SIZE) \ - STX_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (A + insn->off) = X; \ - CONT; \ - ST_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (A + insn->off) = K; \ - CONT; \ - LDX_MEM_##SIZEOP: \ - A = *(SIZE *)(unsigned long) (X + insn->off); \ +#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; LDST(B, u8) @@ -504,16 +504,16 @@ select_insn: LDST(W, u32) LDST(DW, u64) #undef LDST - STX_XADD_W: /* lock xadd *(u32 *)(A + insn->off) += X */ - atomic_add((u32) X, (atomic_t *)(unsigned long) - (A + insn->off)); + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(A + insn->off) += X */ - atomic64_add((u64) X, (atomic64_t *)(unsigned long) - (A + insn->off)); + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + K)) */ - off = K; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; load_word: /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are * only appearing in the programs where ctx == @@ -527,51 +527,51 @@ load_word: * BPF_R6-BPF_R9, and store return value into BPF_R0. 
* * Implicit input: - * ctx + * ctx == skb == BPF_R6 == CTX * * Explicit input: - * X == any register - * K == 32-bit immediate + * SRC == any register + * IMM == 32-bit immediate * * Output: * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness */ - ptr = load_pointer((struct sk_buff *) ctx, off, 4, &tmp); + ptr = load_pointer((struct sk_buff *) CTX, off, 4, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be32(ptr); CONT; } return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + K)) */ - off = K; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; load_half: - ptr = load_pointer((struct sk_buff *) ctx, off, 2, &tmp); + ptr = load_pointer((struct sk_buff *) CTX, off, 2, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be16(ptr); CONT; } return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (ctx + K) */ - off = K; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; load_byte: - ptr = load_pointer((struct sk_buff *) ctx, off, 1, &tmp); + ptr = load_pointer((struct sk_buff *) CTX, off, 1, &tmp); if (likely(ptr != NULL)) { BPF_R0 = *(u8 *)ptr; CONT; } return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + X + K)) */ - off = K + X; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + X + K)) */ - off = K + X; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + X + K) */ - off = K + X; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; goto load_byte; default_label: @@ -675,7 +675,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - /* A = *(u16 *) (ctx + offsetof(protocol)) */ + /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ @@ -741,7 +741,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); - /* A = *(u16 *) (ctx + offsetof(vlan_tci)) */ + /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, vlan_tci)); if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { @@ -760,13 +760,13 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: - /* arg1 = ctx */ + /* arg1 = CTX */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); - /* Emit call(ctx, arg2=A, arg3=X) */ + /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: *insn = BPF_EMIT_CALL(__skb_get_pay_offset); @@ -941,12 +941,12 @@ do_pass: */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_TMP; + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { - insn->a_reg = BPF_REG_A; - insn->x_reg = BPF_REG_X; + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); } -- cgit v1.2.3-71-gd317 From 67cb9366ff5f99868100198efba5ca88aaa6ad25 Mon Sep 17 
00:00:00 2001 From: Daniel Borkmann Date: Wed, 11 Jun 2014 18:19:28 +0200 Subject: ktime: add ktime_after and ktime_before helper Add two minimal helper functions analogous to time_before() and time_after() that will later on both be needed by SCTP code. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/ktime.h | 24 ++++++++++++++++++++++++ net/sctp/sm_make_chunk.c | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 31c0cd1c941a..de9e46e6bcc9 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -304,6 +304,30 @@ static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) return 0; } +/** + * ktime_after - Compare if a ktime_t value is bigger than another one. + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Return: true if cmp1 happened after cmp2. + */ +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return ktime_compare(cmp1, cmp2) > 0; +} + +/** + * ktime_before - Compare if a ktime_t value is smaller than another one. + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Return: true if cmp1 happened before cmp2. + */ +static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) +{ + return ktime_compare(cmp1, cmp2) < 0; +} + static inline s64 ktime_to_us(const ktime_t kt) { struct timeval tv = ktime_to_timeval(kt); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index fee5552ddf92..ae0e616a7ca5 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1782,7 +1782,7 @@ no_hmac: else kt = ktime_get(); - if (!asoc && ktime_compare(bear_cookie->expiration, kt) < 0) { + if (!asoc && ktime_before(bear_cookie->expiration, kt)) { /* * Section 3.3.10.3 Stale Cookie Error (3) * -- cgit v1.2.3-71-gd317 From 2940474af79744411da0cb63b041ad52c57bc443 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2014 13:49:23 +0200 Subject: block: remove elv_abort_queue and blk_abort_flushes elv_abort_queue has no callers, and blk_abort_flushes is only called by elv_abort_queue. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 38 -------------------------------------- block/blk.h | 1 - block/elevator.c | 20 -------------------- include/linux/elevator.h | 1 - 4 files changed, 60 deletions(-) (limited to 'include/linux') diff --git a/block/blk-flush.c b/block/blk-flush.c index 8ffee4b5f93d..3cb5e9e7108a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -421,44 +421,6 @@ void blk_insert_flush(struct request *rq) blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); } -/** - * blk_abort_flushes - @q is being aborted, abort flush requests - * @q: request_queue being aborted - * - * To be called from elv_abort_queue(). @q is being aborted. Prepare all - * FLUSH/FUA requests for abortion. - * - * CONTEXT: - * spin_lock_irq(q->queue_lock) - */ -void blk_abort_flushes(struct request_queue *q) -{ - struct request *rq, *n; - int i; - - /* - * Requests in flight for data are already owned by the dispatch - * queue or the device driver. Just restore for normal completion. - */ - list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { - list_del_init(&rq->flush.list); - blk_flush_restore_request(rq); - } - - /* - * We need to give away requests on flush queues. Restore for - * normal completion and put them on the dispatch queue. 
- */ - for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { - list_for_each_entry_safe(rq, n, &q->flush_queue[i], - flush.list) { - list_del_init(&rq->flush.list); - blk_flush_restore_request(rq); - list_add_tail(&rq->queuelist, &q->queue_head); - } - } -} - /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for diff --git a/block/blk.h b/block/blk.h index 45385e9abf6f..6748c4f8d7a1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -84,7 +84,6 @@ static inline void blk_clear_rq_complete(struct request *rq) #define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED) void blk_insert_flush(struct request *rq); -void blk_abort_flushes(struct request_queue *q); static inline struct request *__elv_next_request(struct request_queue *q) { diff --git a/block/elevator.c b/block/elevator.c index f35edddfe9b5..34bded18910e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -729,26 +729,6 @@ int elv_may_queue(struct request_queue *q, int rw) return ELV_MQUEUE_MAY; } -void elv_abort_queue(struct request_queue *q) -{ - struct request *rq; - - blk_abort_flushes(q); - - while (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - rq->cmd_flags |= REQ_QUIET; - trace_block_rq_abort(q, rq); - /* - * Mark this request as started so we don't trigger - * any debug logic in the end I/O path. - */ - blk_start_request(rq); - __blk_end_request_all(rq, -EIO); - } -} -EXPORT_SYMBOL(elv_abort_queue); - void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4ff262e2bf37..e2a6bd7fb133 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -133,7 +133,6 @@ extern struct request *elv_latter_request(struct request_queue *, struct request extern int elv_register_queue(struct request_queue *q); extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, int); -extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, struct bio *bio, gfp_t gfp_mask); -- cgit v1.2.3-71-gd317 From da91309e0a7e8966d916a74cce42ed170fde06bf Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 9 Jun 2014 10:24:38 +0300 Subject: cpumask: Utility function to set n'th cpu - local cpu first This function sets the n'th cpu - local cpu's first. For example: in a 16 cores server with even cpu's local, will get the following values: cpumask_set_cpu_local_first(0, numa, cpumask) => cpu 0 is set cpumask_set_cpu_local_first(1, numa, cpumask) => cpu 2 is set ... cpumask_set_cpu_local_first(7, numa, cpumask) => cpu 14 is set cpumask_set_cpu_local_first(8, numa, cpumask) => cpu 1 is set cpumask_set_cpu_local_first(9, numa, cpumask) => cpu 3 is set ... cpumask_set_cpu_local_first(15, numa, cpumask) => cpu 15 is set Curently this function will be used by multi queue networking devices to calculate the irq affinity mask, such that as many local cpu's as possible will be utilized to handle the mq device irq's. Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/linux/cpumask.h | 8 +++++++ lib/cpumask.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d08e4d2a9b92..d5ef249735d2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -142,6 +142,13 @@ static inline unsigned int cpumask_any_but(const struct cpumask *mask, return 1; } +static inline int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +{ + set_bit(0, cpumask_bits(dstp)); + + return 0; +} + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ @@ -192,6 +199,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); +int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp); /** * for_each_cpu - iterate over every cpu in a mask diff --git a/lib/cpumask.c b/lib/cpumask.c index b810b753c607..c101230658eb 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -164,3 +164,66 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) memblock_free_early(__pa(mask), cpumask_size()); } #endif + +/** + * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first + * + * @i: index number + * @numa_node: local numa_node + * @dstp: cpumask with the relevant cpu bit set according to the policy + * + * This function sets the cpumask according to a numa aware policy. + * cpumask could be used as an affinity hint for the IRQ related to a + * queue. When the policy is to spread queues across cores - local cores + * first. + * + * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set + * the cpu bit and need to re-call the function. + */ +int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +{ + cpumask_var_t mask; + int cpu; + int ret = 0; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + i %= num_online_cpus(); + + if (!cpumask_of_node(numa_node)) { + /* Use all online cpu's for non numa aware system */ + cpumask_copy(mask, cpu_online_mask); + } else { + int n; + + cpumask_and(mask, + cpumask_of_node(numa_node), cpu_online_mask); + + n = cpumask_weight(mask); + if (i >= n) { + i -= n; + + /* If index > number of local cpu's, mask out local + * cpu's + */ + cpumask_andnot(mask, cpu_online_mask, mask); + } + } + + for_each_cpu(cpu, mask) { + if (--i < 0) + goto out; + } + + ret = -EAGAIN; + +out: + free_cpumask_var(mask); + + if (!ret) + cpumask_set_cpu(cpu, dstp); + + return ret; +} +EXPORT_SYMBOL(cpumask_set_cpu_local_first); -- cgit v1.2.3-71-gd317 From bad93e9d4eeb0d2d6b79204d6cedc7f2e7b256f1 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Thu, 12 Jun 2014 01:36:26 +0300 Subject: net: add __pskb_copy_fclone and pskb_copy_for_clone There are several instances where a pskb_copy or __pskb_copy is immediately followed by an skb_clone. Add a couple of new functions to allow the copy skb to be allocated from the fclone cache and thus speed up subsequent skb_clone calls. 
Cc: Alexander Smirnov Cc: Dmitry Eremin-Solenikov Cc: Marek Lindner Cc: Simon Wunderlich Cc: Antonio Quartulli Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Cc: Arvid Brodin Cc: Patrick McHardy Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Lauro Ramos Venancio Cc: Aloisio Almeida Jr Cc: Samuel Ortiz Cc: Jon Maloy Cc: Allan Stephens Cc: Andrew Hendry Cc: Eric Dumazet Reviewed-by: Christoph Paasch Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 +++++++++++++++- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/network-coding.c | 2 +- net/bluetooth/hci_sock.c | 6 +++--- net/core/skbuff.c | 14 +++++++++----- net/nfc/llcp_core.c | 4 ++-- net/nfc/rawsock.c | 4 ++-- net/tipc/bcast.c | 2 +- 8 files changed, 34 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c705808bef9c..1f50bfe2243d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -744,7 +744,13 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); -struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask); +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, + gfp_t gfp_mask, bool fclone); +static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, + gfp_t gfp_mask) +{ + return __pskb_copy_fclone(skb, headroom, gfp_mask, false); +} int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, @@ -2238,6 +2244,14 @@ static inline struct sk_buff *pskb_copy(struct sk_buff *skb, return __pskb_copy(skb, skb_headroom(skb), gfp_mask); } + +static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb, + gfp_t gfp_mask) +{ + return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true); +} + + /** * skb_clone_writable - is the header of a clone writable * @skb: buffer to check diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index dcd99b2bea3c..f2c066b21716 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -594,7 +594,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, if (!neigh_node) goto free_orig; - tmp_skb = pskb_copy(skb, GFP_ATOMIC); + tmp_skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb, cand[i].orig_node, packet_subtype)) { diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 40a2fc4bcf4c..8d04d174669e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1344,7 +1344,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, struct ethhdr *ethhdr; /* Copy skb header to change the mac header */ - skb = pskb_copy(skb, GFP_ATOMIC); + skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!skb) return; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index f608bffdb8b9..80d25c150a65 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -143,7 +143,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) if (!skb_copy) { /* Create a private copy with headroom */ - skb_copy = __pskb_copy(skb, 1, GFP_ATOMIC); + skb_copy = __pskb_copy_fclone(skb, 1, 
GFP_ATOMIC, true); if (!skb_copy) continue; @@ -247,8 +247,8 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) struct hci_mon_hdr *hdr; /* Create a private copy with headroom */ - skb_copy = __pskb_copy(skb, HCI_MON_HDR_SIZE, - GFP_ATOMIC); + skb_copy = __pskb_copy_fclone(skb, HCI_MON_HDR_SIZE, + GFP_ATOMIC, true); if (!skb_copy) continue; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 05f4bef2ce12..b9e85e6cb26a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -951,10 +951,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) EXPORT_SYMBOL(skb_copy); /** - * __pskb_copy - create copy of an sk_buff with private head. + * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority + * @fclone: if true allocate the copy of the skb from the fclone + * cache instead of the head cache; it is recommended to set this + * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when @@ -964,11 +967,12 @@ EXPORT_SYMBOL(skb_copy); * The returned buffer has a reference count of 1. */ -struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, + gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); + struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; @@ -1008,7 +1012,7 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) out: return n; } -EXPORT_SYMBOL(__pskb_copy); +EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index f6278da68763..51e788797317 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -680,8 +680,8 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, continue; if (skb_copy == NULL) { - skb_copy = __pskb_copy(skb, NFC_RAW_HEADER_SIZE, - GFP_ATOMIC); + skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, + GFP_ATOMIC, true); if (skb_copy == NULL) continue; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 55eefee311eb..11c3544ea546 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -378,8 +378,8 @@ void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb, sk_for_each(sk, &raw_sk_list.head) { if (!skb_copy) { - skb_copy = __pskb_copy(skb, NFC_RAW_HEADER_SIZE, - GFP_ATOMIC); + skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, + GFP_ATOMIC, true); if (!skb_copy) continue; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 671f9817b4f4..26631679a1fa 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -653,7 +653,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, tipc_bearer_send(b->identity, buf, &b->bcast_addr); } else { /* Avoid concurrent buffer access */ - tbuf = pskb_copy(buf, GFP_ATOMIC); + tbuf = pskb_copy_for_clone(buf, GFP_ATOMIC); if (!tbuf) break; tipc_bearer_send(b->identity, tbuf, &b->bcast_addr); -- cgit v1.2.3-71-gd317 From 5d0c2b95bc57cf8fdc0e7b3e9d7e751eb65ad052 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Jun 2014 18:54:13 -0700 Subject: net: Preserve 
CHECKSUM_COMPLETE at validation Currently when the first checksum in a packet is validated using CHECKSUM_COMPLETE, ip_summed is overwritten to be CHECKSUM_UNNECESSARY so that any subsequent checksums in the packet are not correctly validated. This patch adds csum_valid flag in sk_buff and uses that to indicate validated checksum instead of setting CHECKSUM_UNNECESSARY. The bit is set accordingly in the skb_checksum_validate_* functions. The flag is checked in skb_checksum_complete, so that validation is communicated between checksum_init and checksum_complete sequence in TCP and UDP. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1f50bfe2243d..72a53805858a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -572,7 +572,8 @@ struct sk_buff { */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; - /* 5/7 bit hole (depending on ndisc_nodetype presence) */ + __u8 csum_valid:1; + /* 4/6 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL @@ -2735,7 +2736,7 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb); static inline int skb_csum_unnecessary(const struct sk_buff *skb) { - return skb->ip_summed & CHECKSUM_UNNECESSARY; + return ((skb->ip_summed & CHECKSUM_UNNECESSARY) || skb->csum_valid); } /** @@ -2769,10 +2770,8 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { - if (skb_csum_unnecessary(skb)) { - return false; - } else if (zero_okay && !check) { - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { + skb->csum_valid = 1; return false; } @@ -2799,15 +2798,20 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, { if (skb->ip_summed == CHECKSUM_COMPLETE) { if (!csum_fold(csum_add(psum, skb->csum))) { - skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_valid = 1; return 0; } } skb->csum = psum; - if (complete || skb->len <= CHECKSUM_BREAK) - return __skb_checksum_complete(skb); + if (complete || skb->len <= CHECKSUM_BREAK) { + __sum16 csum; + + csum = __skb_checksum_complete(skb); + skb->csum_valid = !csum; + return csum; + } return 0; } @@ -2831,6 +2835,7 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) zero_okay, check, compute_pseudo) \ ({ \ __sum16 __ret = 0; \ + skb->csum_valid = 0; \ if (__skb_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_checksum_validate_complete(skb, \ complete, compute_pseudo(skb, proto)); \ -- cgit v1.2.3-71-gd317 From 7e3cead5172927732f51fde77fef6f521e22f209 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Jun 2014 18:54:19 -0700 Subject: net: Save software checksum complete In skb_checksum complete, if we need to compute the checksum for the packet (via skb_checksum) save the result as CHECKSUM_COMPLETE. Subsequent checksum verification can use this. Also, added csum_complete_sw flag to distinguish between software and hardware generated checksum complete, we should always be able to trust the software computation. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 3 ++- net/core/datagram.c | 14 +++++++++----- net/ipv4/gre_offload.c | 6 ++++-- net/sunrpc/socklib.c | 3 ++- 4 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72a53805858a..5b5cd3189c98 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -573,7 +573,8 @@ struct sk_buff { __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; - /* 4/6 bit hole (depending on ndisc_nodetype presence) */ + __u8 csum_complete_sw:1; + /* 3/5 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL diff --git a/net/core/datagram.c b/net/core/datagram.c index a16ed7bbe376..6b1c04ca1d50 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -739,11 +739,15 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) - netdev_rx_csum_fault(skb->dev); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !sum && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + + /* Save checksum complete for later use */ + skb->csum = sum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 24deb3928b9e..eb92deb12666 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -131,10 +131,12 @@ static __sum16 gro_skb_checksum(struct sk_buff *skb) csum_partial(skb->data, skb_gro_offset(skb), 0)); sum = csum_fold(NAPI_GRO_CB(skb)->csum); if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { - if (unlikely(!sum)) + if (unlikely(!sum) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); - } else + } else { skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + } return sum; } diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 0a648c502fc3..2df87f78e518 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -173,7 +173,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if (csum_fold(desc.csum)) return -1; - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); return 0; no_checksum: -- cgit v1.2.3-71-gd317 From 8d0207652cbe27d1f962050737848e5ad4671958 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Apr 2014 04:27:08 -0400 Subject: ->splice_write() via ->write_iter() iter_file_splice_write() - a ->splice_write() instance that gathers the pipe buffers, builds a bio_vec-based iov_iter covering those and feeds it to ->write_iter(). A bunch of simple cases coverted to that... 
[AV: fixed the braino spotted by Cyrill] Signed-off-by: Al Viro --- fs/block_dev.c | 2 +- fs/exofs/file.c | 2 +- fs/ext2/file.c | 2 +- fs/ext3/file.c | 2 +- fs/ext4/file.c | 2 +- fs/f2fs/file.c | 2 +- fs/gfs2/file.c | 4 +- fs/jfs/file.c | 2 +- fs/ramfs/file-mmu.c | 2 +- fs/ramfs/file-nommu.c | 2 +- fs/reiserfs/file.c | 2 +- fs/splice.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ubifs/file.c | 2 +- fs/xfs/xfs_file.c | 43 +--------------- fs/xfs/xfs_trace.h | 1 - include/linux/fs.h | 2 + 16 files changed, 156 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 4e36b8ea8aa4..e68e150b1b16 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1583,7 +1583,7 @@ const struct file_operations def_blk_fops = { .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 5b7f6be5a2d5..71bf8e4fb5d4 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -77,7 +77,7 @@ const struct file_operations exofs_file_operations = { .fsync = exofs_file_fsync, .flush = exofs_flush, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations exofs_file_inode_operations = { diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 970c6aca15cc..7c87b22a7228 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -75,7 +75,7 @@ const struct file_operations ext2_file_operations = { .release = ext2_release_file, .fsync = ext2_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; #ifdef CONFIG_EXT2_FS_XIP diff --git a/fs/ext3/file.c b/fs/ext3/file.c index c833b1226d4d..a062fa1e1b11 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -63,7 +63,7 @@ const struct file_operations ext3_file_operations = { .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations ext3_file_inode_operations = { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 48383a5f37a1..708aad768199 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -599,7 +599,7 @@ const struct file_operations ext4_file_operations = { .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 22f4900dd8eb..e4ba4b93f96a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -692,5 +692,5 @@ const struct file_operations f2fs_file_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ca932cd358d3..01b4c5b1bff8 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1068,7 +1068,7 @@ const struct file_operations gfs2_file_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .setlease = gfs2_setlease, .fallocate = gfs2_fallocate, }; @@ 
-1098,7 +1098,7 @@ const struct file_operations gfs2_file_fops_nolock = { .release = gfs2_release, .fsync = gfs2_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, }; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index cc744ecaf51f..33aa0cc1f8b8 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -157,7 +157,7 @@ const struct file_operations jfs_file_operations = { .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fsync = jfs_fsync, .release = jfs_release, .unlocked_ioctl = jfs_ioctl, diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 6ea0b9718a9d..4f56de822d2f 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -38,7 +38,7 @@ const struct file_operations ramfs_file_operations = { .mmap = generic_file_mmap, .fsync = noop_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 9ed420f8f3ca..dda012ad4208 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -43,7 +43,7 @@ const struct file_operations ramfs_file_operations = { .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 7c8ecd6468db..f070cc827456 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -248,7 +248,7 @@ const struct file_operations reiserfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/splice.c b/fs/splice.c index f99e420744c7..f195a9b89fb2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" /* @@ -1052,6 +1053,145 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); +/** + * iter_file_splice_write - splice data from a pipe to a file + * @pipe: pipe info + * @out: file to write to + * @ppos: position in @out + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. + * This one is ->write_iter-based. 
+ * + */ +ssize_t +iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + int nbufs = pipe->buffers; + struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); + ssize_t ret; + + if (unlikely(!array)) + return -ENOMEM; + + pipe_lock(pipe); + + splice_from_pipe_begin(&sd); + while (sd.total_len) { + struct iov_iter from; + struct kiocb kiocb; + size_t left; + int n, idx; + + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + if (unlikely(nbufs < pipe->buffers)) { + kfree(array); + nbufs = pipe->buffers; + array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); + if (!array) { + ret = -ENOMEM; + break; + } + } + + /* build the vector */ + left = sd.total_len; + for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { + struct pipe_buffer *buf = pipe->bufs + idx; + size_t this_len = buf->len; + + if (this_len > left) + this_len = left; + + if (idx == pipe->buffers - 1) + idx = -1; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + goto done; + } + + array[n].bv_page = buf->page; + array[n].bv_len = this_len; + array[n].bv_offset = buf->offset; + left -= this_len; + } + + /* ... iov_iter */ + from.type = ITER_BVEC | WRITE; + from.bvec = array; + from.nr_segs = n; + from.count = sd.total_len - left; + from.iov_offset = 0; + + /* ... and iocb */ + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + kiocb.ki_nbytes = sd.total_len - left; + + /* now, send it */ + ret = out->f_op->write_iter(&kiocb, &from); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret <= 0) + break; + + sd.num_spliced += ret; + sd.total_len -= ret; + *ppos = sd.pos = kiocb.ki_pos; + + /* dismiss the fully eaten buffers, adjust the partial one */ + while (ret) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + if (ret >= buf->len) { + const struct pipe_buf_operations *ops = buf->ops; + ret -= buf->len; + buf->len = 0; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->files) + sd.need_wakeup = true; + } else { + buf->offset += ret; + buf->len -= ret; + ret = 0; + } + } + } +done: + kfree(array); + splice_from_pipe_end(pipe, &sd); + + pipe_unlock(pipe); + + if (sd.num_spliced) + ret = sd.num_spliced; + + return ret; +} + +EXPORT_SYMBOL(iter_file_splice_write); + static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 6bc4e8efbccf..0888502a6041 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1585,7 +1585,7 @@ const struct file_operations ubifs_file_operations = { .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5446e86d3485..b1c489c1fb2e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -342,47 +342,6 @@ xfs_file_splice_read( return ret; } -/* - * xfs_file_splice_write() does not use xfs_rw_ilock() because - * generic_file_splice_write() takes the i_mutex itself. 
This, in theory, - * couuld cause lock inversions between the aio_write path and the splice path - * if someone is doing concurrent splice(2) based writes and write(2) based - * writes to the same inode. The only real way to fix this is to re-implement - * the generic code here with correct locking orders. - */ -STATIC ssize_t -xfs_file_splice_write( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - unsigned int flags) -{ - struct inode *inode = outfilp->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - int ioflags = 0; - ssize_t ret; - - XFS_STATS_INC(xs_write_calls); - - if (outfilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - /* * This routine is called to handle zeroing any space in the last block of the * file that is beyond the EOF. We do this since the size is being increased @@ -1442,7 +1401,7 @@ const struct file_operations xfs_file_operations = { .read_iter = xfs_file_read_iter, .write_iter = xfs_file_write_iter, .splice_read = xfs_file_splice_read, - .splice_write = xfs_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 65d8c793a25c..53182f97cf01 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1060,7 +1060,6 @@ DEFINE_RW_EVENT(xfs_file_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); -DEFINE_RW_EVENT(xfs_file_splice_write); DECLARE_EVENT_CLASS(xfs_page_class, TP_PROTO(struct inode *inode, struct page *page, unsigned long off, diff --git a/include/linux/fs.h b/include/linux/fs.h index a6448849dbce..8bd8ed357c7b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2434,6 +2434,8 @@ extern ssize_t default_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); +extern ssize_t iter_file_splice_write(struct pipe_inode_info *, + struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); -- cgit v1.2.3-71-gd317 From 96f9bc8fbc2440d90e15f02398e1de43f674b433 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Apr 2014 04:35:49 -0400 Subject: fs/splice.c: remove unneeded exports ocfs2 was using a bunch of splice.c guts... Signed-off-by: Al Viro --- fs/splice.c | 15 +++++---------- include/linux/splice.h | 10 ---------- 2 files changed, 5 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index f195a9b89fb2..ab84051758a7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -738,7 +738,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create * a new page in the output file page cache and fill/dirty that. 
*/ -int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, +static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->u.file; @@ -773,7 +773,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, out: return ret; } -EXPORT_SYMBOL(pipe_to_file); static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { @@ -803,7 +802,7 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) * locking is required around copying the pipe buffers to the * destination. */ -int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, +static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; @@ -850,7 +849,6 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, return 1; } -EXPORT_SYMBOL(splice_from_pipe_feed); /** * splice_from_pipe_next - wait for some data to splice from @@ -862,7 +860,7 @@ EXPORT_SYMBOL(splice_from_pipe_feed); * value (one) if pipe buffers are available. It will return zero * or -errno if no more data needs to be spliced. */ -int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { while (!pipe->nrbufs) { if (!pipe->writers) @@ -887,7 +885,6 @@ int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) return 1; } -EXPORT_SYMBOL(splice_from_pipe_next); /** * splice_from_pipe_begin - start splicing from pipe @@ -898,12 +895,11 @@ EXPORT_SYMBOL(splice_from_pipe_next); * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ -void splice_from_pipe_begin(struct splice_desc *sd) +static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } -EXPORT_SYMBOL(splice_from_pipe_begin); /** * splice_from_pipe_end - finish splicing from pipe @@ -915,12 +911,11 @@ EXPORT_SYMBOL(splice_from_pipe_begin); * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). 
*/ -void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } -EXPORT_SYMBOL(splice_from_pipe_end); /** * __splice_from_pipe - splice data from a pipe to given actor diff --git a/include/linux/splice.h b/include/linux/splice.h index 0e43906d2fda..da2751d3b93d 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -70,16 +70,6 @@ extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, splice_actor *); extern ssize_t __splice_from_pipe(struct pipe_inode_info *, struct splice_desc *, splice_actor *); -extern int splice_from_pipe_feed(struct pipe_inode_info *, struct splice_desc *, - splice_actor *); -extern int splice_from_pipe_next(struct pipe_inode_info *, - struct splice_desc *); -extern void splice_from_pipe_begin(struct splice_desc *); -extern void splice_from_pipe_end(struct pipe_inode_info *, - struct splice_desc *); -extern int pipe_to_file(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); - extern ssize_t splice_to_pipe(struct pipe_inode_info *, struct splice_pipe_desc *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, -- cgit v1.2.3-71-gd317 From 5f073850602084fbcbb987948ff3e70ae273f7d2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Apr 2014 13:31:50 -0400 Subject: kill generic_file_splice_write() no callers left Signed-off-by: Al Viro --- fs/splice.c | 124 ----------------------------------------------------- include/linux/fs.h | 2 - 2 files changed, 126 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index ab84051758a7..8e7eef755a9b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -718,62 +718,6 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, sd->len, &pos, more); } -/* - * This is a little more tricky than the file -> pipe splicing. There are - * basically three cases: - * - * - Destination page already exists in the address space and there - * are users of it. For that case we have no other option that - * copying the data. Tough luck. - * - Destination page already exists in the address space, but there - * are no users of it. Make sure it's uptodate, then drop it. Fall - * through to last case. - * - Destination page does not exist, we can add the pipe page to - * the page cache and avoid the copy. - * - * If asked to move pages to the output file (SPLICE_F_MOVE is set in - * sd->flags), we attempt to migrate pages from the pipe to the output - * file address space page cache. This is possible if no one else has - * the pipe page referenced outside of the pipe and page cache. If - * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create - * a new page in the output file page cache and fill/dirty that. 
- */ -static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) -{ - struct file *file = sd->u.file; - struct address_space *mapping = file->f_mapping; - unsigned int offset, this_len; - struct page *page; - void *fsdata; - int ret; - - offset = sd->pos & ~PAGE_CACHE_MASK; - - this_len = sd->len; - if (this_len + offset > PAGE_CACHE_SIZE) - this_len = PAGE_CACHE_SIZE - offset; - - ret = pagecache_write_begin(file, mapping, sd->pos, this_len, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); - if (unlikely(ret)) - goto out; - - if (buf->page != page) { - char *src = kmap_atomic(buf->page); - char *dst = kmap_atomic(page); - - memcpy(dst + offset, src + buf->offset, this_len); - flush_dcache_page(page); - kunmap_atomic(dst); - kunmap_atomic(src); - } - ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, - page, fsdata); -out: - return ret; -} - static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); @@ -980,74 +924,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, return ret; } -/** - * generic_file_splice_write - splice data from a pipe to a file - * @pipe: pipe info - * @out: file to write to - * @ppos: position in @out - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. - * - */ -ssize_t -generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - ssize_t ret; - - pipe_lock(pipe); - - splice_from_pipe_begin(&sd); - do { - ret = splice_from_pipe_next(pipe, &sd); - if (ret <= 0) - break; - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - ret = file_remove_suid(out); - if (!ret) { - ret = file_update_time(out); - if (!ret) - ret = splice_from_pipe_feed(pipe, &sd, - pipe_to_file); - } - mutex_unlock(&inode->i_mutex); - } while (ret > 0); - splice_from_pipe_end(pipe, &sd); - - pipe_unlock(pipe); - - if (sd.num_spliced) - ret = sd.num_spliced; - - if (ret > 0) { - int err; - - err = generic_write_sync(out, *ppos, ret); - if (err) - ret = err; - else - *ppos += ret; - balance_dirty_pages_ratelimited(mapping); - } - - return ret; -} - -EXPORT_SYMBOL(generic_file_splice_write); - /** * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info diff --git a/include/linux/fs.h b/include/linux/fs.h index 8bd8ed357c7b..4e92d551518d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2432,8 +2432,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); extern ssize_t default_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); -extern ssize_t generic_file_splice_write(struct pipe_inode_info *, - struct file *, loff_t *, size_t, unsigned int); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, -- cgit v1.2.3-71-gd317 From 1a539d372edd9832444e7a3daa710c444c014dc9 Mon Sep 17 00:00:00 2001 From: Tomas Pop Date: Thu, 5 Jun 2014 15:24:19 -0700 Subject: hwmon: add support for Sensirion SHTC1 sensor Add support for Sensirion SHTC1 and 
compatible temperature and humidity sensors. Signed-off-by: Tomas Pop Signed-off-by: Guenter Roeck --- Documentation/hwmon/shtc1 | 43 ++++++ drivers/hwmon/Kconfig | 10 ++ drivers/hwmon/Makefile | 1 + drivers/hwmon/shtc1.c | 251 ++++++++++++++++++++++++++++++++++++ include/linux/platform_data/shtc1.h | 23 ++++ 5 files changed, 328 insertions(+) create mode 100644 Documentation/hwmon/shtc1 create mode 100644 drivers/hwmon/shtc1.c create mode 100644 include/linux/platform_data/shtc1.h (limited to 'include/linux') diff --git a/Documentation/hwmon/shtc1 b/Documentation/hwmon/shtc1 new file mode 100644 index 000000000000..6b1e05458f0f --- /dev/null +++ b/Documentation/hwmon/shtc1 @@ -0,0 +1,43 @@ +Kernel driver shtc1 +=================== + +Supported chips: + * Sensirion SHTC1 + Prefix: 'shtc1' + Addresses scanned: none + Datasheet: http://www.sensirion.com/file/datasheet_shtc1 + + * Sensirion SHTW1 + Prefix: 'shtw1' + Addresses scanned: none + Datasheet: Not publicly available + +Author: + Johannes Winkelmann + +Description +----------- + +This driver implements support for the Sensirion SHTC1 chip, a humidity and +temperature sensor. Temperature is measured in degrees Celsius; relative +humidity is expressed as a percentage. The driver can also be used for the +SHTW1 chip, which has the same electrical interface. + +The device communicates using the I2C protocol. All sensors are set to I2C +address 0x70. See Documentation/i2c/instantiating-devices for methods to +instantiate the device. + +There are two options configurable by means of shtc1_platform_data: +1. blocking (pull the I2C clock line down while performing the measurement) or + non-blocking mode. Blocking mode will guarantee the fastest result, but + the I2C bus will be busy during that time. By default, non-blocking mode + is used. Make sure clock-stretching works properly on your device if you + want to use blocking mode. +2. high or low accuracy. High accuracy is used by default and is + strongly recommended. + +sysfs-Interface +--------------- + +temp1_input - temperature input +humidity1_input - humidity input diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 00343166feb1..08531a128f53 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1124,6 +1124,16 @@ config SENSORS_SHT21 This driver can also be built as a module. If so, the module will be called sht21. +config SENSORS_SHTC1 + tristate "Sensirion humidity and temperature sensors. SHTC1 and compat." + depends on I2C + help + If you say yes here you get support for the Sensirion SHTC1 and SHTW1 + humidity and temperature sensors. + + This driver can also be built as a module. If so, the module + will be called shtc1. 
+ config SENSORS_S3C tristate "Samsung built-in ADC" depends on S3C_ADC diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 11798ad7e801..3dc0f02f71d2 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -126,6 +126,7 @@ obj-$(CONFIG_SENSORS_SCH5627) += sch5627.o obj-$(CONFIG_SENSORS_SCH5636) += sch5636.o obj-$(CONFIG_SENSORS_SHT15) += sht15.o obj-$(CONFIG_SENSORS_SHT21) += sht21.o +obj-$(CONFIG_SENSORS_SHTC1) += shtc1.o obj-$(CONFIG_SENSORS_SIS5595) += sis5595.o obj-$(CONFIG_SENSORS_SMM665) += smm665.o obj-$(CONFIG_SENSORS_SMSC47B397)+= smsc47b397.o diff --git a/drivers/hwmon/shtc1.c b/drivers/hwmon/shtc1.c new file mode 100644 index 000000000000..decd7df995ab --- /dev/null +++ b/drivers/hwmon/shtc1.c @@ -0,0 +1,251 @@ +/* Sensirion SHTC1 humidity and temperature sensor driver + * + * Copyright (C) 2014 Sensirion AG, Switzerland + * Author: Johannes Winkelmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* commands (high precision mode) */ +static const unsigned char shtc1_cmd_measure_blocking_hpm[] = { 0x7C, 0xA2 }; +static const unsigned char shtc1_cmd_measure_nonblocking_hpm[] = { 0x78, 0x66 }; + +/* commands (low precision mode) */ +static const unsigned char shtc1_cmd_measure_blocking_lpm[] = { 0x64, 0x58 }; +static const unsigned char shtc1_cmd_measure_nonblocking_lpm[] = { 0x60, 0x9c }; + +/* command for reading the ID register */ +static const unsigned char shtc1_cmd_read_id_reg[] = { 0xef, 0xc8 }; + +/* constants for reading the ID register */ +#define SHTC1_ID 0x07 +#define SHTC1_ID_REG_MASK 0x1f + +/* delays for non-blocking i2c commands, both in us */ +#define SHTC1_NONBLOCKING_WAIT_TIME_HPM 14400 +#define SHTC1_NONBLOCKING_WAIT_TIME_LPM 1000 + +#define SHTC1_CMD_LENGTH 2 +#define SHTC1_RESPONSE_LENGTH 6 + +struct shtc1_data { + struct i2c_client *client; + struct mutex update_lock; + bool valid; + unsigned long last_updated; /* in jiffies */ + + const unsigned char *command; + unsigned int nonblocking_wait_time; /* in us */ + + struct shtc1_platform_data setup; + + int temperature; /* 1000 * temperature in dgr C */ + int humidity; /* 1000 * relative humidity in %RH */ +}; + +static int shtc1_update_values(struct i2c_client *client, + struct shtc1_data *data, + char *buf, int bufsize) +{ + int ret = i2c_master_send(client, data->command, SHTC1_CMD_LENGTH); + if (ret != SHTC1_CMD_LENGTH) { + dev_err(&client->dev, "failed to send command: %d\n", ret); + return ret < 0 ? ret : -EIO; + } + + /* + * In blocking mode (clock stretching mode) the I2C bus + * is blocked for other traffic, thus the call to i2c_master_recv() + * will wait until the data is ready. For non blocking mode, we + * have to wait ourselves. + */ + if (!data->setup.blocking_io) + usleep_range(data->nonblocking_wait_time, + data->nonblocking_wait_time + 1000); + + ret = i2c_master_recv(client, buf, bufsize); + if (ret != bufsize) { + dev_err(&client->dev, "failed to read values: %d\n", ret); + return ret < 0 ? 
ret : -EIO; + } + + return 0; +} + +/* sysfs attributes */ +static struct shtc1_data *shtc1_update_client(struct device *dev) +{ + struct shtc1_data *data = dev_get_drvdata(dev); + struct i2c_client *client = data->client; + unsigned char buf[SHTC1_RESPONSE_LENGTH]; + int val; + int ret = 0; + + mutex_lock(&data->update_lock); + + if (time_after(jiffies, data->last_updated + HZ / 10) || !data->valid) { + ret = shtc1_update_values(client, data, buf, sizeof(buf)); + if (ret) + goto out; + + /* + * From datasheet: + * T = -45 + 175 * ST / 2^16 + * RH = 100 * SRH / 2^16 + * + * Adapted for integer fixed point (3 digit) arithmetic. + */ + val = be16_to_cpup((__be16 *)buf); + data->temperature = ((21875 * val) >> 13) - 45000; + val = be16_to_cpup((__be16 *)(buf + 3)); + data->humidity = ((12500 * val) >> 13); + + data->last_updated = jiffies; + data->valid = true; + } + +out: + mutex_unlock(&data->update_lock); + + return ret == 0 ? data : ERR_PTR(ret); +} + +static ssize_t temp1_input_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct shtc1_data *data = shtc1_update_client(dev); + if (IS_ERR(data)) + return PTR_ERR(data); + + return sprintf(buf, "%d\n", data->temperature); +} + +static ssize_t humidity1_input_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct shtc1_data *data = shtc1_update_client(dev); + if (IS_ERR(data)) + return PTR_ERR(data); + + return sprintf(buf, "%d\n", data->humidity); +} + +static DEVICE_ATTR_RO(temp1_input); +static DEVICE_ATTR_RO(humidity1_input); + +static struct attribute *shtc1_attrs[] = { + &dev_attr_temp1_input.attr, + &dev_attr_humidity1_input.attr, + NULL +}; + +ATTRIBUTE_GROUPS(shtc1); + +static void shtc1_select_command(struct shtc1_data *data) +{ + if (data->setup.high_precision) { + data->command = data->setup.blocking_io ? + shtc1_cmd_measure_blocking_hpm : + shtc1_cmd_measure_nonblocking_hpm; + data->nonblocking_wait_time = SHTC1_NONBLOCKING_WAIT_TIME_HPM; + + } else { + data->command = data->setup.blocking_io ? + shtc1_cmd_measure_blocking_lpm : + shtc1_cmd_measure_nonblocking_lpm; + data->nonblocking_wait_time = SHTC1_NONBLOCKING_WAIT_TIME_LPM; + } +} + +static int shtc1_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int ret; + char id_reg[2]; + struct shtc1_data *data; + struct device *hwmon_dev; + struct i2c_adapter *adap = client->adapter; + struct device *dev = &client->dev; + + if (!i2c_check_functionality(adap, I2C_FUNC_I2C)) { + dev_err(dev, "plain i2c transactions not supported\n"); + return -ENODEV; + } + + ret = i2c_master_send(client, shtc1_cmd_read_id_reg, SHTC1_CMD_LENGTH); + if (ret != SHTC1_CMD_LENGTH) { + dev_err(dev, "could not send read_id_reg command: %d\n", ret); + return ret < 0 ? 
ret : -ENODEV; + } + ret = i2c_master_recv(client, id_reg, sizeof(id_reg)); + if (ret != sizeof(id_reg)) { + dev_err(dev, "could not read ID register: %d\n", ret); + return -ENODEV; + } + if ((id_reg[1] & SHTC1_ID_REG_MASK) != SHTC1_ID) { + dev_err(dev, "ID register doesn't match\n"); + return -ENODEV; + } + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->setup.blocking_io = false; + data->setup.high_precision = true; + data->client = client; + + if (client->dev.platform_data) + data->setup = *(struct shtc1_platform_data *)dev->platform_data; + shtc1_select_command(data); + mutex_init(&data->update_lock); + + hwmon_dev = devm_hwmon_device_register_with_groups(dev, + client->name, + data, + shtc1_groups); + if (IS_ERR(hwmon_dev)) + dev_dbg(dev, "unable to register hwmon device\n"); + + return PTR_ERR_OR_ZERO(hwmon_dev); +} + +/* device ID table */ +static const struct i2c_device_id shtc1_id[] = { + { "shtc1", 0 }, + { "shtw1", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, shtc1_id); + +static struct i2c_driver shtc1_i2c_driver = { + .driver.name = "shtc1", + .probe = shtc1_probe, + .id_table = shtc1_id, +}; + +module_i2c_driver(shtc1_i2c_driver); + +MODULE_AUTHOR("Johannes Winkelmann "); +MODULE_DESCRIPTION("Sensirion SHTC1 humidity and temperature sensor driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/platform_data/shtc1.h b/include/linux/platform_data/shtc1.h new file mode 100644 index 000000000000..7b8c353f7dc8 --- /dev/null +++ b/include/linux/platform_data/shtc1.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2014 Sensirion AG, Switzerland + * Author: Johannes Winkelmann + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __SHTC1_H_ +#define __SHTC1_H_ + +struct shtc1_platform_data { + bool blocking_io; + bool high_precision; +}; +#endif /* __SHTC1_H_ */ -- cgit v1.2.3-71-gd317 From f3db22feb5de6b98b7bae924c2d4b6c8d65bedae Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 11 Jun 2014 11:51:35 -0600 Subject: NVMe: Fix hot CPU notification deadlock There is a potential deadlock if a CPU hotplug event occurs during nvme probe, since each device registered for hot CPU notifications from its probe routine. This fixes the race by having the module register for notifications once, outside of probe, rather than having each device register. The actual work is done in a scheduled work queue instead of in the notifier, since assigning IO queues has the potential to block if the driver creates additional queues. 
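Condensed from the diff below, the resulting pattern is: the notifier itself never blocks, it only schedules per-device work, and the potentially blocking queue assignment runs from the workqueue:

  static void nvme_cpu_workfn(struct work_struct *work)
  {
          struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work);

          if (dev->initialized)
                  nvme_assign_io_queues(dev);
  }

  static int nvme_cpu_notify(struct notifier_block *self,
                             unsigned long action, void *hcpu)
  {
          struct nvme_dev *dev;

          switch (action) {
          case CPU_ONLINE:
          case CPU_DEAD:
                  /* never block here; just kick the per-device work */
                  spin_lock(&dev_list_lock);
                  list_for_each_entry(dev, &dev_list, node)
                          schedule_work(&dev->cpu_work);
                  spin_unlock(&dev_list_lock);
                  break;
          }
          return NOTIFY_OK;
  }

The single notifier_block is registered once from nvme_init() and unregistered from nvme_exit(), so no individual device ever races its own registration against a hotplug event.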
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 35 +++++++++++++++++++++++++---------- include/linux/nvme.h | 2 +- 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2af079e571fc..e0ac1210fe31 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -73,6 +73,7 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; +static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); @@ -2115,14 +2116,25 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } +static void nvme_cpu_workfn(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); + if (dev->initialized) + nvme_assign_io_queues(dev); +} + static int nvme_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - struct nvme_dev *dev = container_of(self, struct nvme_dev, nb); + struct nvme_dev *dev; + switch (action) { case CPU_ONLINE: case CPU_DEAD: - nvme_assign_io_queues(dev); + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) + schedule_work(&dev->cpu_work); + spin_unlock(&dev_list_lock); break; } return NOTIFY_OK; @@ -2191,11 +2203,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_free_queues(dev, nr_io_queues + 1); nvme_assign_io_queues(dev); - dev->nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&dev->nb); - if (result) - goto free_queues; - return 0; free_queues: @@ -2495,8 +2502,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; dev->initialized = 0; - unregister_hotcpu_notifier(&dev->nb); - nvme_dev_list_remove(dev); if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { @@ -2767,6 +2772,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); + INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); @@ -2836,6 +2842,7 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); + flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); nvme_dev_remove(dev); nvme_dev_shutdown(dev); @@ -2923,11 +2930,18 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; - result = pci_register_driver(&nvme_driver); + nvme_nb.notifier_call = &nvme_cpu_notify; + result = register_hotcpu_notifier(&nvme_nb); if (result) goto unregister_blkdev; + + result = pci_register_driver(&nvme_driver); + if (result) + goto unregister_hotcpu; return 0; + unregister_hotcpu: + unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -2938,6 +2952,7 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); + unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8541dd920bb7..2bf403195c09 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -90,7 +90,7 @@ struct nvme_dev { struct 
miscdevice miscdev; work_func_t reset_workfn; struct work_struct reset_work; - struct notifier_block nb; + struct work_struct cpu_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3-71-gd317 From 4b28252cada3d0521ab59751f4240ecdfb9bba18 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 14 Jun 2014 23:23:52 -0700 Subject: net: Fix GSO constants to match NETIF flags Joseph Gasparakis reported that VXLAN GSO offload stopped working with i40e device after recent UDP changes. The problem is that the SKB_GSO_* bits are out of sync with the corresponding NETIF flags. This patch fixes that. Also, we add BUILD_BUG_ONs in net_gso_ok for several GSO constants that were missing to avoid the problem in the future. Reported-by: Joseph Gasparakis Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 1 + include/linux/netdevice.h | 7 +++++++ include/linux/skbuff.h | 11 ++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index e5a589435e2b..d99800cbdcf3 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -117,6 +117,7 @@ enum { #define NETIF_F_GSO_IPIP __NETIF_F(GSO_IPIP) #define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) +#define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) #define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index abe3de1db932..66f9a04ec270 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3305,6 +3305,13 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_IPIP != (NETIF_F_GSO_IPIP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_MPLS != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5b5cd3189c98..e13ed90be7c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -338,17 +338,18 @@ enum { SKB_GSO_GRE = 1 << 6, - SKB_GSO_IPIP = 1 << 7, + SKB_GSO_GRE_CSUM = 1 << 7, - SKB_GSO_SIT = 1 << 8, + SKB_GSO_IPIP = 1 << 8, - SKB_GSO_UDP_TUNNEL = 1 << 9, + SKB_GSO_SIT = 1 << 9, - SKB_GSO_MPLS = 1 << 10, + SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, - SKB_GSO_GRE_CSUM = 1 << 12, + SKB_GSO_MPLS = 1 << 12, + }; #if BITS_PER_LONG > 32 -- cgit v1.2.3-71-gd317 From e5eb4e30a51236079fb22bb9f75fcd31915b03c6 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 14 Jun 2014 23:24:28 -0700 Subject: net: add skb_pop_rcv_encapsulation This function is used by UDP encapsulation protocols in RX when crossing encapsulation boundary. 
If ip_summed is set to CHECKSUM_UNNECESSARY and encapsulation is not set, change to CHECKSUM_NONE since the checksum has not been validated within the encapsulation. Clear csum_valid for the same reason. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e13ed90be7c2..ec89301ada41 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1854,6 +1854,18 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) return pskb_may_pull(skb, skb_network_offset(skb) + len); } +static inline void skb_pop_rcv_encapsulation(struct sk_buff *skb) +{ + /* Only continue with checksum unnecessary if device indicated + * it is valid across encapsulation (skb->encapsulation was set). + */ + if (skb->ip_summed == CHECKSUM_UNNECESSARY && !skb->encapsulation) + skb->ip_summed = CHECKSUM_NONE; + + skb->encapsulation = 0; + skb->csum_valid = 0; +} + /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the -- cgit v1.2.3-71-gd317 From a6e15a39048ec3229b9a53425f4384f55f6cc1b3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Jun 2014 13:30:35 -0700 Subject: PM / hibernate: introduce "nohibernate" boot parameter To support using kernel features that are not compatible with hibernation, this creates the "nohibernate" kernel boot parameter to disable both hibernation and resume. This allows hibernation support to be a boot-time choice instead of only a compile-time choice. Signed-off-by: Kees Cook Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 3 +++ include/linux/suspend.h | 2 ++ kernel/power/hibernate.c | 31 ++++++++++++++++++++++++++++++- kernel/power/main.c | 6 ++---- kernel/power/user.c | 3 +++ 5 files changed, 40 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6eaa9cdb7094..f8f0466b8b1d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2184,6 +2184,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. in certain environments such as networked servers or real-time systems. + nohibernate [HIBERNATION] Disable hibernation and resume. + nohz= [KNL] Boottime enable/disable dynamic ticks Valid arguments: on, off Default: on @@ -2980,6 +2982,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. noresume Don't check if there's a hibernation image present during boot. nocompress Don't compress/decompress hibernation images. + no Disable hibernation and resume. 
retain_initrd [RAM] Keep initrd memory after extraction diff --git a/include/linux/suspend.h b/include/linux/suspend.h index f76994b9396c..519064e0c943 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -327,6 +327,7 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); +extern bool hibernation_available(void); asmlinkage int swsusp_save(void); extern struct pbe *restore_pblist; #else /* CONFIG_HIBERNATION */ @@ -339,6 +340,7 @@ static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } +static inline bool hibernation_available(void) { return false; } #endif /* CONFIG_HIBERNATION */ /* Hibernation and suspend events */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 49e0a20fd010..258f492f0347 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -35,6 +35,7 @@ static int nocompress; static int noresume; +static int nohibernate; static int resume_wait; static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; @@ -62,6 +63,11 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +bool hibernation_available(void) +{ + return (nohibernate == 0); +} + /** * hibernation_set_ops - Set the global hibernate operations. * @ops: Hibernation operations to use in subsequent hibernation transitions. @@ -642,6 +648,11 @@ int hibernate(void) { int error; + if (!hibernation_available()) { + pr_debug("PM: Hibernation not available.\n"); + return -EPERM; + } + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { @@ -734,7 +745,7 @@ static int software_resume(void) /* * If the user said "noresume".. bail out early. */ - if (noresume) + if (noresume || !hibernation_available()) return 0; /* @@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, int i; char *start = buf; + if (!hibernation_available()) + return sprintf(buf, "[disabled]\n"); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) continue; @@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, char *p; int mode = HIBERNATION_INVALID; + if (!hibernation_available()) + return -EPERM; + p = memchr(buf, '\n', n); len = p ? 
p - buf : n; @@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str) noresume = 1; else if (!strncmp(str, "nocompress", 10)) nocompress = 1; + else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } return 1; } @@ -1125,9 +1146,17 @@ static int __init resumedelay_setup(char *str) return 1; } +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 573410d6647e..8e90f330f139 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -300,13 +300,11 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, s += sprintf(s,"%s ", pm_states[i].label); #endif -#ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); -#else + if (hibernation_available()) + s += sprintf(s, "disk "); if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; -#endif return (s - buf); } diff --git a/kernel/power/user.c b/kernel/power/user.c index 98d357584cd6..526e8911460a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; + if (!hibernation_available()) + return -EPERM; + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { -- cgit v1.2.3-71-gd317 From 736ed4de766d4f0e8e6142dd4f9d73ef61835ed9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jun 2014 22:09:29 -0700 Subject: block: blk_max_size_offset() should check ->max_sectors Commit 762380ad9322 inadvertently changed a check for max_sectors to max_hw_sectors. Revert that part, so we still compare against max_sectors. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31e11051f1ba..713f8b62b435 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,7 +920,7 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, sector_t offset) { if (!q->limits.chunk_sectors) - return q->limits.max_hw_sectors; + return q->limits.max_sectors; return q->limits.chunk_sectors - (offset & (q->limits.chunk_sectors - 1)); -- cgit v1.2.3-71-gd317 From 8537b12034cf1fd3fab3da2c859d71f76846fae9 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 17 Jun 2014 22:12:35 -0700 Subject: blk-mq: bitmap tag: fix races on shared ::wake_index fields Fix racy updates of shared blk_mq_bitmap_tags::wake_index and blk_mq_hw_ctx::wake_index fields. 
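The race: bt_index_inc() used to do a plain read-modify-write on an index that several CPUs update concurrently, so two racing updaters could observe the same old value. The fix makes the field an atomic_t and advances it with a cmpxchg; a losing updater simply keeps the winner's value, which is fine since any advance of the ring index is acceptable. A small userspace analog of the idea, using C11 atomics in place of the kernel's atomic_t (the constants mirror the patch, the test harness is purely illustrative):

  #include <stdatomic.h>
  #include <stdio.h>

  #define BT_WAIT_QUEUES 8

  static int bt_index_inc(int index)
  {
          return (index + 1) & (BT_WAIT_QUEUES - 1);
  }

  static void bt_index_atomic_inc(atomic_int *index)
  {
          int old = atomic_load(index);
          int new = bt_index_inc(old);

          /* a failed exchange means another updater already advanced
           * the index; keeping the winner's value is harmless here */
          atomic_compare_exchange_strong(index, &old, new);
  }

  int main(void)
  {
          atomic_int wake_index;
          int i;

          atomic_init(&wake_index, 0);
          for (i = 0; i < 10; i++)
                  bt_index_atomic_inc(&wake_index);
          printf("wake_index = %d\n", atomic_load(&wake_index)); /* 10 mod 8 = 2 */
          return 0;
  }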
Cc: Ming Lei Signed-off-by: Alexander Gordeev Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 32 +++++++++++++++++++++----------- block/blk-mq-tag.h | 2 +- include/linux/blk-mq.h | 2 +- 3 files changed, 23 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 1aab39f71d95..6deb13055490 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -43,9 +43,16 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) return bt_has_free_tags(&tags->bitmap_tags); } -static inline void bt_index_inc(unsigned int *index) +static inline int bt_index_inc(int index) { - *index = (*index + 1) & (BT_WAIT_QUEUES - 1); + return (index + 1) & (BT_WAIT_QUEUES - 1); +} + +static inline void bt_index_atomic_inc(atomic_t *index) +{ + int old = atomic_read(index); + int new = bt_index_inc(old); + atomic_cmpxchg(index, old, new); } /* @@ -69,14 +76,14 @@ static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) int i, wake_index; bt = &tags->bitmap_tags; - wake_index = bt->wake_index; + wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { struct bt_wait_state *bs = &bt->bs[wake_index]; if (waitqueue_active(&bs->wait)) wake_up(&bs->wait); - bt_index_inc(&wake_index); + wake_index = bt_index_inc(wake_index); } } @@ -212,12 +219,14 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx) { struct bt_wait_state *bs; + int wait_index; if (!hctx) return &bt->bs[0]; - bs = &bt->bs[hctx->wait_index]; - bt_index_inc(&hctx->wait_index); + wait_index = atomic_read(&hctx->wait_index); + bs = &bt->bs[wait_index]; + bt_index_atomic_inc(&hctx->wait_index); return bs; } @@ -313,18 +322,19 @@ static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) { int i, wake_index; - wake_index = bt->wake_index; + wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { struct bt_wait_state *bs = &bt->bs[wake_index]; if (waitqueue_active(&bs->wait)) { - if (wake_index != bt->wake_index) - bt->wake_index = wake_index; + int o = atomic_read(&bt->wake_index); + if (wake_index != o) + atomic_cmpxchg(&bt->wake_index, o, wake_index); return bs; } - bt_index_inc(&wake_index); + wake_index = bt_index_inc(wake_index); } return NULL; @@ -344,7 +354,7 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) bs = bt_wake_ptr(bt); if (bs && atomic_dec_and_test(&bs->wait_cnt)) { atomic_set(&bs->wait_cnt, bt->wake_cnt); - bt_index_inc(&bt->wake_index); + bt_index_atomic_inc(&bt->wake_index); wake_up(&bs->wait); } } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 98696a65d4d4..6206ed17ef76 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -24,7 +24,7 @@ struct blk_mq_bitmap_tags { unsigned int map_nr; struct blk_align_bitmap *map; - unsigned int wake_index; + atomic_t wake_index; struct bt_wait_state *bs; }; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a002cf191427..eb726b9c5762 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -42,7 +42,7 @@ struct blk_mq_hw_ctx { unsigned int nr_ctx; struct blk_mq_ctx **ctxs; - unsigned int wait_index; + atomic_t wait_index; struct blk_mq_tags *tags; -- cgit v1.2.3-71-gd317 From 2b8f2a28eac1d35a432705d269f02bdaeba9be8f Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 18 Jun 2014 11:01:41 +0200 Subject: net: phylib: add link_change_notify callback to phy device Add a notify callback to inform phy drivers when the core is about to do its link adjustment. 
No change for drivers that do not implement this callback. Signed-off-by: Daniel Mack Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 3 +++ include/linux/phy.h | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 3bc079a67a3d..f7c61812ea4a 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -720,6 +720,9 @@ void phy_state_machine(struct work_struct *work) mutex_lock(&phydev->lock); + if (phydev->drv->link_change_notify) + phydev->drv->link_change_notify(phydev); + switch (phydev->state) { case PHY_DOWN: case PHY_STARTING: diff --git a/include/linux/phy.h b/include/linux/phy.h index 864ddafad8cc..68041446c450 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -536,6 +536,15 @@ struct phy_driver { /* See set_wol, but for checking whether Wake on LAN is enabled. */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); + /* + * Called to inform a PHY device driver when the core is about to + * change the link state. This callback is supposed to be used as + * a fixup hook for drivers that need to take action when the link + * state changes. Drivers are by no means allowed to mess with the + * PHY device structure in their implementations. + */ + void (*link_change_notify)(struct phy_device *dev); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) -- cgit v1.2.3-71-gd317 From e567bf7112518824830978d644dfb5a991e67d54 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Jun 2014 16:32:48 -0600 Subject: Revert "block: add __init to elv_register" This reverts commit b5097e956a4d2919ee248d6481e4204c5568ed5c. The original commit is buggy: we do use the registration functions at runtime, for instance when loading IO schedulers through sysfs. Reported-by: Damien Wyart --- block/elevator.c | 2 +- include/linux/elevator.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/elevator.c b/block/elevator.c index 34bded18910e..24c28b659bb3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -825,7 +825,7 @@ void elv_unregister_queue(struct request_queue *q) } EXPORT_SYMBOL(elv_unregister_queue); -int __init elv_register(struct elevator_type *e) +int elv_register(struct elevator_type *e) { char *def = ""; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index e2a6bd7fb133..45a91474487d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -143,7 +143,7 @@ extern void elv_drain_elevator(struct request_queue *); * io scheduler registration */ extern void __init load_default_elevator_module(void); -extern int __init elv_register(struct elevator_type *); +extern int elv_register(struct elevator_type *); extern void elv_unregister(struct elevator_type *); /* -- cgit v1.2.3-71-gd317 From b3acc56bfe1287c6b666e80edc70b89eea2a1a80 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: kexec: save PG_head_mask in VMCOREINFO To allow filtering of huge pages, makedumpfile must be able to identify them in the dump. This can be done by checking the appropriate page flag, so communicate its value to makedumpfile through the VMCOREINFO interface. There's only one small catch. Depending on how many page flags are available on a given architecture, this bit can be called PG_head or PG_compound. I sent a similar patch back in 2012, but Eric Biederman did not like using an #ifdef. 
So, this time I'm adding a common symbol (PG_head_mask) instead. See https://lkml.org/lkml/2012/11/28/91 for the previous version. Signed-off-by: Petr Tesarik Acked-by: Vivek Goyal Cc: Eric Biederman Cc: Paul Mackerras Cc: Fengguang Wu Cc: Benjamin Herrenschmidt Cc: Shaohua Li Cc: Alexey Kardashevskiy Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 +++ kernel/kexec.c | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3c545b48aeab..8304959ad336 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -360,6 +360,9 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } #endif + +#define PG_head_mask ((1L << PG_head)) + #else /* * Reduce page flag use as much as possible by overlapping diff --git a/kernel/kexec.c b/kernel/kexec.c index 6748688813d0..369f41a94124 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif + VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); -- cgit v1.2.3-71-gd317 From f3aca3d09525f87731ba6b892c9b010570bc54b4 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: nmi: provide the option to issue an NMI back trace to every cpu but current Sometimes it is preferable not to use the trigger_all_cpu_backtrace() routine, for instance when one wants to avoid capturing a back trace for current because one was already captured recently. This patch provides a new routine, trigger_allbutself_cpu_backtrace(), which offers the flexibility to issue an NMI to every cpu but current and capture a back trace accordingly. x86 and sparc are patched to support the new routine. [dzickus@redhat.com: add stub in #else clause] [dzickus@redhat.com: don't print message in single processor case, wrap with get/put_cpu based on Oleg's suggestion] [sfr@canb.auug.org.au: undo C99ism] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Acked-by: David S. 
Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/irq_64.h | 2 +- arch/sparc/kernel/process_64.c | 18 ++++++++++++------ arch/x86/include/asm/irq.h | 2 +- arch/x86/kernel/apic/hw_nmi.c | 18 ++++++++++++++---- include/linux/nmi.h | 11 ++++++++++- 5 files changed, 38 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 375cffcf7dbd..91d219381306 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -89,7 +89,7 @@ static inline unsigned long get_softint(void) return retval; } -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace extern void *hardirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index b2988f25e230..027e09986194 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -251,16 +251,22 @@ void arch_trigger_all_cpu_backtrace(void) spin_lock_irqsave(&global_cpu_snapshot_lock, flags); - memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); - this_cpu = raw_smp_processor_id(); - __global_reg_self(tp, regs, this_cpu); + memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); + + if (include_self) + __global_reg_self(tp, regs, this_cpu); smp_fetch_global_regs(); for_each_online_cpu(cpu) { - struct global_reg_snapshot *gp = &global_cpu_snapshot[cpu].reg; + struct global_reg_snapshot *gp; + + if (!include_self && cpu == this_cpu) + continue; + + gp = &global_cpu_snapshot[cpu].reg; __global_reg_poll(gp); @@ -292,7 +298,7 @@ void arch_trigger_all_cpu_backtrace(void) static void sysrq_handle_globreg(int key) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index cb6cfcd034cf..a80cbb88ea91 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c3fcb5de5083..6a1e71bde323 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { int i; + int cpu = get_cpu(); - if (test_and_set_bit(0, &backtrace_flag)) + if (test_and_set_bit(0, &backtrace_flag)) { /* * If there is already a trigger_all_cpu_backtrace() in progress * (backtrace_flag == 1), don't output double cpu dump infos. 
*/ + put_cpu(); return; + } cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); + touch_softlockup_watchdog(); } clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); + put_cpu(); } static int diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 6a45fb583ff1..a17ab6398d7c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void) #ifdef arch_trigger_all_cpu_backtrace static inline bool trigger_all_cpu_backtrace(void) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); return true; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + arch_trigger_all_cpu_backtrace(false); + return true; +} #else static inline bool trigger_all_cpu_backtrace(void) { return false; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + return false; +} #endif #ifdef CONFIG_LOCKUP_DETECTOR -- cgit v1.2.3-71-gd317 From ed235875e2ca983197831337a986f0517074e1a0 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: kernel/watchdog.c: print traces for all cpus on lockup detection A 'softlockup' is defined as a bug that causes the kernel to loop in kernel mode for more than a predefined period of time, without giving other tasks a chance to run. Currently, upon detection of this condition by the per-cpu watchdog task, debug information (including a stack trace) is sent to the system log. On some occasions, we have observed that the "victim" rather than the actual "culprit" (i.e. the owner/holder of the contended resource) is reported to the user. Often this information has proven to be insufficient to assist debugging efforts. To avoid loss of useful debug information, for architectures which support NMI, this patch makes it possible to improve soft lockup reporting. This is accomplished by issuing an NMI to each cpu to obtain a stack trace. If NMI is not supported, we simply fall back to the old method. A sysctl and a boot-time parameter are available to toggle this feature. [dzickus@redhat.com: add CONFIG_SMP in certain areas] [akpm@linux-foundation.org: additional CONFIG_SMP=n optimisations] [mq@suse.cz: fix warning] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Cc: David S. Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Jan Moskyto Matejka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++++ Documentation/sysctl/kernel.txt | 17 ++++++++++++++++ include/linux/nmi.h | 1 + kernel/sysctl.c | 11 +++++++++++ kernel/watchdog.c | 39 +++++++++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 884904975d0b..c1b9aa8c5a52 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3130,6 +3130,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 
[KNL] Should the soft-lockup detector generate panics. Format: + softlockup_all_cpu_backtrace= + [KNL] Should the soft-lockup detector generate + backtraces on all cpus. + Format: + sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 708bb7f1b7e0..c14374e71775 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -75,6 +75,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- softlockup_all_cpu_backtrace - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict @@ -783,6 +784,22 @@ via the /proc/sys interface: ============================================================== +softlockup_all_cpu_backtrace: + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a17ab6398d7c..447775ee2c4b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -57,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_user_enabled; extern int watchdog_thresh; +extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 075d1903138f..75b22e22a72c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -860,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 30e482240dae..c3319bd1b040 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be 
triggered after just a few seconds. Soft- @@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit v1.2.3-71-gd317 From 3a4b0eda8e4b27e6aca86f9f4d327c1070815e30 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 24 Jun 2014 18:10:26 +0800 Subject: bio: remove unused macro bip_vec_idx() Macro bip_vec_idx() was used by bio integrity originally, but it is no longer used. So remove it. Signed-off-by: Gu Zheng Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 5a645769f020..f91decbca96b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -644,10 +644,6 @@ struct biovec_slab { #if defined(CONFIG_BLK_DEV_INTEGRITY) - - -#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) - #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) -- cgit v1.2.3-71-gd317 From 66cb45aa41315d1d9972cada354fbdf7870d7714 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Jun 2014 16:22:24 -0600 Subject: block: add support for limiting gaps in SG lists Another restriction inherited from NVMe - those devices don't support SG lists that have "gaps" in them. Gaps refers to cases where the previous SG entry doesn't end on a page boundary. For NVMe, all SG entries must start at offset 0 (except the first) and end on a page boundary (except the last). Signed-off-by: Jens Axboe --- block/bio.c | 8 ++++++++ block/blk-merge.c | 10 ++++++++++ include/linux/bio.h | 9 +++++++++ include/linux/blkdev.h | 1 + 4 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 8c2e55e39a1b..0ec61c9e536c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -746,6 +746,14 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page goto done; } + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. 
+ */ + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && + bvec_gap_to_prev(prev, offset)) + return 0; } if (bio->bi_vcnt >= bio->bi_max_vecs) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df0f4c2..54535831f1e1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -568,6 +568,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_rq_merge_ok(struct request *rq, struct bio *bio) { + struct request_queue *q = rq->q; + if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; @@ -591,6 +593,14 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { + struct bio_vec *bprev; + + bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; + if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) + return false; + } + return true; } diff --git a/include/linux/bio.h b/include/linux/bio.h index f91decbca96b..d2633ee099d9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -186,6 +186,15 @@ static inline void *bio_data(struct bio *bio) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q))) +/* + * Check if adding a bio_vec after bprv with offset would create a gap in + * the SG list. Most drivers don't care about this, but some do. + */ +static inline bool bvec_gap_to_prev(struct bio_vec *bprv, unsigned int offset) +{ + return offset || ((bprv->bv_offset + bprv->bv_len) & (PAGE_SIZE - 1)); +} + #define bio_io_error(bio) bio_endio((bio), -EIO) /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 713f8b62b435..8699bcf5f099 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -512,6 +512,7 @@ struct request_queue { #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ +#define QUEUE_FLAG_SG_GAPS 22 /* queue doesn't support SG gaps */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3-71-gd317 From 0b86dbf675e0170a191a9ca18e5e99fd39a678c0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 23 Jun 2014 08:44:40 +0100 Subject: Fix 32-bit regression in block device read(2) blkdev_read_iter() wants to cap the iov_iter by the amount of data remaining to the end of the device. That's what iov_iter_truncate() is for (trim iter->count if it's above the given limit). So far, so good, but the argument of iov_iter_truncate() is size_t, so on 32bit boxen (in case of a large device) we end up with that upper limit truncated down to 32 bits *before* comparing it with iter->count. Easily fixed by making iov_iter_truncate() take a 64bit argument - it does the right thing after such a change (we only reach the assignment in there when the current value of iter->count is greater than the limit, i.e. for anything that would get truncated we don't reach the assignment at all) and that argument is not the new value of iter->count - it's an upper limit for such. The overhead of passing u64 is not an issue - the thing is inlined, so callers passing size_t won't pay any penalty.
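To make the truncation concrete, here is a minimal userspace sketch of the before/after behavior. This is illustrative code, not from the kernel tree; uint32_t merely stands in for size_t on a 32-bit build:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for size_t on a 32-bit box. */
typedef uint32_t size32_t;

/* Old signature: the limit is silently truncated at the call boundary. */
static void truncate_old(uint64_t *count, size32_t limit)
{
	if (*count > limit)
		*count = limit;
}

/* New signature: the limit arrives intact as a 64-bit value. */
static void truncate_new(uint64_t *count, uint64_t limit)
{
	if (*count > limit)
		*count = limit;
}

int main(void)
{
	uint64_t remaining = 1ULL << 32;	/* 4 GiB left on the device */
	uint64_t count = 65536;			/* bytes the caller asked for */

	truncate_old(&count, (size32_t)remaining);
	printf("old: count = %llu\n", (unsigned long long)count);	/* 0 - read capped to nothing */

	count = 65536;
	truncate_new(&count, remaining);
	printf("new: count = %llu\n", (unsigned long long)count);	/* 65536 - limit left it alone */
	return 0;
}

With the old signature the 4 GiB limit wraps to 0 and a perfectly valid read gets capped to nothing; with the u64 limit the comparison sees the real value and leaves iter->count alone.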
Reported-and-tested-by: Theodore Tso Signed-off-by: Al Viro Tested-by: Alan Cox Tested-by: Bruno Wolff III Signed-off-by: Linus Torvalds --- include/linux/uio.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index e2231e47cec1..d54985e0705e 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -94,8 +94,20 @@ static inline size_t iov_iter_count(struct iov_iter *i) return i->count; } -static inline void iov_iter_truncate(struct iov_iter *i, size_t count) +/* + * Cap the iov_iter by given limit; note that the second argument is + * *not* the new size - it's upper limit for such. Passing it a value + * greater than the amount of data in iov_iter is fine - it'll just do + * nothing in that case. + */ +static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { + /* + * count doesn't have to fit in size_t - comparison extends both + * operands to u64 here and any value that would be truncated by + * conversion in assignment is by definition greater than all + * values of size_t, including old i->count. + */ if (i->count > count) i->count = count; } -- cgit v1.2.3-71-gd317 From ac5ccdba3a1659b3517e7e99ef7d35a6a2d77cf4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Jun 2014 21:22:56 +0300 Subject: iovec: move memcpy_from/toiovecend to lib/iovec.c ERROR: "memcpy_fromiovecend" [drivers/vhost/vhost_scsi.ko] undefined! commit 9f977ef7b671f6169eca78bf40f230fe84b7c7e5 vhost-scsi: Include prot_bytes into expected data transfer length in target-pending makes drivers/vhost/scsi.c call memcpy_fromiovecend(). This function is not available when CONFIG_NET is not enabled. socket.h already includes uio.h, so no callers need updating. Reported-by: Randy Dunlap Cc: Stephen Rothwell Cc: "David S. Miller" Signed-off-by: David S. Miller Signed-off-by: Michael S.
Tsirkin Signed-off-by: Nicholas Bellinger --- include/linux/socket.h | 4 ---- include/linux/uio.h | 5 ++++- lib/iovec.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/core/iovec.c | 55 -------------------------------------------------- 4 files changed, 59 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 8e98297f1388..ec538fc287a6 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -305,8 +305,6 @@ struct ucred { /* IPX options */ #define IPX_TYPE 1 -extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len); extern int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, @@ -315,8 +313,6 @@ extern unsigned long iov_pages(const struct iovec *iov, int offset, unsigned long nr_segs); extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode); -extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, - int offset, int len); extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); diff --git a/include/linux/uio.h b/include/linux/uio.h index e2231e47cec1..04c8c4bb4927 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -111,6 +111,9 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len); - +int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len); +int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, + int offset, int len); #endif diff --git a/lib/iovec.c b/lib/iovec.c index 454baa88bf27..7a7c2da4cddf 100644 --- a/lib/iovec.c +++ b/lib/iovec.c @@ -51,3 +51,58 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) return 0; } EXPORT_SYMBOL(memcpy_toiovec); + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + */ + +int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, + int offset, int len) +{ + int copy; + for (; len > 0; ++iov) { + /* Skip over the finished iovecs */ + if (unlikely(offset >= iov->iov_len)) { + offset -= iov->iov_len; + continue; + } + copy = min_t(unsigned int, iov->iov_len - offset, len); + if (copy_to_user(iov->iov_base + offset, kdata, copy)) + return -EFAULT; + offset = 0; + kdata += copy; + len -= copy; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_toiovecend); + +/* + * Copy iovec to kernel. Returns -EFAULT on error. + */ + +int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len) +{ + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + if (copy_from_user(kdata, base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov++; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_fromiovecend); diff --git a/net/core/iovec.c b/net/core/iovec.c index b61869429f4c..827dd6beb49c 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -74,61 +74,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a return err; } -/* - * Copy kernel to iovec. Returns -EFAULT on error. 
- */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - * Copy iovec to kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - /* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram -- cgit v1.2.3-71-gd317 From 4e26445faad366d67d7723622bf6a60a6f0f5993 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:50:28 +0800 Subject: kernfs: introduce kernfs_pin_sb() kernfs_pin_sb() tries to get a refcnt of the superblock. This will be used by cgroupfs. v2: - make kernfs_pin_sb() return the superblock. - drop kernfs_drop_sb(). tj: Updated the comment a bit. [ This is a prerequisite for a bugfix. ] Cc: # 3.15 Acked-by: Greg Kroah-Hartman Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- fs/kernfs/mount.c | 30 ++++++++++++++++++++++++++++++ include/linux/kernfs.h | 1 + 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d171b98a6cdd..f973ae9b05f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb) kernfs_put(root_kn); } +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @kernfs_root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so it won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated with this kernfs_root, or + * -EINVAL if the superblock is being freed.
+ */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 17aa1cce6f8e..20f493564917 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -304,6 +304,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, struct kernfs_root *root, unsigned long magic, bool *new_sb_created, const void *ns); void kernfs_kill_sb(struct super_block *sb); +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); void kernfs_init(void); -- cgit v1.2.3-71-gd317 From b14bf2d0c0358140041d1c1805a674376964d0e0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 30 Jun 2014 11:04:21 -0400 Subject: usb-storage/SCSI: Add broken_fua blacklist flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some buggy JMicron USB-ATA bridges don't know how to translate the FUA bit in READs or WRITEs. This patch adds an entry in unusual_devs.h and a blacklist flag to tell the sd driver not to use FUA. Signed-off-by: Alan Stern Reported-by: Michael Büsch Tested-by: Michael Büsch Acked-by: James Bottomley CC: Matthew Dharm CC: Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sd.c | 5 ++++- drivers/usb/storage/scsiglue.c | 4 ++++ drivers/usb/storage/unusual_devs.h | 7 +++++++ include/linux/usb_usual.h | 4 +++- include/scsi/scsi_device.h | 1 + 5 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e9689d57ccb6..6825eda1114a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2441,7 +2441,10 @@ sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer) } sdkp->DPOFUA = (data.device_specific & 0x10) != 0; - if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw) { + if (sdp->broken_fua) { + sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n"); + sdkp->DPOFUA = 0; + } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw) { sd_first_printk(KERN_NOTICE, sdkp, "Uses READ/WRITE(6), disabling FUA\n"); sdkp->DPOFUA = 0; diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index 9d38ddc8da49..866b5df36ed1 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -256,6 +256,10 @@ static int slave_configure(struct scsi_device *sdev) if (us->fflags & US_FL_WRITE_CACHE) sdev->wce_default_on = 1; + /* A few buggy USB-ATA bridges don't understand FUA */ + if (us->fflags & US_FL_BROKEN_FUA) + sdev->broken_fua = 1; + } else { /* Non-disk-type devices don't need to blacklist any pages diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 174a447868cd..80a5b366255f 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1936,6 +1936,13 @@ UNUSUAL_DEV( 0x14cd, 0x6600, 0x0201, 0x0201, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_IGNORE_RESIDUE ), +/* Reported by Michael Büsch */ +UNUSUAL_DEV( 0x152d, 0x0567, 0x0114, 0x0114, + "JMicron", + "USB to ATA/ATAPI Bridge", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_BROKEN_FUA ), + /* Reported by 
Alexandre Oliva * JMicron responds to USN and several other SCSI ioctls with a * residue that causes subsequent I/O requests to fail. */ diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h index 1a64b26046ed..9b7de1b46437 100644 --- a/include/linux/usb_usual.h +++ b/include/linux/usb_usual.h @@ -70,7 +70,9 @@ US_FLAG(NEEDS_CAP16, 0x00400000) \ /* cannot handle READ_CAPACITY_10 */ \ US_FLAG(IGNORE_UAS, 0x00800000) \ - /* Device advertises UAS but it is broken */ + /* Device advertises UAS but it is broken */ \ + US_FLAG(BROKEN_FUA, 0x01000000) \ + /* Cannot handle FUA in WRITE or READ CDBs */ \ #define US_FLAG(name, value) US_FL_##name = value , enum { US_DO_ALL_FLAGS }; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 5853c913d2b0..27ab31017f09 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -173,6 +173,7 @@ struct scsi_device { unsigned is_visible:1; /* is the device visible in sysfs */ unsigned wce_default_on:1; /* Cache is ON by default */ unsigned no_dif:1; /* T10 PI (DIF) should be disabled */ + unsigned broken_fua:1; /* Don't set FUA bit */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ -- cgit v1.2.3-71-gd317 From 330d282216d6e4d845a21b72572dc4df4122e8fa Mon Sep 17 00:00:00 2001 From: Zhengyu He Date: Tue, 1 Jul 2014 12:11:47 -0700 Subject: core: fix typo in percpu read_mostly section This fixes a typo that named the read_mostly section of percpu as readmostly. It works fine with SMP because the linker script specifies .data..percpu..readmostly. However, UP kernel builds don't have percpu sections defined and the non-percpu version of the section is called data..read_mostly, so .data..readmostly will float around and may break things unexpectedly. Looking at the original change that introduced data..percpu..readmostly (commit c957ef2c59e952803766ddc22e89981ab534606f), it looks like this was the original intention. Tested: Built UP kernel and confirmed the sections got merged. - Before the patch: $ objdump -h vmlinux.o | grep '\.data\.\.read.*mostly' 38 .data..read_mostly 00004418 0000000000000000 0000000000000000 00431ac0 2**6 50 .data..readmostly 00000014 0000000000000000 0000000000000000 00444000 2**3 - After the patch: $ objdump -h vmlinux.o | grep '\.data\.\.read.*mostly' 38 .data..read_mostly 00004438 0000000000000000 0000000000000000 00431ac0 2**6 Signed-off-by: Zhengyu He Signed-off-by: Filipe Brandenburger Signed-off-by: Tejun Heo --- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/percpu-defs.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 471ba48c7ae4..c1c0b0cf39b4 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -693,7 +693,7 @@ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ - *(.data..percpu..readmostly) \ + *(.data..percpu..read_mostly) \ . = ALIGN(cacheline); \ *(.data..percpu) \ *(.data..percpu..shared_aligned) \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index a5fc7d01aad6..dec01d6c3f80 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -146,10 +146,10 @@ * Declaration/definition used for per-CPU variables that must be read mostly. 
*/ #define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, "..readmostly") + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, "..readmostly") + DEFINE_PER_CPU_SECTION(type, name, "..read_mostly") /* * Intermodule exports for per-CPU variables. sparse forgets about -- cgit v1.2.3-71-gd317 From ecca47ce8294843045e7465d76fee84dbf07a004 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Jul 2014 16:41:03 -0400 Subject: kernfs: kernfs_notify() must be usable from non-sleepable contexts d911d9874801 ("kernfs: make kernfs_notify() trigger inotify events too") added fsnotify triggering to kernfs_notify() which requires a sleepable context. There are already existing users of kernfs_notify() which invoke it from an atomic context and in general it's silly to require a sleepable context for triggering a notification. The following is an invalid context bug triggered by md invoking sysfs_notify() from the IO completion path. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1 2 locks held by swapper/1/0: #0: (&(&vblk->vq_lock)->rlock){-.-...}, at: [] virtblk_done+0x42/0xe0 [virtio_blk] #1: (&(&bitmap->counts.lock)->rlock){-.....}, at: [] bitmap_endwrite+0x68/0x240 irq event stamp: 33518 hardirqs last enabled at (33515): [] default_idle+0x1f/0x230 hardirqs last disabled at (33516): [] common_interrupt+0x6d/0x72 softirqs last enabled at (33518): [] _local_bh_enable+0x22/0x50 softirqs last disabled at (33517): [] irq_enter+0x60/0x80 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.16.0-0.rc2.git2.1.fc21.x86_64 #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000000 f90db13964f4ee05 ffff88007d403b80 ffffffff81807b4c 0000000000000000 ffff88007d403ba8 ffffffff810d4f14 0000000000000000 0000000000441800 ffff880078fa1780 ffff88007d403c38 ffffffff8180caf2 Call Trace: [] dump_stack+0x4d/0x66 [] __might_sleep+0x184/0x240 [] mutex_lock_nested+0x42/0x440 [] kernfs_notify+0x90/0x150 [] bitmap_endwrite+0xcc/0x240 [] close_write+0x93/0xb0 [raid1] [] r1_bio_write_done+0x29/0x50 [raid1] [] raid1_end_write_request+0xe4/0x260 [raid1] [] bio_endio+0x6b/0xa0 [] blk_update_request+0x94/0x420 [] blk_mq_end_io+0x1a/0x70 [] virtblk_request_done+0x32/0x80 [virtio_blk] [] __blk_mq_complete_request+0x88/0x120 [] blk_mq_complete_request+0x2a/0x30 [] virtblk_done+0x66/0xe0 [virtio_blk] [] vring_interrupt+0x3a/0xa0 [virtio_ring] [] handle_irq_event_percpu+0x77/0x340 [] handle_irq_event+0x3d/0x60 [] handle_edge_irq+0x66/0x130 [] handle_irq+0x84/0x150 [] do_IRQ+0x4d/0xe0 [] common_interrupt+0x72/0x72 [] ? native_safe_halt+0x6/0x10 [] default_idle+0x24/0x230 [] arch_cpu_idle+0xf/0x20 [] cpu_startup_entry+0x37c/0x7b0 [] start_secondary+0x25b/0x300 This patch fixes it by punting the notification delivery through a work item. This ends up adding an extra pointer to kernfs_elem_attr, enlarging kernfs_node by a pointer, which is not ideal but not a very big deal either. If this turns out to be an actual issue, we can move kernfs_elem_attr->size to kernfs_node->iattr later. Signed-off-by: Tejun Heo Reported-by: Josh Boyer Cc: Jens Axboe Reviewed-by: Michael S.
Tsirkin Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 69 ++++++++++++++++++++++++++++++++++++++++---------- include/linux/kernfs.h | 1 + 2 files changed, 56 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e3d37f607f97..d895b4b7b661 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -39,6 +39,19 @@ struct kernfs_open_node { struct list_head files; /* goes through kernfs_open_file.list */ }; +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) return DEFAULT_POLLMASK|POLLERR|POLLPRI; } -/** - * kernfs_notify - notify a kernfs file - * @kn: file to notify - * - * Notify @kn such that poll(2) on @kn wakes up. - */ -void kernfs_notify(struct kernfs_node *kn) +static void kernfs_notify_workfn(struct work_struct *work) { - struct kernfs_root *root = kernfs_root(kn); + struct kernfs_node *kn; struct kernfs_open_node *on; struct kernfs_super_info *info; - unsigned long flags; - - if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); /* kick poll */ - spin_lock_irqsave(&kernfs_open_node_lock, flags); + spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) { @@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *kn) wake_up_interruptible(&on->poll); } - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + spin_unlock_irq(&kernfs_open_node_lock); /* kick fsnotify */ mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct inode *inode; struct dentry *dentry; @@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *kn) } mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. May be called from any + * context.
+ */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 17aa1cce6f8e..145375ea0bd9 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -91,6 +91,7 @@ struct kernfs_elem_attr { const struct kernfs_ops *ops; struct kernfs_open_node *open; loff_t size; + struct kernfs_node *notify_next; /* for kernfs_notify() */ }; /* -- cgit v1.2.3-71-gd317 From b9cd18de4db3c9ffa7e17b0dc0ca99ed5aa4d43a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Jul 2014 15:43:15 -0400 Subject: ptrace,x86: force IRET path after a ptrace_stop() The 'sysret' fastpath does not correctly restore even all regular registers, much less any segment registers or reflags values. That is very much part of why it's faster than 'iret'. Normally that isn't a problem, because the normal ptrace() interface catches the process using the signal handler infrastructure, which always returns with an iret. However, some paths can get caught using ptrace_event() instead of the signal path, and for those we need to make sure that we aren't going to return to user space using 'sysret'. Otherwise the modifications that may have been done to the register set by the tracer wouldn't necessarily take effect. Fix it by forcing IRET path by setting TIF_NOTIFY_RESUME from arch_ptrace_stop_needed() which is invoked from ptrace_stop(). Signed-off-by: Tejun Heo Reported-by: Andy Lutomirski Acked-by: Oleg Nesterov Suggested-by: Linus Torvalds Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/ptrace.h | 16 ++++++++++++++++ include/linux/ptrace.h | 3 +++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 14fd6fd75a19..6205f0c434db 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -231,6 +231,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, #define ARCH_HAS_USER_SINGLE_STEP_INFO +/* + * When hitting ptrace_stop(), we cannot return using SYSRET because + * that does not restore the full CPU state, only a minimal set. The + * ptracer can change arbitrary register values, which is usually okay + * because the usual ptrace stops run off the signal delivery path which + * forces IRET; however, ptrace_event() stops happen in arbitrary places + * in the kernel and don't force IRET path. + * + * So force IRET path after a ptrace stop. + */ +#define arch_ptrace_stop_needed(code, info) \ +({ \ + set_thread_flag(TIF_NOTIFY_RESUME); \ + false; \ +}) + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 077904c8b70d..cc79eff4a1ad 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -334,6 +334,9 @@ static inline void user_single_step_siginfo(struct task_struct *tsk, * calling arch_ptrace_stop() when it would be superfluous. 
For example, * if the thread has not been back to user mode since the last stop, the * thread state might indicate that nothing needs to be done. + * + * This is guaranteed to be invoked once before a task stops for ptrace and + * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed(code, info) (0) #endif -- cgit v1.2.3-71-gd317
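For reference, the generic side that consumes this hook looks roughly like the sketch below. This is an approximation of the ptrace_stop() call site described above, not the verbatim kernel/signal.c source; with the x86 definition, arch_ptrace_stop_needed() sets TIF_NOTIFY_RESUME as a side effect and always evaluates to false, so arch_ptrace_stop() itself is never entered on x86:

/* Approximate shape of the consumer in kernel/signal.c (sketch only). */
static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
{
	/*
	 * The check runs on every ptrace stop, so an architecture can
	 * hide side effects in arch_ptrace_stop_needed(), which is what
	 * the x86 version above does by setting TIF_NOTIFY_RESUME.
	 */
	if (arch_ptrace_stop_needed(exit_code, info))
		arch_ptrace_stop(exit_code, info);

	/* ... mark the task TASK_TRACED, notify the tracer, schedule ... */
}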