From 00b9de9c249f51f09c19aa41cbbb3e3eb4eea807 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 24 Jun 2009 17:53:33 +0900 Subject: serial: sh-sci: Move SCSCR_INIT in to platform data. This moves all of the SCSCR_INIT definitions in to the platform data, for future consolidation. Signed-off-by: Paul Mundt --- include/linux/serial_sci.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 1c297ddc9d5a..f722a2275add 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -7,6 +7,15 @@ * Generic header for SuperH SCI(F) (used by sh/sh64/h8300 and related parts) */ +#define SCSCR_TIE (1 << 7) +#define SCSCR_RIE (1 << 6) +#define SCSCR_TE (1 << 5) +#define SCSCR_RE (1 << 4) +#define SCSCR_REIE (1 << 3) +#define SCSCR_TOIE (1 << 2) /* not supported by all parts */ +#define SCSCR_CKE1 (1 << 1) +#define SCSCR_CKE0 (1 << 0) + /* Offsets into the sci_port->irqs array */ enum { SCIx_ERI_IRQ, @@ -26,6 +35,8 @@ struct plat_sci_port { unsigned int type; /* SCI / SCIF / IRDA */ upf_t flags; /* UPF_* flags */ char *clk; /* clock string */ + + unsigned int scscr; /* SCSCR initialization */ }; #endif /* __LINUX_SERIAL_SCI_H */ -- cgit v1.2.3-71-gd317 From 26c92f3728d738aaa7e4859d5581323cd68096dd Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 24 Jun 2009 18:23:52 +0900 Subject: serial: sh-sci: Move SCBRR calculation algo in to platform data. This permits each port to select its own SCBRR calculation algorithm, rather than having it all ifdef'ed in the header. There are presently only 5 different variations that all parts fall under. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh2/setup-sh7619.c | 3 ++ arch/sh/kernel/cpu/sh2a/setup-mxg.c | 1 + arch/sh/kernel/cpu/sh2a/setup-sh7201.c | 8 +++++ arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 4 +++ arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 4 +++ arch/sh/kernel/cpu/sh3/setup-sh7705.c | 2 ++ arch/sh/kernel/cpu/sh3/setup-sh770x.c | 3 ++ arch/sh/kernel/cpu/sh3/setup-sh7710.c | 2 ++ arch/sh/kernel/cpu/sh3/setup-sh7720.c | 2 ++ arch/sh/kernel/cpu/sh4/setup-sh4-202.c | 1 + arch/sh/kernel/cpu/sh4/setup-sh7750.c | 2 ++ arch/sh/kernel/cpu/sh4/setup-sh7760.c | 4 +++ arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 4 +++ arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 1 + arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 3 ++ arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 6 ++++ arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 6 ++++ arch/sh/kernel/cpu/sh4a/setup-sh7763.c | 3 ++ arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 10 ++++++ arch/sh/kernel/cpu/sh4a/setup-sh7780.c | 2 ++ arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 6 ++++ arch/sh/kernel/cpu/sh4a/setup-sh7786.c | 6 ++++ arch/sh/kernel/cpu/sh4a/setup-shx3.c | 4 +++ arch/sh/kernel/cpu/sh5/setup-sh5.c | 1 + drivers/serial/sh-sci.c | 27 ++++++++++++++-- drivers/serial/sh-sci.h | 56 ---------------------------------- include/linux/serial_sci.h | 9 ++++++ 27 files changed, 122 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index ace016b17036..86acede777b9 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -64,18 +64,21 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xf8400000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 88, 88, 88, 88 }, }, { .mapbase = 0xf8410000, .flags = 
UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 92, 92, 92, 92 }, }, { .mapbase = 0xf8420000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 96, 96, 96, 96 }, }, { diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c index 7ec658ce14f8..b2c3bcc01190 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c +++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c @@ -212,6 +212,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xff804000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 220, 220, 220, 220 }, }, { diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c index 2a2ac222f9c7..8d44917ce50b 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c @@ -182,48 +182,56 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xfffe8000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 180, 180, 180, 180 } }, { .mapbase = 0xfffe8800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 184, 184, 184, 184 } }, { .mapbase = 0xfffe9000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 188, 188, 188, 188 } }, { .mapbase = 0xfffe9800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 192, 192, 192, 192 } }, { .mapbase = 0xfffea000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 196, 196, 196, 196 } }, { .mapbase = 0xfffea800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 200, 200, 200, 200 } }, { .mapbase = 0xfffeb000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 204, 204, 204, 204 } }, { .mapbase = 0xfffeb800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 208, 208, 208, 208 } }, { diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index 2c9f3ababfd7..a78d2a219f3b 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -178,24 +178,28 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xfffe8000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 192, 192, 192, 192 }, }, { .mapbase = 0xfffe8800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 196, 196, 196, 196 }, }, { .mapbase = 0xfffe9000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 200, 200, 200, 200 }, }, { .mapbase = 0xfffe9800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 204, 
204, 204, 204 }, }, { diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index 5a47987f3902..68b93ed44cc2 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -138,24 +138,28 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xfffe8000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 240, 240, 240, 240 }, }, { .mapbase = 0xfffe8800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 244, 244, 244, 244 }, }, { .mapbase = 0xfffe9000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 248, 248, 248, 248 }, }, { .mapbase = 0xfffe9800, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 252, 252, 252, 252 }, }, { diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c index 28de53b281f3..27d03d836056 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c @@ -73,12 +73,14 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TIE | SCSCR_RIE | SCSCR_TE | SCSCR_RE | SCSCR_CKE1 | SCSCR_CKE0, + .scbrr_algo_id = SCBRR_ALGO_4, .type = PORT_SCIF, .irqs = { 56, 56, 56 }, }, { .mapbase = 0xa4400000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TIE | SCSCR_RIE | SCSCR_TE | SCSCR_RE, + .scbrr_algo_id = SCBRR_ALGO_4, .type = PORT_SCIF, .irqs = { 52, 52, 52 }, }, { diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c index 50ac42836dc7..83c9a5a39685 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c @@ -111,6 +111,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xfffffe80, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TE | SCSCR_RE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCI, .irqs = { 23, 23, 23, 0 }, }, @@ -121,6 +122,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4000150, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TE | SCSCR_RE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 56, 56, 56, 56 }, }, @@ -131,6 +133,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4000140, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TE | SCSCR_RE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_IRDA, .irqs = { 52, 52, 52, 52 }, }, diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c index 007627ecb7c8..9a60ffd34a9f 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c @@ -102,6 +102,7 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TE | SCSCR_RE | SCSCR_REIE | SCSCR_CKE1 | SCSCR_CKE0, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 52, 52, 52, 52 }, }, { @@ -109,6 +110,7 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TE | SCSCR_RE | SCSCR_REIE | SCSCR_CKE1 | SCSCR_CKE0, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 56, 56, 56, 56 }, }, { diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c index 1fc3d9089199..48d50a65db32 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c +++ 
b/arch/sh/kernel/cpu/sh3/setup-sh7720.c @@ -53,12 +53,14 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4430000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE, + .scbrr_algo_id = SCBRR_ALGO_4, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, }, { .mapbase = 0xa4438000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE, + .scbrr_algo_id = SCBRR_ALGO_4, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, }, { diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c index 9aa6fa3ca4e6..ec2104b49ef7 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c @@ -20,6 +20,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe80000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 40, 41, 43, 42 }, }, { diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c index 2159c439dce9..51a945e0d72c 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c @@ -41,6 +41,7 @@ static struct plat_sci_port sci_platform_data = { .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCI, .scscr = SCSCR_TE | SCSCR_RE, + .scbrr_algo_id = SCBRR_ALGO_2, .irqs = { 23, 23, 23, 0 }, }; @@ -55,6 +56,7 @@ static struct plat_sci_port scif_platform_data = { .mapbase = 0xffe80000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_TE | SCSCR_RE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 40, 40, 40, 40 }, }; diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c index 74b5e994724d..cee660fe1d90 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c @@ -131,24 +131,28 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xfe600000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 52, 53, 55, 54 }, }, { .mapbase = 0xfe610000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 72, 73, 75, 74 }, }, { .mapbase = 0xfe620000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 76, 77, 79, 78 }, }, { .mapbase = 0xfe480000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCI, .irqs = { 80, 81, 82, 0 }, }, { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index 09fb5814d925..fbae06b1c98d 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -270,6 +270,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe00000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, .clk = "scif0", @@ -277,6 +278,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, .clk = "scif1", @@ -284,6 +286,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_2, .type = 
PORT_SCIF, .irqs = { 82, 82, 82, 82 }, .clk = "scif2", @@ -291,6 +294,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe30000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 83, 83, 83, 83 }, .clk = "scif3", diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index 307777cf04cc..d4ee429032b1 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -281,6 +281,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe00000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, .clk = "scif0", diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index ffc69bc95932..f7b0551bf104 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -306,6 +306,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe00000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, .clk = "scif0", @@ -313,6 +314,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, .clk = "scif1", @@ -320,6 +322,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 82, 82, 82, 82 }, .clk = "scif2", diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index 6ce331a8f1bd..bb4837b9dcf4 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -322,6 +322,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe00000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, .clk = "scif0", @@ -329,6 +330,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, .clk = "scif1", @@ -336,6 +338,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 82, 82, 82, 82 }, .clk = "scif2", @@ -343,6 +346,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4e30000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_3, .type = PORT_SCIFA, .irqs = { 56, 56, 56, 56 }, .clk = "scif3", @@ -350,6 +354,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4e40000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_3, .type = PORT_SCIFA, .irqs = { 88, 88, 88, 88 }, .clk = "scif4", @@ -357,6 +362,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4e50000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_3, .type = 
PORT_SCIFA, .irqs = { 109, 109, 109, 109 }, .clk = "scif5", diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 4bf03c1ec8d6..c934b78e5658 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -29,6 +29,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe00000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, .clk = "scif0", @@ -36,6 +37,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, .clk = "scif1", @@ -43,6 +45,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 82, 82, 82, 82 }, .clk = "scif2", @@ -50,6 +53,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4e30000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_3, .type = PORT_SCIFA, .irqs = { 56, 56, 56, 56 }, .clk = "scif3", @@ -57,6 +61,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4e40000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_3, .type = PORT_SCIFA, .irqs = { 88, 88, 88, 88 }, .clk = "scif4", @@ -64,6 +69,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xa4e50000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_3, .type = PORT_SCIFA, .irqs = { 109, 109, 109, 109 }, .clk = "scif5", diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c index 76339c6da01e..ab02771ee888 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c @@ -41,18 +41,21 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe00000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 40, 40, 40, 40 }, }, { .mapbase = 0xffe08000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 76, 76, 76, 76 }, }, { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 104, 104, 104, 104 }, }, { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c index 07a41ff20504..746f4fb9ccf0 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c @@ -19,60 +19,70 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xff923000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 61, 61, 61, 61 }, }, { .mapbase = 0xff924000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 62, 62, 62, 62 }, }, { .mapbase = 0xff925000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 63, 63, 63, 63 }, }, 
{ .mapbase = 0xff926000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 64, 64, 64, 64 }, }, { .mapbase = 0xff927000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 65, 65, 65, 65 }, }, { .mapbase = 0xff928000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 66, 66, 66, 66 }, }, { .mapbase = 0xff929000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 67, 67, 67, 67 }, }, { .mapbase = 0xff92a000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 68, 68, 68, 68 }, }, { .mapbase = 0xff92b000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 69, 69, 69, 69 }, }, { .mapbase = 0xff92c000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_TOIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 70, 70, 70, 70 }, }, { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c index 2b355b67a33d..bcd411eb9cb0 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c @@ -221,12 +221,14 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffe00000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 40, 40, 40, 40 }, }, { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 76, 76, 76, 76 }, }, { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c index acd4b1d1b813..3ae2e2071009 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c @@ -203,6 +203,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffea0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 40, 40, 40, 40 }, .clk = "scif_fck", @@ -210,6 +211,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffeb0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 44, 44, 44, 44 }, .clk = "scif_fck", @@ -217,6 +219,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffec0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 60, 60, 60, 60 }, .clk = "scif_fck", @@ -224,6 +227,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffed0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 61, 61, 61, 61 }, .clk = "scif_fck", @@ -231,6 +235,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffee0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + 
.scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 62, 62, 62, 62 }, .clk = "scif_fck", @@ -238,6 +243,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffef0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 63, 63, 63, 63 }, .clk = "scif_fck", diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c index 347ce88de570..8b7ea4bd965d 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c @@ -28,6 +28,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffea0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 40, 41, 43, 42 }, }, @@ -38,30 +39,35 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffeb0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 44, 44, 44, 44 }, }, { .mapbase = 0xffec0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 50, 50, 50, 50 }, }, { .mapbase = 0xffed0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 51, 51, 51, 51 }, }, { .mapbase = 0xffee0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 52, 52, 52, 52 }, }, { .mapbase = 0xffef0000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE | SCSCR_CKE1, + .scbrr_algo_id = SCBRR_ALGO_1, .type = PORT_SCIF, .irqs = { 53, 53, 53, 53 }, }, { diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c index eef94934f542..4a26cc304139 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c @@ -20,24 +20,28 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = 0xffc30000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 40, 41, 43, 42 }, }, { .mapbase = 0xffc40000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 44, 45, 47, 46 }, }, { .mapbase = 0xffc50000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 48, 49, 51, 50 }, }, { .mapbase = 0xffc60000, .flags = UPF_BOOT_AUTOCONF, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 52, 53, 55, 54 }, }, { diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c index 26fa10c560de..72aa86ec7446 100644 --- a/arch/sh/kernel/cpu/sh5/setup-sh5.c +++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c @@ -21,6 +21,7 @@ static struct plat_sci_port sci_platform_data[] = { .mapbase = PHYS_PERIPHERAL_BLOCK + 0x01030000, .flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP, .scscr = SCSCR_RE | SCSCR_TE | SCSCR_REIE, + .scbrr_algo_id = SCBRR_ALGO_2, .type = PORT_SCIF, .irqs = { 39, 40, 42, 0 }, }, { diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 3a13e58e9c5d..386fb878680c 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ 
-82,6 +82,9 @@ struct sci_port { /* SCSCR initialization */ unsigned int scscr; + /* SCBRR calculation algo */ + unsigned int scbrr_algo_id; + #ifdef CONFIG_HAVE_CLK /* Interface clock */ struct clk *iclk; @@ -928,6 +931,27 @@ static void sci_shutdown(struct uart_port *port) s->disable(port); } +static unsigned int sci_scbrr_calc(unsigned int algo_id, unsigned int bps, + unsigned long freq) +{ + switch (algo_id) { + case SCBRR_ALGO_1: + return ((freq + 16 * bps) / (16 * bps) - 1); + case SCBRR_ALGO_2: + return ((freq + 16 * bps) / (32 * bps) - 1); + case SCBRR_ALGO_3: + return (((freq * 2) + 16 * bps) / (16 * bps) - 1); + case SCBRR_ALGO_4: + return (((freq * 2) + 16 * bps) / (32 * bps) - 1); + case SCBRR_ALGO_5: + return (((freq * 1000 / 32) / bps) - 1); + } + + /* Warn, but use a safe default */ + WARN_ON(1); + return ((freq + 16 * bps) / (32 * bps) - 1); +} + static void sci_set_termios(struct uart_port *port, struct ktermios *termios, struct ktermios *old) { @@ -937,7 +961,7 @@ static void sci_set_termios(struct uart_port *port, struct ktermios *termios, baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16); if (likely(baud)) - t = SCBRR_VALUE(baud, port->uartclk); + t = sci_scbrr_calc(s->scbrr_algo_id, baud, port->uartclk); do { status = sci_in(port, SCxSR); @@ -1108,7 +1132,6 @@ static void __devinit sci_init_single(struct platform_device *dev, sci_port->type = sci_port->port.type = p->type; memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs)); - } #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h index 4aa0ac8e67dd..81104777a0a6 100644 --- a/drivers/serial/sh-sci.h +++ b/drivers/serial/sh-sci.h @@ -713,59 +713,3 @@ static inline int sci_rxd_in(struct uart_port *port) return 1; } #endif - -/* - * Values for the BitRate Register (SCBRR) - * - * The values are actually divisors for a frequency which can - * be internal to the SH3 (14.7456MHz) or derived from an external - * clock source. This driver assumes the internal clock is used; - * to support using an external clock source, config options or - * possibly command-line options would need to be added. - * - * Also, to support speeds below 2400 (why?) the lower 2 bits of - * the SCSMR register would also need to be set to non-zero values. - * - * -- Greg Banks 27Feb2000 - * - * Answer: The SCBRR register is only eight bits, and the value in - * it gets larger with lower baud rates. At around 2400 (depending on - * the peripherial module clock) you run out of bits. However the - * lower two bits of SCSMR allow the module clock to be divided down, - * scaling the value which is needed in SCBRR. - * - * -- Stuart Menefy - 23 May 2000 - * - * I meant, why would anyone bother with bitrates below 2400. - * - * -- Greg Banks - 7Jul2000 - * - * You "speedist"! How will I use my 110bps ASR-33 teletype with paper - * tape reader as a console! 
- * - * -- Mitch Davis - 15 Jul 2000 - */ - -#if defined(CONFIG_CPU_SUBTYPE_SH7780) || \ - defined(CONFIG_CPU_SUBTYPE_SH7785) || \ - defined(CONFIG_CPU_SUBTYPE_SH7786) -#define SCBRR_VALUE(bps, clk) ((clk+16*bps)/(16*bps)-1) -#elif defined(CONFIG_CPU_SUBTYPE_SH7705) || \ - defined(CONFIG_CPU_SUBTYPE_SH7720) || \ - defined(CONFIG_CPU_SUBTYPE_SH7721) -#define SCBRR_VALUE(bps, clk) (((clk*2)+16*bps)/(32*bps)-1) -#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\ - defined(CONFIG_CPU_SUBTYPE_SH7724) -static inline int scbrr_calc(struct uart_port *port, int bps, int clk) -{ - if (port->type == PORT_SCIF) - return (clk+16*bps)/(32*bps)-1; - else - return ((clk*2)+16*bps)/(16*bps)-1; -} -#define SCBRR_VALUE(bps, clk) scbrr_calc(port, bps, clk) -#elif defined(__H8300H__) || defined(__H8300S__) -#define SCBRR_VALUE(bps, clk) (((clk*1000/32)/bps)-1) -#else /* Generic SH */ -#define SCBRR_VALUE(bps, clk) ((clk+16*bps)/(32*bps)-1) -#endif diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index f722a2275add..ff856b5bd276 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -7,6 +7,14 @@ * Generic header for SuperH SCI(F) (used by sh/sh64/h8300 and related parts) */ +enum { + SCBRR_ALGO_1, /* ((clk + 16 * bps) / (16 * bps) - 1) */ + SCBRR_ALGO_2, /* ((clk + 16 * bps) / (32 * bps) - 1) */ + SCBRR_ALGO_3, /* (((clk * 2) + 16 * bps) / (16 * bps) - 1) */ + SCBRR_ALGO_4, /* (((clk * 2) + 16 * bps) / (32 * bps) - 1) */ + SCBRR_ALGO_5, /* (((clk * 1000 / 32) / bps) - 1) */ +}; + #define SCSCR_TIE (1 << 7) #define SCSCR_RIE (1 << 6) #define SCSCR_TE (1 << 5) @@ -36,6 +44,7 @@ struct plat_sci_port { upf_t flags; /* UPF_* flags */ char *clk; /* clock string */ + unsigned int scbrr_algo_id; /* SCBRR calculation algo */ unsigned int scscr; /* SCSCR initialization */ }; -- cgit v1.2.3-71-gd317 From 4b6ba8aacbb3185703b797286547d0f8f3859b02 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 26 Oct 2010 15:07:13 -0700 Subject: of/net: Move of_get_mac_address() to a common source file. There are two identical implementations of of_get_mac_address(), one each in arch/powerpc/kernel/prom_parse.c and arch/microblaze/kernel/prom_parse.c. Move this function to a new common file of_net.{c,h} and adjust all the callers to include the new header. 
Signed-off-by: David Daney [grant.likely@secretlab.ca: protect header with #ifdef] Signed-off-by: Grant Likely --- arch/microblaze/include/asm/prom.h | 3 --- arch/microblaze/kernel/prom_parse.c | 38 ----------------------------- arch/powerpc/include/asm/prom.h | 3 --- arch/powerpc/kernel/prom_parse.c | 38 ----------------------------- arch/powerpc/sysdev/mv64x60_dev.c | 1 + arch/powerpc/sysdev/tsi108_dev.c | 1 + drivers/net/fs_enet/fs_enet-main.c | 1 + drivers/net/gianfar.c | 1 + drivers/net/ucc_geth.c | 1 + drivers/net/xilinx_emaclite.c | 1 + drivers/of/Kconfig | 4 ++++ drivers/of/Makefile | 1 + drivers/of/of_net.c | 48 +++++++++++++++++++++++++++++++++++++ include/linux/of_net.h | 15 ++++++++++++ 14 files changed, 74 insertions(+), 82 deletions(-) create mode 100644 drivers/of/of_net.c create mode 100644 include/linux/of_net.h (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index bdc38312ae4a..2e72af078b05 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -64,9 +64,6 @@ extern void kdump_move_device_tree(void); /* CPU OF node matching */ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); -/* Get the MAC address */ -extern const void *of_get_mac_address(struct device_node *np); - /** * of_irq_map_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c index 99d9b61cccb5..9ae24f4b882b 100644 --- a/arch/microblaze/kernel/prom_parse.c +++ b/arch/microblaze/kernel/prom_parse.c @@ -110,41 +110,3 @@ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, cells = prop ? *(u32 *)prop : of_n_size_cells(dn); *size = of_read_number(dma_window, cells); } - -/** - * Search the device tree for the best MAC address to use. 'mac-address' is - * checked first, because that is supposed to contain to "most recent" MAC - * address. If that isn't set, then 'local-mac-address' is checked next, - * because that is the default address. If that isn't set, then the obsolete - * 'address' is checked, just in case we're using an old device tree. - * - * Note that the 'address' property is supposed to contain a virtual address of - * the register set, but some DTS files have redefined that property to be the - * MAC address. - * - * All-zero MAC addresses are rejected, because those could be properties that - * exist in the device tree, but were not set by U-Boot. For example, the - * DTS could define 'mac-address' and 'local-mac-address', with zero MAC - * addresses. Some older U-Boots only initialized 'local-mac-address'. In - * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists - * but is all zeros. 
-*/ -const void *of_get_mac_address(struct device_node *np) -{ - struct property *pp; - - pp = of_find_property(np, "mac-address", NULL); - if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) - return pp->value; - - pp = of_find_property(np, "local-mac-address", NULL); - if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) - return pp->value; - - pp = of_find_property(np, "address", NULL); - if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) - return pp->value; - - return NULL; -} -EXPORT_SYMBOL(of_get_mac_address); diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index ae26f2efd089..98264bf0a433 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -63,9 +63,6 @@ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); /* cache lookup */ struct device_node *of_find_next_cache_node(struct device_node *np); -/* Get the MAC address */ -extern const void *of_get_mac_address(struct device_node *np); - #ifdef CONFIG_NUMA extern int of_node_to_nid(struct device_node *device); #else diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index 88334af038e5..c2b7a07cc3d3 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -117,41 +117,3 @@ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, cells = prop ? *(u32 *)prop : of_n_size_cells(dn); *size = of_read_number(dma_window, cells); } - -/** - * Search the device tree for the best MAC address to use. 'mac-address' is - * checked first, because that is supposed to contain to "most recent" MAC - * address. If that isn't set, then 'local-mac-address' is checked next, - * because that is the default address. If that isn't set, then the obsolete - * 'address' is checked, just in case we're using an old device tree. - * - * Note that the 'address' property is supposed to contain a virtual address of - * the register set, but some DTS files have redefined that property to be the - * MAC address. - * - * All-zero MAC addresses are rejected, because those could be properties that - * exist in the device tree, but were not set by U-Boot. For example, the - * DTS could define 'mac-address' and 'local-mac-address', with zero MAC - * addresses. Some older U-Boots only initialized 'local-mac-address'. In - * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists - * but is all zeros. 
-*/ -const void *of_get_mac_address(struct device_node *np) -{ - struct property *pp; - - pp = of_find_property(np, "mac-address", NULL); - if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) - return pp->value; - - pp = of_find_property(np, "local-mac-address", NULL); - if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) - return pp->value; - - pp = of_find_property(np, "address", NULL); - if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) - return pp->value; - - return NULL; -} -EXPORT_SYMBOL(of_get_mac_address); diff --git a/arch/powerpc/sysdev/mv64x60_dev.c b/arch/powerpc/sysdev/mv64x60_dev.c index 1398bc454999..feaee402e2d6 100644 --- a/arch/powerpc/sysdev/mv64x60_dev.c +++ b/arch/powerpc/sysdev/mv64x60_dev.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index d4d15aaf18fa..c2d675b6392c 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c index d684f187de57..7a1f3d0ffa78 100644 --- a/drivers/net/fs_enet/fs_enet-main.c +++ b/drivers/net/fs_enet/fs_enet-main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index 49e4ce1246a7..f860072e2f68 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -95,6 +95,7 @@ #include #include #include +#include #include "gianfar.h" #include "fsl_pq_mdio.h" diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c index a4c3f5708246..f7e370fd8ddc 100644 --- a/drivers/net/ucc_geth.c +++ b/drivers/net/ucc_geth.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/xilinx_emaclite.c b/drivers/net/xilinx_emaclite.c index 14f0955eca68..2a34b22ea26a 100644 --- a/drivers/net/xilinx_emaclite.c +++ b/drivers/net/xilinx_emaclite.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #define DRIVER_NAME "xilinx_emaclite" diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index aa675ebd8eb3..e4b93a0a15d2 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -49,6 +49,10 @@ config OF_I2C help OpenFirmware I2C accessors +config OF_NET + depends on NETDEVICES + def_bool y + config OF_SPI def_tristate SPI depends on SPI && !SPARC diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 7888155bea08..3ab21a0a4907 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -6,5 +6,6 @@ obj-$(CONFIG_OF_IRQ) += irq.o obj-$(CONFIG_OF_DEVICE) += device.o platform.o obj-$(CONFIG_OF_GPIO) += gpio.o obj-$(CONFIG_OF_I2C) += of_i2c.o +obj-$(CONFIG_OF_NET) += of_net.o obj-$(CONFIG_OF_SPI) += of_spi.o obj-$(CONFIG_OF_MDIO) += of_mdio.o diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c new file mode 100644 index 000000000000..86f334a2769c --- /dev/null +++ b/drivers/of/of_net.c @@ -0,0 +1,48 @@ +/* + * OF helpers for network devices. + * + * This file is released under the GPLv2 + * + * Initially copied out of arch/powerpc/kernel/prom_parse.c + */ +#include +#include +#include + +/** + * Search the device tree for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. 
If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the device tree, but were not set by U-Boot. For example, the + * DTS could define 'mac-address' and 'local-mac-address', with zero MAC + * addresses. Some older U-Boots only initialized 'local-mac-address'. In + * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists + * but is all zeros. +*/ +const void *of_get_mac_address(struct device_node *np) +{ + struct property *pp; + + pp = of_find_property(np, "mac-address", NULL); + if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) + return pp->value; + + pp = of_find_property(np, "local-mac-address", NULL); + if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) + return pp->value; + + pp = of_find_property(np, "address", NULL); + if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value)) + return pp->value; + + return NULL; +} +EXPORT_SYMBOL(of_get_mac_address); diff --git a/include/linux/of_net.h b/include/linux/of_net.h new file mode 100644 index 000000000000..e913081fb52a --- /dev/null +++ b/include/linux/of_net.h @@ -0,0 +1,15 @@ +/* + * OF helpers for network devices. + * + * This file is released under the GPLv2 + */ + +#ifndef __LINUX_OF_NET_H +#define __LINUX_OF_NET_H + +#ifdef CONFIG_OF_NET +#include +extern const void *of_get_mac_address(struct device_node *np); +#endif + +#endif /* __LINUX_OF_NET_H */ -- cgit v1.2.3-71-gd317 From b595076a180a56d1bb170e6eceda6eb9d76f4cd3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Mon, 1 Nov 2010 15:38:34 -0400 Subject: tree-wide: fix comment/printk typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "gadget", "through", "command", "maintain", "maintain", "controller", "address", "between", "initiali[zs]e", "instead", "function", "select", "already", "equal", "access", "management", "hierarchy", "registration", "interest", "relative", "memory", "offset", "already", Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- Documentation/DocBook/mtdnand.tmpl | 2 +- Documentation/cgroups/cgroup_event_listener.c | 2 +- Documentation/cgroups/memcg_test.txt | 2 +- Documentation/kprobes.txt | 2 +- Documentation/kvm/api.txt | 2 +- Documentation/networking/caif/spi_porting.txt | 2 +- Documentation/powerpc/booting-without-of.txt | 2 +- Documentation/scsi/ChangeLog.lpfc | 2 +- Documentation/timers/timer_stats.txt | 2 +- arch/arm/common/it8152.c | 2 +- arch/arm/common/vic.c | 2 +- arch/arm/mach-at91/board-ecbat91.c | 6 +++--- arch/arm/mach-bcmring/csp/chipc/chipcHw.c | 2 +- arch/arm/mach-bcmring/csp/dmac/dmacHw.c | 2 +- arch/arm/mach-bcmring/csp/dmac/dmacHw_extra.c | 2 +- arch/arm/mach-bcmring/csp/tmr/tmrHw.c | 2 +- arch/arm/mach-bcmring/dma.c | 2 +- arch/arm/mach-bcmring/include/csp/dmacHw.h | 2 +- arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h | 2 +- arch/arm/mach-bcmring/include/mach/csp/dmacHw_reg.h | 2 +- arch/arm/mach-gemini/include/mach/hardware.h | 2 +- arch/arm/mach-msm/io.c | 2 +- arch/arm/mach-omap2/cpuidle34xx.c | 2 +- arch/arm/mach-omap2/serial.c | 2 +- arch/arm/mach-pxa/mxm8x10.c | 2 +- arch/arm/mach-s3c64xx/dma.c | 2 +- arch/arm/mach-spear3xx/spear300.c | 4 ++-- arch/arm/mach-spear3xx/spear310.c | 2 +- 
arch/arm/mach-spear3xx/spear320.c | 2 +- arch/arm/mach-spear3xx/spear3xx.c | 6 +++--- arch/arm/mach-spear6xx/spear6xx.c | 4 ++-- arch/arm/mach-u300/Kconfig | 2 +- arch/arm/mach-u300/include/mach/coh901318.h | 4 ++-- arch/arm/plat-mxc/include/mach/irqs.h | 2 +- arch/arm/plat-omap/include/plat/omap_hwmod.h | 2 +- arch/blackfin/mach-bf518/include/mach/defBF51x_base.h | 2 +- arch/blackfin/mach-bf527/include/mach/defBF52x_base.h | 2 +- arch/blackfin/mach-bf537/include/mach/defBF534.h | 2 +- arch/blackfin/mach-bf538/include/mach/defBF539.h | 2 +- arch/cris/arch-v32/lib/nand_init.S | 2 +- arch/cris/include/asm/etraxgpio.h | 2 +- arch/h8300/Kconfig.debug | 2 +- arch/ia64/kvm/mmio.c | 2 +- arch/mips/alchemy/common/power.c | 2 +- arch/mips/include/asm/mach-powertv/ioremap.h | 2 +- arch/mips/jz4740/board-qi_lb60.c | 4 ++-- arch/mips/jz4740/gpio.c | 2 +- arch/mips/mti-malta/malta-memory.c | 2 +- arch/mips/pci/pcie-octeon.c | 2 +- arch/mips/powertv/memory.c | 2 +- arch/mips/txx9/generic/pci.c | 2 +- arch/powerpc/include/asm/8xx_immap.h | 4 ++-- arch/powerpc/oprofile/op_model_cell.c | 2 +- arch/powerpc/platforms/83xx/suspend-asm.S | 6 +++--- arch/powerpc/platforms/ps3/device-init.c | 4 ++-- arch/powerpc/platforms/ps3/interrupt.c | 2 +- arch/sh/mm/cache-sh5.c | 2 +- arch/sparc/kernel/traps_64.c | 2 +- arch/x86/include/asm/pgalloc.h | 2 +- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/early_printk_mrst.c | 2 +- arch/x86/kernel/head_32.S | 2 +- block/cfq-iosched.c | 2 +- drivers/acpi/acpica/acobject.h | 2 +- drivers/ata/libata-core.c | 4 ++-- drivers/ata/sata_vsc.c | 2 +- drivers/atm/idt77252.h | 2 +- drivers/atm/iphase.c | 4 ++-- drivers/base/power/main.c | 2 +- drivers/dma/intel_mid_dma.c | 6 +++--- drivers/edac/amd8131_edac.h | 2 +- drivers/edac/cell_edac.c | 4 ++-- drivers/edac/edac_core.h | 2 +- drivers/edac/ppc4xx_edac.c | 6 +++--- drivers/gpu/drm/radeon/atombios.h | 2 +- drivers/i2c/busses/i2c-nomadik.c | 4 ++-- drivers/infiniband/hw/cxgb3/cxio_wr.h | 2 +- drivers/infiniband/hw/qib/qib_iba7322.c | 2 +- drivers/input/touchscreen/Kconfig | 2 +- drivers/isdn/gigaset/bas-gigaset.c | 4 ++-- drivers/isdn/gigaset/ser-gigaset.c | 4 ++-- drivers/isdn/gigaset/usb-gigaset.c | 4 ++-- drivers/isdn/hardware/mISDN/ipac.h | 4 ++-- drivers/isdn/hardware/mISDN/isar.h | 2 +- drivers/isdn/hisax/isar.c | 4 ++-- drivers/media/video/cx25840/cx25840-ir.c | 2 +- drivers/media/video/davinci/vpif.h | 2 +- drivers/media/video/davinci/vpss.c | 2 +- drivers/media/video/omap/omap_vout.c | 2 +- drivers/media/video/saa7164/saa7164-core.c | 4 ++-- drivers/media/video/sn9c102/sn9c102_sensor.h | 2 +- drivers/media/video/zoran/zoran.h | 2 +- drivers/message/fusion/lsi/mpi_log_sas.h | 2 +- drivers/message/fusion/mptbase.c | 2 +- drivers/message/fusion/mptsas.c | 2 +- drivers/message/i2o/i2o_block.c | 2 +- drivers/misc/arm-charlcd.c | 2 +- drivers/mmc/card/block.c | 2 +- drivers/mmc/host/Kconfig | 2 +- drivers/mmc/host/au1xmmc.c | 2 +- drivers/mmc/host/sdricoh_cs.c | 4 ++-- drivers/mtd/nand/nand_base.c | 2 +- drivers/net/bnx2x/bnx2x_main.c | 2 +- drivers/net/bnx2x/bnx2x_reg.h | 2 +- drivers/net/bonding/bond_3ad.c | 6 +++--- drivers/net/chelsio/subr.c | 2 +- drivers/net/cxgb3/mc5.c | 2 +- drivers/net/cxgb3/t3_hw.c | 2 +- drivers/net/e1000/e1000_hw.h | 2 +- drivers/net/e1000/e1000_main.c | 2 +- drivers/net/e1000e/82571.c | 2 +- drivers/net/e1000e/ich8lan.c | 2 +- drivers/net/e1000e/phy.c | 2 +- drivers/net/eepro.c | 2 +- drivers/net/irda/donauboe.h | 2 +- drivers/net/ixgbe/ixgbe_82599.c | 4 
++-- drivers/net/ll_temac_main.c | 2 +- drivers/net/sis900.c | 2 +- drivers/net/tehuti.c | 6 +++--- drivers/net/tun.c | 2 +- drivers/net/vxge/vxge-traffic.h | 2 +- drivers/net/wan/dscc4.c | 2 +- drivers/net/wimax/i2400m/driver.c | 2 +- drivers/net/wimax/i2400m/i2400m.h | 2 +- drivers/net/wireless/ath/ath5k/reg.h | 6 +++--- drivers/net/wireless/b43/phy_g.c | 2 +- drivers/net/wireless/b43legacy/phy.c | 2 +- drivers/net/wireless/iwlwifi/iwl-sta.c | 2 +- drivers/net/wireless/prism54/islpci_dev.c | 6 +++--- drivers/net/wireless/prism54/islpci_eth.c | 2 +- drivers/net/wireless/rt2x00/rt2x00mac.c | 2 +- drivers/net/wireless/wl3501_cs.c | 2 +- drivers/pcmcia/m32r_cfc.h | 2 +- drivers/pcmcia/m32r_pcc.h | 2 +- drivers/pcmcia/m8xx_pcmcia.c | 2 +- drivers/platform/x86/thinkpad_acpi.c | 2 +- drivers/power/s3c_adc_battery.c | 4 ++-- drivers/s390/net/lcs.c | 2 +- drivers/s390/scsi/zfcp_cfdc.c | 2 +- drivers/scsi/a100u2w.c | 2 +- drivers/scsi/aacraid/commsup.c | 2 +- drivers/scsi/aic7xxx_old/aic7xxx.seq | 2 +- drivers/scsi/aic94xx/aic94xx_reg_def.h | 4 ++-- drivers/scsi/aic94xx/aic94xx_seq.c | 6 +++--- drivers/scsi/bfa/bfa_fcpim.c | 2 +- drivers/scsi/bfa/bfa_fcs_lport.c | 2 +- drivers/scsi/dc395x.c | 8 ++++---- drivers/scsi/libfc/fc_fcp.c | 2 +- drivers/scsi/lpfc/lpfc_attr.c | 2 +- drivers/scsi/lpfc/lpfc_hbadisc.c | 2 +- drivers/scsi/lpfc/lpfc_sli.c | 2 +- drivers/scsi/megaraid.h | 2 +- drivers/scsi/pm8001/pm8001_init.c | 2 +- drivers/scsi/scsi_netlink.c | 2 +- drivers/scsi/sym53c8xx_2/sym_glue.c | 2 +- drivers/spi/atmel_spi.c | 4 ++-- drivers/spi/spidev.c | 2 +- drivers/staging/stradis/stradis.c | 2 +- drivers/usb/gadget/imx_udc.c | 2 +- drivers/usb/host/imx21-hcd.c | 2 +- drivers/usb/host/oxu210hp-hcd.c | 2 +- drivers/usb/misc/adutux.c | 2 +- drivers/usb/misc/iowarrior.c | 2 +- drivers/usb/misc/ldusb.c | 2 +- drivers/usb/musb/musb_gadget.c | 4 ++-- drivers/usb/wusbcore/wa-rpipe.c | 2 +- drivers/video/sstfb.c | 2 +- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 4 ++-- fs/ocfs2/inode.c | 2 +- fs/ocfs2/suballoc.c | 2 +- fs/xfs/linux-2.6/xfs_super.c | 2 +- include/acpi/actbl1.h | 2 +- include/linux/cgroup.h | 2 +- include/linux/firewire-cdev.h | 2 +- include/linux/mfd/core.h | 2 +- include/net/sctp/user.h | 4 ++-- include/scsi/fc/fc_fcp.h | 4 ++-- kernel/debug/kdb/kdb_main.c | 2 +- kernel/kexec.c | 2 +- kernel/power/swap.c | 2 +- kernel/sched.c | 2 +- kernel/sysctl_binary.c | 2 +- kernel/time/clocksource.c | 2 +- kernel/trace/trace_entries.h | 2 +- lib/nlattr.c | 2 +- lib/swiotlb.c | 2 +- mm/percpu.c | 2 +- mm/sparse-vmemmap.c | 2 +- net/core/dev.c | 2 +- net/decnet/dn_dev.c | 2 +- net/ipv4/tcp_output.c | 2 +- scripts/kconfig/confdata.c | 2 +- scripts/mod/modpost.c | 2 +- security/apparmor/include/match.h | 2 +- sound/core/init.c | 2 +- sound/core/pcm_native.c | 2 +- sound/isa/opl3sa2.c | 2 +- sound/pci/ca0106/ca0106.h | 2 +- sound/pci/emu10k1/emu10k1x.c | 2 +- sound/pci/emu10k1/p16v.h | 2 +- sound/pci/es1968.c | 2 +- sound/pci/rme9652/hdspm.c | 6 +++--- sound/soc/codecs/max98088.c | 2 +- sound/soc/s3c24xx/smdk_spdif.c | 6 +++--- 207 files changed, 261 insertions(+), 261 deletions(-) (limited to 'include/linux') diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 020ac80d4682..620eb3f6a90a 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -250,7 +250,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd) Device ready function If the hardware interface has the ready busy pin of the NAND chip 
connected to a - GPIO or other accesible I/O pin, this function is used to read back the state of the + GPIO or other accessible I/O pin, this function is used to read back the state of the pin. The function has no arguments and should return 0, if the device is busy (R/B pin is low) and 1, if the device is ready (R/B pin is high). If the hardware interface does not give access to the ready busy pin, then diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c index 8c2bfc4a6358..3e082f96dc12 100644 --- a/Documentation/cgroups/cgroup_event_listener.c +++ b/Documentation/cgroups/cgroup_event_listener.c @@ -91,7 +91,7 @@ int main(int argc, char **argv) if (ret == -1) { perror("cgroup.event_control " - "is not accessable any more"); + "is not accessible any more"); break; } diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index b7eececfb195..fc8fa97a09ac 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -398,7 +398,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. written to move_charge_at_immigrate. 9.10 Memory thresholds - Memory controler implements memory thresholds using cgroups notification + Memory controller implements memory thresholds using cgroups notification API. You can use Documentation/cgroups/cgroup_event_listener.c to test it. diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 741fe66d6eca..0cfb00fd86ff 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -598,7 +598,7 @@ a 5-byte jump instruction. So there are several limitations. a) The instructions in DCR must be relocatable. b) The instructions in DCR must not include a call instruction. c) JTPR must not be targeted by any jump or call instruction. -d) DCR must not straddle the border betweeen functions. +d) DCR must not straddle the border between functions. Anyway, these limitations are checked by the in-kernel instruction decoder, so you don't need to worry about that. diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index b336266bea5e..50713e37c695 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -874,7 +874,7 @@ Possible values are: - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and is waiting for an interrupt - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector - accesible via KVM_GET_VCPU_EVENTS) + accessible via KVM_GET_VCPU_EVENTS) This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel irqchip, the multiprocessing state must be maintained by userspace. diff --git a/Documentation/networking/caif/spi_porting.txt b/Documentation/networking/caif/spi_porting.txt index 61d7c9247453..0cb8cb9098f4 100644 --- a/Documentation/networking/caif/spi_porting.txt +++ b/Documentation/networking/caif/spi_porting.txt @@ -32,7 +32,7 @@ the physical hardware, both with regard to SPI and to GPIOs. This function is called by the CAIF SPI interface to give you a chance to set up your hardware to be ready to receive a stream of data from the master. The xfer structure contains - both physical and logical adresses, as well as the total length + both physical and logical addresses, as well as the total length of the transfer in both directions.The dev parameter can be used to map to different CAIF SPI slave devices. 
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index 302db5da49b3..1c6660d2d7f5 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt @@ -1098,7 +1098,7 @@ supported currently at the toplevel. * an arbitrary array of bytes */ - childnode@addresss { /* define a child node named "childnode" + childnode@address { /* define a child node named "childnode" * whose unit name is "childnode at * address" */ diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc index 337c924cc81f..5e83769c6aa9 100644 --- a/Documentation/scsi/ChangeLog.lpfc +++ b/Documentation/scsi/ChangeLog.lpfc @@ -573,7 +573,7 @@ Changes from 20041018 to 20041123 * Backround nodev_timeout processing to DPC This enables us to unblock (stop dev_loss_tmo) when appopriate. * Fix array discovery with multiple luns. The max_luns was 0 at - the time the host structure was intialized. lpfc_cfg_params + the time the host structure was initialized. lpfc_cfg_params then set the max_luns to the correct value afterwards. * Remove unused define LPFC_MAX_LUN and set the default value of lpfc_max_lun parameter to 512. diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt index 9bd00fc2e823..8abd40b22b7f 100644 --- a/Documentation/timers/timer_stats.txt +++ b/Documentation/timers/timer_stats.txt @@ -19,7 +19,7 @@ Linux system over a sample period: - the pid of the task(process) which initialized the timer - the name of the process which initialized the timer -- the function where the timer was intialized +- the function where the timer was initialized - the callback function which is associated to the timer - the number of events (callbacks) diff --git a/arch/arm/common/it8152.c b/arch/arm/common/it8152.c index 1bec96e85196..60dc077ea7cc 100644 --- a/arch/arm/common/it8152.c +++ b/arch/arm/common/it8152.c @@ -236,7 +236,7 @@ static struct resource it8152_mem = { /* * The following functions are needed for DMA bouncing. - * ITE8152 chip can addrees up to 64MByte, so all the devices + * ITE8152 chip can address up to 64MByte, so all the devices * connected to ITE8152 (PCI and USB) should have limited DMA window */ diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c index ba65f6eedca6..cb660bc54d7a 100644 --- a/arch/arm/common/vic.c +++ b/arch/arm/common/vic.c @@ -70,7 +70,7 @@ static inline struct vic_device *to_vic(struct sys_device *sys) * vic_init2 - common initialisation code * @base: Base of the VIC. * - * Common initialisation code for registeration + * Common initialisation code for registration * and resume. 
*/ static void vic_init2(void __iomem *base) diff --git a/arch/arm/mach-at91/board-ecbat91.c b/arch/arm/mach-at91/board-ecbat91.c index 7b58c948a957..de2fd04e7c8a 100644 --- a/arch/arm/mach-at91/board-ecbat91.c +++ b/arch/arm/mach-at91/board-ecbat91.c @@ -128,17 +128,17 @@ static struct spi_board_info __initdata ecb_at91spi_devices[] = { .platform_data = &my_flash0_platform, #endif }, - { /* User accessable spi - cs1 (250KHz) */ + { /* User accessible spi - cs1 (250KHz) */ .modalias = "spi-cs1", .chip_select = 1, .max_speed_hz = 250 * 1000, }, - { /* User accessable spi - cs2 (1MHz) */ + { /* User accessible spi - cs2 (1MHz) */ .modalias = "spi-cs2", .chip_select = 2, .max_speed_hz = 1 * 1000 * 1000, }, - { /* User accessable spi - cs3 (10MHz) */ + { /* User accessible spi - cs3 (10MHz) */ .modalias = "spi-cs3", .chip_select = 3, .max_speed_hz = 10 * 1000 * 1000, diff --git a/arch/arm/mach-bcmring/csp/chipc/chipcHw.c b/arch/arm/mach-bcmring/csp/chipc/chipcHw.c index b3a61d860c65..96273ff34956 100644 --- a/arch/arm/mach-bcmring/csp/chipc/chipcHw.c +++ b/arch/arm/mach-bcmring/csp/chipc/chipcHw.c @@ -757,7 +757,7 @@ static int chipcHw_divide(int num, int denom) t = t << 1; } - /* Intialize the result */ + /* Initialize the result */ r = 0; do { diff --git a/arch/arm/mach-bcmring/csp/dmac/dmacHw.c b/arch/arm/mach-bcmring/csp/dmac/dmacHw.c index 7b9bac2d79a5..6b9be2e98e51 100644 --- a/arch/arm/mach-bcmring/csp/dmac/dmacHw.c +++ b/arch/arm/mach-bcmring/csp/dmac/dmacHw.c @@ -893,7 +893,7 @@ int dmacHw_setDataDescriptor(dmacHw_CONFIG_t *pConfig, /* [ IN ] Configuration */ /****************************************************************************/ uint32_t dmacHw_getDmaControllerAttribute(dmacHw_HANDLE_t handle, /* [ IN ] DMA Channel handle */ - dmacHw_CONTROLLER_ATTRIB_e attr /* [ IN ] DMA Controler attribute of type dmacHw_CONTROLLER_ATTRIB_e */ + dmacHw_CONTROLLER_ATTRIB_e attr /* [ IN ] DMA Controller attribute of type dmacHw_CONTROLLER_ATTRIB_e */ ) { dmacHw_CBLK_t *pCblk = dmacHw_HANDLE_TO_CBLK(handle); diff --git a/arch/arm/mach-bcmring/csp/dmac/dmacHw_extra.c b/arch/arm/mach-bcmring/csp/dmac/dmacHw_extra.c index ff7b436d0935..77f84b40dda9 100644 --- a/arch/arm/mach-bcmring/csp/dmac/dmacHw_extra.c +++ b/arch/arm/mach-bcmring/csp/dmac/dmacHw_extra.c @@ -316,7 +316,7 @@ static void DisplayDescRing(void *pDescriptor, /* [ IN ] Descriptor buffer */ /** * @brief Check if DMA channel is the flow controller * -* @return 1 : If DMA is a flow controler +* @return 1 : If DMA is a flow controller * 0 : Peripheral is the flow controller * * @note diff --git a/arch/arm/mach-bcmring/csp/tmr/tmrHw.c b/arch/arm/mach-bcmring/csp/tmr/tmrHw.c index 5c1c9a0e5ed2..16225e43f3c3 100644 --- a/arch/arm/mach-bcmring/csp/tmr/tmrHw.c +++ b/arch/arm/mach-bcmring/csp/tmr/tmrHw.c @@ -558,7 +558,7 @@ static int tmrHw_divide(int num, int denom) t = t << 1; } - /* Intialize the result */ + /* Initialize the result */ r = 0; do { diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c index 77eb35c89cd0..8d1baf3f4683 100644 --- a/arch/arm/mach-bcmring/dma.c +++ b/arch/arm/mach-bcmring/dma.c @@ -671,7 +671,7 @@ static int ConfigChannel(DMA_Handle_t handle) /****************************************************************************/ /** -* Intializes all of the data structures associated with the DMA. +* Initializes all of the data structures associated with the DMA. * @return * >= 0 - Initialization was successfull. 
* diff --git a/arch/arm/mach-bcmring/include/csp/dmacHw.h b/arch/arm/mach-bcmring/include/csp/dmacHw.h index 5d510130a25f..6c8da2b9fc1f 100644 --- a/arch/arm/mach-bcmring/include/csp/dmacHw.h +++ b/arch/arm/mach-bcmring/include/csp/dmacHw.h @@ -590,7 +590,7 @@ void dmacHw_printDebugInfo(dmacHw_HANDLE_t handle, /* [ IN ] DMA Channel handle */ /****************************************************************************/ uint32_t dmacHw_getDmaControllerAttribute(dmacHw_HANDLE_t handle, /* [ IN ] DMA Channel handle */ - dmacHw_CONTROLLER_ATTRIB_e attr /* [ IN ] DMA Controler attribute of type dmacHw_CONTROLLER_ATTRIB_e */ + dmacHw_CONTROLLER_ATTRIB_e attr /* [ IN ] DMA Controller attribute of type dmacHw_CONTROLLER_ATTRIB_e */ ); #endif /* _DMACHW_H */ diff --git a/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h b/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h index cbf334d1c761..d67e2f8c22de 100644 --- a/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h +++ b/arch/arm/mach-bcmring/include/mach/csp/dmacHw_priv.h @@ -28,7 +28,7 @@ /* Data type for DMA Link List Item */ typedef struct { - uint32_t sar; /* Source Adress Register. + uint32_t sar; /* Source Address Register. Address must be aligned to CTLx.SRC_TR_WIDTH. */ uint32_t dar; /* Destination Address Register. Address must be aligned to CTLx.DST_TR_WIDTH. */ diff --git a/arch/arm/mach-bcmring/include/mach/csp/dmacHw_reg.h b/arch/arm/mach-bcmring/include/mach/csp/dmacHw_reg.h index 891cea87e333..f1ecf96f2da5 100644 --- a/arch/arm/mach-bcmring/include/mach/csp/dmacHw_reg.h +++ b/arch/arm/mach-bcmring/include/mach/csp/dmacHw_reg.h @@ -35,7 +35,7 @@ typedef struct { /* Data type representing DMA channel registers */ typedef struct { - dmacHw_REG64_t ChannelSar; /* Source Adress Register. 64 bits (upper 32 bits are reserved) + dmacHw_REG64_t ChannelSar; /* Source Address Register. 64 bits (upper 32 bits are reserved) Address must be aligned to CTLx.SRC_TR_WIDTH. */ dmacHw_REG64_t ChannelDar; /* Destination Address Register.64 bits (upper 32 bits are reserved) diff --git a/arch/arm/mach-gemini/include/mach/hardware.h b/arch/arm/mach-gemini/include/mach/hardware.h index 213a4fcfeb1c..8c950e1d06be 100644 --- a/arch/arm/mach-gemini/include/mach/hardware.h +++ b/arch/arm/mach-gemini/include/mach/hardware.h @@ -33,7 +33,7 @@ #define GEMINI_LPC_HOST_BASE 0x47000000 #define GEMINI_LPC_IO_BASE 0x47800000 #define GEMINI_INTERRUPT_BASE 0x48000000 -/* TODO: Different interrupt controlers when SMP +/* TODO: Different interrupt controllers when SMP * #define GEMINI_INTERRUPT0_BASE 0x48000000 * #define GEMINI_INTERRUPT1_BASE 0x49000000 */ diff --git a/arch/arm/mach-msm/io.c b/arch/arm/mach-msm/io.c index d36b61074146..7f8d15d56834 100644 --- a/arch/arm/mach-msm/io.c +++ b/arch/arm/mach-msm/io.c @@ -153,7 +153,7 @@ __msm_ioremap(unsigned long phys_addr, size_t size, unsigned int mtype) { if (mtype == MT_DEVICE) { /* The peripherals in the 88000000 - D0000000 range - * are only accessable by type MT_DEVICE_NONSHARED. + * are only accessible by type MT_DEVICE_NONSHARED. * Adjust mtype as necessary to make this "just work." 
*/ if ((phys_addr >= 0x88000000) && (phys_addr < 0xD0000000)) diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c index 0d50b45d041c..1e6fe6d11aee 100644 --- a/arch/arm/mach-omap2/cpuidle34xx.c +++ b/arch/arm/mach-omap2/cpuidle34xx.c @@ -252,7 +252,7 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev, * FIXME: we currently manage device-specific idle states * for PER and CORE in combination with CPU-specific * idle states. This is wrong, and device-specific - * idle managment needs to be separated out into + * idle management needs to be separated out into * its own code. */ diff --git a/arch/arm/mach-omap2/serial.c b/arch/arm/mach-omap2/serial.c index becf0e38ef7e..db79ac68ed97 100644 --- a/arch/arm/mach-omap2/serial.c +++ b/arch/arm/mach-omap2/serial.c @@ -843,7 +843,7 @@ void __init omap_serial_init_port(int port) } /** - * omap_serial_init() - intialize all supported serial ports + * omap_serial_init() - initialize all supported serial ports * * Initializes all available UARTs as serial ports. Platforms * can call this function when they want to have default behaviour diff --git a/arch/arm/mach-pxa/mxm8x10.c b/arch/arm/mach-pxa/mxm8x10.c index 462167ac05f9..cdf7f41e2bb3 100644 --- a/arch/arm/mach-pxa/mxm8x10.c +++ b/arch/arm/mach-pxa/mxm8x10.c @@ -337,7 +337,7 @@ void __init mxm_8x10_mmc_init(void) } #endif -/* USB Open Host Controler Interface */ +/* USB Open Host Controller Interface */ static struct pxaohci_platform_data mxm_8x10_ohci_platform_data = { .port_mode = PMM_NPS_MODE, .flags = ENABLE_PORT_ALL diff --git a/arch/arm/mach-s3c64xx/dma.c b/arch/arm/mach-s3c64xx/dma.c index e7d03ab41d80..372ea6855454 100644 --- a/arch/arm/mach-s3c64xx/dma.c +++ b/arch/arm/mach-s3c64xx/dma.c @@ -740,7 +740,7 @@ static int __init s3c64xx_dma_init(void) /* Set all DMA configuration to be DMA, not SDMA */ writel(0xffffff, S3C_SYSREG(0x110)); - /* Register standard DMA controlers */ + /* Register standard DMA controllers */ s3c64xx_dma_init1(0, DMACH_UART0, IRQ_DMA0, 0x75000000); s3c64xx_dma_init1(8, DMACH_PCM1_TX, IRQ_DMA1, 0x75100000); diff --git a/arch/arm/mach-spear3xx/spear300.c b/arch/arm/mach-spear3xx/spear300.c index 3560f8c1e723..5aa2d54ebfaa 100644 --- a/arch/arm/mach-spear3xx/spear300.c +++ b/arch/arm/mach-spear3xx/spear300.c @@ -371,7 +371,7 @@ struct pmx_driver pmx_driver = { }; /* Add spear300 specific devices here */ -/* arm gpio1 device registeration */ +/* arm gpio1 device registration */ static struct pl061_platform_data gpio1_plat_data = { .gpio_base = 8, .irq_base = SPEAR_GPIO1_INT_BASE, @@ -451,7 +451,7 @@ void __init spear300_init(void) /* call spear3xx family common init function */ spear3xx_init(); - /* shared irq registeration */ + /* shared irq registration */ shirq_ras1.regs.base = ioremap(SPEAR300_TELECOM_BASE, SPEAR300_TELECOM_REG_SIZE); if (shirq_ras1.regs.base) { diff --git a/arch/arm/mach-spear3xx/spear310.c b/arch/arm/mach-spear3xx/spear310.c index 96a1ab824bac..53b41b52d7ee 100644 --- a/arch/arm/mach-spear3xx/spear310.c +++ b/arch/arm/mach-spear3xx/spear310.c @@ -266,7 +266,7 @@ void __init spear310_init(void) /* call spear3xx family common init function */ spear3xx_init(); - /* shared irq registeration */ + /* shared irq registration */ base = ioremap(SPEAR310_SOC_CONFIG_BASE, SPEAR310_SOC_CONFIG_SIZE); if (base) { /* shirq 1 */ diff --git a/arch/arm/mach-spear3xx/spear320.c b/arch/arm/mach-spear3xx/spear320.c index 6a1219549369..88b465284c36 100644 --- a/arch/arm/mach-spear3xx/spear320.c +++ 
b/arch/arm/mach-spear3xx/spear320.c @@ -519,7 +519,7 @@ void __init spear320_init(void) /* call spear3xx family common init function */ spear3xx_init(); - /* shared irq registeration */ + /* shared irq registration */ base = ioremap(SPEAR320_SOC_CONFIG_BASE, SPEAR320_SOC_CONFIG_SIZE); if (base) { /* shirq 1 */ diff --git a/arch/arm/mach-spear3xx/spear3xx.c b/arch/arm/mach-spear3xx/spear3xx.c index e87313aeae20..52f553c8c46d 100644 --- a/arch/arm/mach-spear3xx/spear3xx.c +++ b/arch/arm/mach-spear3xx/spear3xx.c @@ -22,7 +22,7 @@ #include /* Add spear3xx machines common devices here */ -/* gpio device registeration */ +/* gpio device registration */ static struct pl061_platform_data gpio_plat_data = { .gpio_base = 0, .irq_base = SPEAR_GPIO_INT_BASE, @@ -41,7 +41,7 @@ struct amba_device gpio_device = { .irq = {IRQ_BASIC_GPIO, NO_IRQ}, }; -/* uart device registeration */ +/* uart device registration */ struct amba_device uart_device = { .dev = { .init_name = "uart", @@ -543,6 +543,6 @@ void spear_pmx_init(struct pmx_driver *pmx_driver, uint base, uint size) pmx_fail: if (ret) - printk(KERN_ERR "padmux: registeration failed. err no: %d\n", + printk(KERN_ERR "padmux: registration failed. err no: %d\n", ret); } diff --git a/arch/arm/mach-spear6xx/spear6xx.c b/arch/arm/mach-spear6xx/spear6xx.c index baf6bcc3169c..f2fe14e8471d 100644 --- a/arch/arm/mach-spear6xx/spear6xx.c +++ b/arch/arm/mach-spear6xx/spear6xx.c @@ -23,7 +23,7 @@ #include /* Add spear6xx machines common devices here */ -/* uart device registeration */ +/* uart device registration */ struct amba_device uart_device[] = { { .dev = { @@ -50,7 +50,7 @@ struct amba_device uart_device[] = { } }; -/* gpio device registeration */ +/* gpio device registration */ static struct pl061_platform_data gpio_plat_data[] = { { .gpio_base = 0, diff --git a/arch/arm/mach-u300/Kconfig b/arch/arm/mach-u300/Kconfig index 801b21e7f677..32a7b0f7e9f7 100644 --- a/arch/arm/mach-u300/Kconfig +++ b/arch/arm/mach-u300/Kconfig @@ -64,7 +64,7 @@ config MACH_U300_DUAL_RAM bool "Dual RAM" help Select this if you want support for Dual RAM phones. - This is two RAM memorys on different EMIFs. + This is two RAM memories on different EMIFs. 
endchoice config U300_DEBUG diff --git a/arch/arm/mach-u300/include/mach/coh901318.h b/arch/arm/mach-u300/include/mach/coh901318.h index 193da2df732c..6193aaa47794 100644 --- a/arch/arm/mach-u300/include/mach/coh901318.h +++ b/arch/arm/mach-u300/include/mach/coh901318.h @@ -24,7 +24,7 @@ * @src_addr: transfer source address * @dst_addr: transfer destination address * @link_addr: physical address to next lli - * @virt_link_addr: virtual addres of next lli (only used by pool_free) + * @virt_link_addr: virtual address of next lli (only used by pool_free) * @phy_this: physical address of current lli (only used by pool_free) */ struct coh901318_lli { @@ -90,7 +90,7 @@ struct powersave { * struct coh901318_platform - platform arch structure * @chans_slave: specifying dma slave channels * @chans_memcpy: specifying dma memcpy channels - * @access_memory_state: requesting DMA memeory access (on / off) + * @access_memory_state: requesting DMA memory access (on / off) * @chan_conf: dma channel configurations * @max_channels: max number of dma chanenls */ diff --git a/arch/arm/plat-mxc/include/mach/irqs.h b/arch/arm/plat-mxc/include/mach/irqs.h index 86781f7b0c0c..664c453babbe 100644 --- a/arch/arm/plat-mxc/include/mach/irqs.h +++ b/arch/arm/plat-mxc/include/mach/irqs.h @@ -66,7 +66,7 @@ extern int imx_irq_set_priority(unsigned char irq, unsigned char prio); /* all normal IRQs can be FIQs */ #define FIQ_START 0 -/* switch betwean IRQ and FIQ */ +/* switch between IRQ and FIQ */ extern int mxc_set_irq_fiq(unsigned int irq, unsigned int type); #endif /* __ASM_ARCH_MXC_IRQS_H__ */ diff --git a/arch/arm/plat-omap/include/plat/omap_hwmod.h b/arch/arm/plat-omap/include/plat/omap_hwmod.h index 7eaa8edf3b14..79ac24ceb54d 100644 --- a/arch/arm/plat-omap/include/plat/omap_hwmod.h +++ b/arch/arm/plat-omap/include/plat/omap_hwmod.h @@ -339,7 +339,7 @@ struct omap_hwmod_omap2_prcm { /** * struct omap_hwmod_omap4_prcm - OMAP4-specific PRCM data * @clkctrl_reg: PRCM address of the clock control register - * @rstctrl_reg: adress of the XXX_RSTCTRL register located in the PRM + * @rstctrl_reg: address of the XXX_RSTCTRL register located in the PRM * @submodule_wkdep_bit: bit shift of the WKDEP range */ struct omap_hwmod_omap4_prcm { diff --git a/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h b/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h index 5f84913dcd91..750f4d803845 100644 --- a/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h +++ b/arch/blackfin/mach-bf518/include/mach/defBF51x_base.h @@ -1197,7 +1197,7 @@ #define SADD_LEN 0x0002 /* Slave Address Length */ #define STDVAL 0x0004 /* Slave Transmit Data Valid */ #define NAK 0x0008 /* NAK/ACK* Generated At Conclusion Of Transfer */ -#define GEN 0x0010 /* General Call Adrress Matching Enabled */ +#define GEN 0x0010 /* General Call Address Matching Enabled */ /* TWI_SLAVE_STAT Masks */ #define SDIR 0x0001 /* Slave Transfer Direction (Transmit/Receive*) */ diff --git a/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h b/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h index 09475034c6a1..e3ab78ef9001 100644 --- a/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h +++ b/arch/blackfin/mach-bf527/include/mach/defBF52x_base.h @@ -1206,7 +1206,7 @@ #define SADD_LEN 0x0002 /* Slave Address Length */ #define STDVAL 0x0004 /* Slave Transmit Data Valid */ #define NAK 0x0008 /* NAK/ACK* Generated At Conclusion Of Transfer */ -#define GEN 0x0010 /* General Call Adrress Matching Enabled */ +#define GEN 0x0010 /* General Call Address Matching Enabled 
*/ /* TWI_SLAVE_STAT Masks */ #define SDIR 0x0001 /* Slave Transfer Direction (Transmit/Receive*) */ diff --git a/arch/blackfin/mach-bf537/include/mach/defBF534.h b/arch/blackfin/mach-bf537/include/mach/defBF534.h index 0323e6bacdae..042064b2b44a 100644 --- a/arch/blackfin/mach-bf537/include/mach/defBF534.h +++ b/arch/blackfin/mach-bf537/include/mach/defBF534.h @@ -1523,7 +1523,7 @@ #define SADD_LEN 0x0002 /* Slave Address Length */ #define STDVAL 0x0004 /* Slave Transmit Data Valid */ #define NAK 0x0008 /* NAK/ACK* Generated At Conclusion Of Transfer */ -#define GEN 0x0010 /* General Call Adrress Matching Enabled */ +#define GEN 0x0010 /* General Call Address Matching Enabled */ /* TWI_SLAVE_STAT Masks */ #define SDIR 0x0001 /* Slave Transfer Direction (Transmit/Receive*) */ diff --git a/arch/blackfin/mach-bf538/include/mach/defBF539.h b/arch/blackfin/mach-bf538/include/mach/defBF539.h index 7a8ac5f44204..d85d128c97ca 100644 --- a/arch/blackfin/mach-bf538/include/mach/defBF539.h +++ b/arch/blackfin/mach-bf538/include/mach/defBF539.h @@ -2185,7 +2185,7 @@ #define SADD_LEN 0x0002 /* Slave Address Length */ #define STDVAL 0x0004 /* Slave Transmit Data Valid */ #define NAK 0x0008 /* NAK/ACK* Generated At Conclusion Of Transfer */ -#define GEN 0x0010 /* General Call Adrress Matching Enabled */ +#define GEN 0x0010 /* General Call Address Matching Enabled */ /* TWIx_SLAVE_STAT Masks */ #define SDIR 0x0001 /* Slave Transfer Direction (Transmit/Receive*) */ diff --git a/arch/cris/arch-v32/lib/nand_init.S b/arch/cris/arch-v32/lib/nand_init.S index e705f5cce969..d671fed451c9 100644 --- a/arch/cris/arch-v32/lib/nand_init.S +++ b/arch/cris/arch-v32/lib/nand_init.S @@ -139,7 +139,7 @@ copy_nand_to_ram: lsrq 8, $r4 move.b $r4, [$r1] ; Row address lsrq 8, $r4 - move.b $r4, [$r1] ; Row adddress + move.b $r4, [$r1] ; Row address moveq 20, $r4 2: bne 2b subq 1, $r4 diff --git a/arch/cris/include/asm/etraxgpio.h b/arch/cris/include/asm/etraxgpio.h index d474818a537e..461c089db765 100644 --- a/arch/cris/include/asm/etraxgpio.h +++ b/arch/cris/include/asm/etraxgpio.h @@ -1,5 +1,5 @@ /* - * The following devices are accessable using this driver using + * The following devices are accessible using this driver using * GPIO_MAJOR (120) and a couple of minor numbers. * * For ETRAX 100LX (CONFIG_ETRAX_ARCH_V10): diff --git a/arch/h8300/Kconfig.debug b/arch/h8300/Kconfig.debug index ee671c3f2c74..e8d1b236ad8c 100644 --- a/arch/h8300/Kconfig.debug +++ b/arch/h8300/Kconfig.debug @@ -48,7 +48,7 @@ config DEFAULT_CMDLINE builtin kernel commandline enabled. config KERNEL_COMMAND - string "Buildin commmand string" + string "Buildin command string" depends on DEFAULT_CMDLINE help builtin kernel commandline strings. 
diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c index fb8f9f59a1ed..f1e17d3d6cd9 100644 --- a/arch/ia64/kvm/mmio.c +++ b/arch/ia64/kvm/mmio.c @@ -130,7 +130,7 @@ static void mmio_access(struct kvm_vcpu *vcpu, u64 src_pa, u64 *dest, local_irq_save(psr); - /*Intercept the acces for PIB range*/ + /*Intercept the access for PIB range*/ if (iot == GPFN_PIB) { if (!dir) lsapic_write(vcpu, src_pa, s, *dest); diff --git a/arch/mips/alchemy/common/power.c b/arch/mips/alchemy/common/power.c index e5916a516e58..647e518c90bc 100644 --- a/arch/mips/alchemy/common/power.c +++ b/arch/mips/alchemy/common/power.c @@ -130,7 +130,7 @@ static void restore_core_regs(void) au_writel(sleep_usb[1], USBD_ENABLE); au_sync(); #else - /* enable accces to OTG memory */ + /* enable access to OTG memory */ au_writel(au_readl(USB_MSR_BASE + 4) | (1 << 6), USB_MSR_BASE + 4); au_sync(); diff --git a/arch/mips/include/asm/mach-powertv/ioremap.h b/arch/mips/include/asm/mach-powertv/ioremap.h index 076f2eeaa575..c86ef094ec37 100644 --- a/arch/mips/include/asm/mach-powertv/ioremap.h +++ b/arch/mips/include/asm/mach-powertv/ioremap.h @@ -88,7 +88,7 @@ static inline dma_addr_t _dma_to_phys_offset_raw(dma_addr_t dma) } /* These are not portable and should not be used in drivers. Drivers should - * be using ioremap() and friends to map physical addreses to virtual + * be using ioremap() and friends to map physical addresses to virtual * addresses and dma_map*() and friends to map virtual addresses into DMA * addresses and back. */ diff --git a/arch/mips/jz4740/board-qi_lb60.c b/arch/mips/jz4740/board-qi_lb60.c index 5742bb4d78f4..de4433c5a771 100644 --- a/arch/mips/jz4740/board-qi_lb60.c +++ b/arch/mips/jz4740/board-qi_lb60.c @@ -65,7 +65,7 @@ static struct nand_ecclayout qi_lb60_ecclayout_1gb = { /* Early prototypes of the QI LB60 had only 1GB of NAND. * In order to support these devices aswell the partition and ecc layout is - * initalized depending on the NAND size */ + * initialized depending on the NAND size */ static struct mtd_partition qi_lb60_partitions_1gb[] = { { .name = "NAND BOOT partition", @@ -464,7 +464,7 @@ static int __init qi_lb60_board_setup(void) board_gpio_setup(); if (qi_lb60_init_platform_devices()) - panic("Failed to initalize platform devices\n"); + panic("Failed to initialize platform devices\n"); return 0; } diff --git a/arch/mips/jz4740/gpio.c b/arch/mips/jz4740/gpio.c index 38f60f35156c..88e6aeda5bf1 100644 --- a/arch/mips/jz4740/gpio.c +++ b/arch/mips/jz4740/gpio.c @@ -546,7 +546,7 @@ static int __init jz4740_gpio_init(void) for (i = 0; i < ARRAY_SIZE(jz4740_gpio_chips); ++i) jz4740_gpio_chip_init(&jz4740_gpio_chips[i], i); - printk(KERN_INFO "JZ4740 GPIO initalized\n"); + printk(KERN_INFO "JZ4740 GPIO initialized\n"); return 0; } diff --git a/arch/mips/mti-malta/malta-memory.c b/arch/mips/mti-malta/malta-memory.c index b27419c84919..a96d281f9221 100644 --- a/arch/mips/mti-malta/malta-memory.c +++ b/arch/mips/mti-malta/malta-memory.c @@ -43,7 +43,7 @@ static struct prom_pmemblock mdesc[PROM_MAX_PMEMBLOCKS]; static char *mtypes[3] = { "Dont use memory", "YAMON PROM memory", - "Free memmory", + "Free memory", }; #endif diff --git a/arch/mips/pci/pcie-octeon.c b/arch/mips/pci/pcie-octeon.c index 385f035b24e4..0583c463e5f1 100644 --- a/arch/mips/pci/pcie-octeon.c +++ b/arch/mips/pci/pcie-octeon.c @@ -900,7 +900,7 @@ static int cvmx_pcie_rc_initialize(int pcie_port) mem_access_subid.s.ror = 0; /* Disable Relaxed Ordering for Writes. */ mem_access_subid.s.row = 0; - /* PCIe Adddress Bits <63:34>. 
*/ + /* PCIe Address Bits <63:34>. */ mem_access_subid.s.ba = 0; /* diff --git a/arch/mips/powertv/memory.c b/arch/mips/powertv/memory.c index 73880ad29bc2..fb3d29660c42 100644 --- a/arch/mips/powertv/memory.c +++ b/arch/mips/powertv/memory.c @@ -57,7 +57,7 @@ unsigned long ptv_memsize; /* - * struct low_mem_reserved - Items in low memmory that are reserved + * struct low_mem_reserved - Items in low memory that are reserved * @start: Physical address of item * @size: Size, in bytes, of this item * @is_aliased: True if this is RAM aliased from another location. If false, diff --git a/arch/mips/txx9/generic/pci.c b/arch/mips/txx9/generic/pci.c index 9a0be810cafa..96e69a00ffc8 100644 --- a/arch/mips/txx9/generic/pci.c +++ b/arch/mips/txx9/generic/pci.c @@ -107,7 +107,7 @@ int txx9_pci_mem_high __initdata; /* * allocate pci_controller and resources. - * mem_base, io_base: physical addresss. 0 for auto assignment. + * mem_base, io_base: physical address. 0 for auto assignment. * mem_size and io_size means max size on auto assignment. * pcic must be &txx9_primary_pcic or NULL. */ diff --git a/arch/powerpc/include/asm/8xx_immap.h b/arch/powerpc/include/asm/8xx_immap.h index 4b0e15206006..6b6dc20b0beb 100644 --- a/arch/powerpc/include/asm/8xx_immap.h +++ b/arch/powerpc/include/asm/8xx_immap.h @@ -93,7 +93,7 @@ typedef struct mem_ctlr { } memctl8xx_t; /*----------------------------------------------------------------------- - * BR - Memory Controler: Base Register 16-9 + * BR - Memory Controller: Base Register 16-9 */ #define BR_BA_MSK 0xffff8000 /* Base Address Mask */ #define BR_AT_MSK 0x00007000 /* Address Type Mask */ @@ -110,7 +110,7 @@ typedef struct mem_ctlr { #define BR_V 0x00000001 /* Bank Valid */ /*----------------------------------------------------------------------- - * OR - Memory Controler: Option Register 16-11 + * OR - Memory Controller: Option Register 16-11 */ #define OR_AM_MSK 0xffff8000 /* Address Mask Mask */ #define OR_ATM_MSK 0x00007000 /* Address Type Mask Mask */ diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index 7fd90d02d8c6..c4d2b7167568 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -1469,7 +1469,7 @@ static int cell_global_start(struct op_counter_config *ctr) * The pm_interval register is setup to write the SPU PC value into the * trace buffer at the maximum rate possible. The trace buffer is configured * to store the PCs, wrapping when it is full. The performance counter is - * intialized to the max hardware count minus the number of events, N, between + * initialized to the max hardware count minus the number of events, N, between * samples. Once the N events have occured, a HW counter overflow occurs * causing the generation of a HW counter interrupt which also stops the * writing of the SPU PC values to the trace buffer. 
Hence the last PC diff --git a/arch/powerpc/platforms/83xx/suspend-asm.S b/arch/powerpc/platforms/83xx/suspend-asm.S index 1930543c98d3..3d1ecd211776 100644 --- a/arch/powerpc/platforms/83xx/suspend-asm.S +++ b/arch/powerpc/platforms/83xx/suspend-asm.S @@ -231,7 +231,7 @@ _GLOBAL(mpc83xx_enter_deep_sleep) ori r4, r4, 0x002a mtspr SPRN_DBAT0L, r4 lis r8, TMP_VIRT_IMMR@h - ori r4, r8, 0x001e /* 1 MByte accessable from Kernel Space only */ + ori r4, r8, 0x001e /* 1 MByte accessible from Kernel Space only */ mtspr SPRN_DBAT0U, r4 isync @@ -241,7 +241,7 @@ _GLOBAL(mpc83xx_enter_deep_sleep) ori r4, r4, 0x002a mtspr SPRN_DBAT1L, r4 lis r9, (TMP_VIRT_IMMR + 0x01000000)@h - ori r4, r9, 0x001e /* 1 MByte accessable from Kernel Space only */ + ori r4, r9, 0x001e /* 1 MByte accessible from Kernel Space only */ mtspr SPRN_DBAT1U, r4 isync @@ -253,7 +253,7 @@ _GLOBAL(mpc83xx_enter_deep_sleep) li r4, 0x0002 mtspr SPRN_DBAT2L, r4 lis r4, KERNELBASE@h - ori r4, r4, 0x001e /* 1 MByte accessable from Kernel Space only */ + ori r4, r4, 0x001e /* 1 MByte accessible from Kernel Space only */ mtspr SPRN_DBAT2U, r4 isync diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index b341018326df..6c4b5837fc8a 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -566,10 +566,10 @@ static int ps3_setup_dynamic_device(const struct ps3_repository_device *repo) case PS3_DEV_TYPE_STOR_DISK: result = ps3_setup_storage_dev(repo, PS3_MATCH_ID_STOR_DISK); - /* Some devices are not accessable from the Other OS lpar. */ + /* Some devices are not accessible from the Other OS lpar. */ if (result == -ENODEV) { result = 0; - pr_debug("%s:%u: not accessable\n", __func__, + pr_debug("%s:%u: not accessible\n", __func__, __LINE__); } diff --git a/arch/powerpc/platforms/ps3/interrupt.c b/arch/powerpc/platforms/ps3/interrupt.c index 59d9712d7364..92290ff4761a 100644 --- a/arch/powerpc/platforms/ps3/interrupt.c +++ b/arch/powerpc/platforms/ps3/interrupt.c @@ -44,7 +44,7 @@ * @lock: * @ipi_debug_brk_mask: * - * The HV mantains per SMT thread mappings of HV outlet to HV plug on + * The HV maintains per SMT thread mappings of HV outlet to HV plug on * behalf of the guest. These mappings are implemented as 256 bit guest * supplied bitmaps indexed by plug number. The addresses of the bitmaps * are registered with the HV through lv1_configure_irq_state_bitmap(). diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c index eb4cc4ec7952..d1bffbcd9d52 100644 --- a/arch/sh/mm/cache-sh5.c +++ b/arch/sh/mm/cache-sh5.c @@ -568,7 +568,7 @@ static void sh5_flush_dcache_page(void *page) } /* - * Flush the range [start,end] of kernel virtual adddress space from + * Flush the range [start,end] of kernel virtual address space from * the I-cache. The corresponding range must be purged from the * D-cache also because the SH-5 doesn't have cache snooping between * the caches. 
The addresses will be visible through the superpage diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 42ad2ba85010..1e9770936c3b 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -622,7 +622,7 @@ static const char CHAFSR_PERR_msg[] = static const char CHAFSR_IERR_msg[] = "Internal processor error"; static const char CHAFSR_ISAP_msg[] = - "System request parity error on incoming addresss"; + "System request parity error on incoming address"; static const char CHAFSR_UCU_msg[] = "Uncorrectable E-cache ECC error for ifetch/data"; static const char CHAFSR_UCC_msg[] = diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 271de94c3810..b4389a468fb6 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, - unsigned long adddress) + unsigned long address) { ___pmd_free_tlb(tlb, pmd); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cae9c3cb95cf..c3e6d9a3de65 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -902,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); /* * The below -8 is to reserve 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" - * is accessable even if the CPU haven't stored the SS/ESP registers + * is accessible even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers * when switching to the same priv ring). * Therefore beware: accessing the ss/esp fields of the diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d2fdb0826df2..57ca77787220 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1086,7 +1086,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, dma_dom->aperture_size += APERTURE_RANGE_SIZE; - /* Intialize the exclusion range if necessary */ + /* Initialize the exclusion range if necessary */ for_each_iommu(iommu) { if (iommu->exclusion_start && iommu->exclusion_start >= dma_dom->aperture[index]->offset @@ -1353,7 +1353,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) /* * Allocates a new protection domain usable for the dma_ops functions. 
- * It also intializes the page table and the address allocator data + * It also initializes the page table and the address allocator data * structures required for the dma_ops interface */ static struct dma_ops_domain *dma_ops_domain_alloc(void) diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c index 65df603622b2..25bfdbb5b130 100644 --- a/arch/x86/kernel/early_printk_mrst.c +++ b/arch/x86/kernel/early_printk_mrst.c @@ -103,7 +103,7 @@ struct dw_spi_reg { static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; static u32 *pclk_spi0; -/* Always contains an accessable address, start with 0 */ +/* Always contains an accessible address, start with 0 */ static struct dw_spi_reg *pspi; static struct kmsg_dumper dw_dumper; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bcece91dd311..8e09aa3aa6d6 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -124,7 +124,7 @@ ENTRY(startup_32) movsl movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi - jz 1f # No comand line + jz 1f # No command line movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx rep diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4cd59b0d7c15..78ee4b1d4e85 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1030,7 +1030,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) /* * Add group onto cgroup list. It might happen that bdi->dev is - * not initiliazed yet. Initialize this new group without major + * not initialized yet. Initialize this new group without major * and minor info and this info will be filled in once a new thread * comes for IO. See code above. */ diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h index bdbfaf22bd14..962a3ccff6fd 100644 --- a/drivers/acpi/acpica/acobject.h +++ b/drivers/acpi/acpica/acobject.h @@ -93,7 +93,7 @@ #define AOPOBJ_AML_CONSTANT 0x01 /* Integer is an AML constant */ #define AOPOBJ_STATIC_POINTER 0x02 /* Data is part of an ACPI table, don't delete */ -#define AOPOBJ_DATA_VALID 0x04 /* Object is intialized and data is valid */ +#define AOPOBJ_DATA_VALID 0x04 /* Object is initialized and data is valid */ #define AOPOBJ_OBJECT_INITIALIZED 0x08 /* Region is initialized, _REG was run */ #define AOPOBJ_SETUP_COMPLETE 0x10 /* Region setup is complete */ #define AOPOBJ_INVALID 0x20 /* Host OS won't allow a Region address */ diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 7f77c67d267c..5b0295a923a7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2240,7 +2240,7 @@ int ata_dev_configure(struct ata_device *dev) if (id[ATA_ID_CFA_KEY_MGMT] & 1) ata_dev_printk(dev, KERN_WARNING, "supports DRM functions and may " - "not be fully accessable.\n"); + "not be fully accessible.\n"); snprintf(revbuf, 7, "CFA"); } else { snprintf(revbuf, 7, "ATA-%d", ata_id_major_version(id)); @@ -2248,7 +2248,7 @@ int ata_dev_configure(struct ata_device *dev) if (ata_id_has_tpm(id)) ata_dev_printk(dev, KERN_WARNING, "supports DRM functions and may " - "not be fully accessable.\n"); + "not be fully accessible.\n"); } dev->n_sectors = ata_id_n_sectors(id); diff --git a/drivers/ata/sata_vsc.c b/drivers/ata/sata_vsc.c index b777176ff494..e079cf29ed5d 100644 --- a/drivers/ata/sata_vsc.c +++ b/drivers/ata/sata_vsc.c @@ -370,7 +370,7 @@ static int __devinit vsc_sata_init_one(struct pci_dev *pdev, if (pci_resource_len(pdev, 0) == 0) return -ENODEV; - /* map IO regions and intialize host accordingly */ + /* 
map IO regions and initialize host accordingly */ rc = pcim_iomap_regions(pdev, 1 << VSC_MMIO_BAR, DRV_NAME); if (rc == -EBUSY) pcim_pin_device(pdev); diff --git a/drivers/atm/idt77252.h b/drivers/atm/idt77252.h index 5042bb2dab15..f53a43ae2bbe 100644 --- a/drivers/atm/idt77252.h +++ b/drivers/atm/idt77252.h @@ -572,7 +572,7 @@ struct idt77252_dev #define SAR_STAT_TSQF 0x00001000 /* Transmit Status Queue full */ #define SAR_STAT_TMROF 0x00000800 /* Timer overflow */ #define SAR_STAT_PHYI 0x00000400 /* PHY device Interrupt flag */ -#define SAR_STAT_CMDBZ 0x00000200 /* ABR SAR Comand Busy Flag */ +#define SAR_STAT_CMDBZ 0x00000200 /* ABR SAR Command Busy Flag */ #define SAR_STAT_FBQ3A 0x00000100 /* Free Buffer Queue 3 Attention */ #define SAR_STAT_FBQ2A 0x00000080 /* Free Buffer Queue 2 Attention */ #define SAR_STAT_RSQF 0x00000040 /* Receive Status Queue full */ diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index 9309d4724e13..ad8e207784d4 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -2063,7 +2063,7 @@ static int tx_init(struct atm_dev *dev) - UBR Table size is 4K - UBR wait queue is 4K since the table and wait queues are contiguous, all the bytes - can be initialized by one memeset. + can be initialized by one memeset. */ vcsize_sel = 0; @@ -2089,7 +2089,7 @@ static int tx_init(struct atm_dev *dev) - ABR Table size is 2K - ABR wait queue is 2K since the table and wait queues are contiguous, all the bytes - can be intialized by one memeset. + can be initialized by one memeset. */ i = ABR_SCHED_TABLE * iadev->memSize; writew((i >> 11) & 0xffff, iadev->seg_reg+ABR_SBPTR_BASE); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 31b526661ec4..c5356ef7e54e 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -8,7 +8,7 @@ * * * The driver model core calls device_pm_add() when a device is registered. - * This will intialize the embedded device_pm_info object in the device + * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c index 338bc4eed1f3..da8b7f161cdb 100644 --- a/drivers/dma/intel_mid_dma.c +++ b/drivers/dma/intel_mid_dma.c @@ -1060,8 +1060,8 @@ static irqreturn_t intel_mid_dma_interrupt2(int irq, void *data) * mid_setup_dma - Setup the DMA controller * @pdev: Controller PCI device structure * - * Initilize the DMA controller, channels, registers with DMA engine, - * ISR. Initilize DMA controller channels. + * Initialize the DMA controller, channels, registers with DMA engine, + * ISR. Initialize DMA controller channels. */ static int mid_setup_dma(struct pci_dev *pdev) { @@ -1219,7 +1219,7 @@ static void middma_shutdown(struct pci_dev *pdev) * @pdev: Controller PCI device structure * @id: pci device id structure * - * Initilize the PCI device, map BARs, query driver data. + * Initialize the PCI device, map BARs, query driver data. 
* Call setup_dma to complete contoller and chan initilzation */ static int __devinit intel_mid_dma_probe(struct pci_dev *pdev, diff --git a/drivers/edac/amd8131_edac.h b/drivers/edac/amd8131_edac.h index 60e0d1c72dee..6f8b07131ec4 100644 --- a/drivers/edac/amd8131_edac.h +++ b/drivers/edac/amd8131_edac.h @@ -99,7 +99,7 @@ struct amd8131_dev_info { /* * AMD8131 chipset has two pairs of PCIX Bridge and related IOAPIC - * Controler, and ATCA-6101 has two AMD8131 chipsets, so there are + * Controller, and ATCA-6101 has two AMD8131 chipsets, so there are * four PCIX Bridges on ATCA-6101 altogether. * * These PCIX Bridges share the same PCI Device ID and are all of diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c index c973004c002c..db1df59ae2b6 100644 --- a/drivers/edac/cell_edac.c +++ b/drivers/edac/cell_edac.c @@ -47,7 +47,7 @@ static void cell_edac_count_ce(struct mem_ctl_info *mci, int chan, u64 ar) offset = address & ~PAGE_MASK; syndrome = (ar & 0x000000001fe00000ul) >> 21; - /* TODO: Decoding of the error addresss */ + /* TODO: Decoding of the error address */ edac_mc_handle_ce(mci, csrow->first_page + pfn, offset, syndrome, 0, chan, ""); } @@ -68,7 +68,7 @@ static void cell_edac_count_ue(struct mem_ctl_info *mci, int chan, u64 ar) pfn = address >> PAGE_SHIFT; offset = address & ~PAGE_MASK; - /* TODO: Decoding of the error addresss */ + /* TODO: Decoding of the error address */ edac_mc_handle_ue(mci, csrow->first_page + pfn, offset, 0, ""); } diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index d7ca43a828bd..035eaaf2d906 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -258,7 +258,7 @@ enum scrub_type { * for single channel are 64 bits, for dual channel 128 * bits. * - * Single-Ranked stick: A Single-ranked stick has 1 chip-select row of memmory. + * Single-Ranked stick: A Single-ranked stick has 1 chip-select row of memory. * Motherboards commonly drive two chip-select pins to * a memory stick. A single-ranked stick, will occupy * only one of those rows. The other will be unused. diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index 070cea41b661..b9f0c20df1aa 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -873,7 +873,7 @@ ppc4xx_edac_get_mtype(u32 mcopt1) } /** - * ppc4xx_edac_init_csrows - intialize driver instance rows + * ppc4xx_edac_init_csrows - initialize driver instance rows * @mci: A pointer to the EDAC memory controller instance * associated with the ibm,sdram-4xx-ddr2 controller for which * the csrows (i.e. banks/ranks) are being initialized. @@ -881,7 +881,7 @@ ppc4xx_edac_get_mtype(u32 mcopt1) * currently set for the controller, from which bank width * and memory typ information is derived. * - * This routine intializes the virtual "chip select rows" associated + * This routine initializes the virtual "chip select rows" associated * with the EDAC memory controller instance. An ibm,sdram-4xx-ddr2 * controller bank/rank is mapped to a row. * @@ -992,7 +992,7 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1) } /** - * ppc4xx_edac_mc_init - intialize driver instance + * ppc4xx_edac_mc_init - initialize driver instance * @mci: A pointer to the EDAC memory controller instance being * initialized. 
* @op: A pointer to the OpenFirmware device tree node associated diff --git a/drivers/gpu/drm/radeon/atombios.h b/drivers/gpu/drm/radeon/atombios.h index fe359a239df3..4707cbdb35cd 100644 --- a/drivers/gpu/drm/radeon/atombios.h +++ b/drivers/gpu/drm/radeon/atombios.h @@ -1314,7 +1314,7 @@ typedef struct _GET_ENGINE_CLOCK_PARAMETERS typedef struct _READ_EDID_FROM_HW_I2C_DATA_PARAMETERS { USHORT usPrescale; //Ratio between Engine clock and I2C clock - USHORT usVRAMAddress; //Adress in Frame Buffer where to pace raw EDID + USHORT usVRAMAddress; //Address in Frame Buffer where to pace raw EDID USHORT usStatus; //When use output: lower byte EDID checksum, high byte hardware status //WHen use input: lower byte as 'byte to read':currently limited to 128byte or 1byte UCHAR ucSlaveAddr; //Read from which slave diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index c9fffd0389fe..2bd3469b3071 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -434,7 +434,7 @@ static int read_i2c(struct nmk_i2c_dev *dev) } if (timeout == 0) { - /* controler has timedout, re-init the h/w */ + /* controller has timedout, re-init the h/w */ dev_err(&dev->pdev->dev, "controller timed out, re-init h/w\n"); (void) init_hw(dev); status = -ETIMEDOUT; @@ -498,7 +498,7 @@ static int write_i2c(struct nmk_i2c_dev *dev) } if (timeout == 0) { - /* controler has timedout, re-init the h/w */ + /* controller has timedout, re-init the h/w */ dev_err(&dev->pdev->dev, "controller timed out, re-init h/w\n"); (void) init_hw(dev); status = -ETIMEDOUT; diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h index 4bb997aa39d0..83d2e19d31ae 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_wr.h +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -689,7 +689,7 @@ struct t3_swrq { * A T3 WQ implements both the SQ and RQ. */ struct t3_wq { - union t3_wr *queue; /* DMA accessable memory */ + union t3_wr *queue; /* DMA accessible memory */ dma_addr_t dma_addr; /* DMA address for HW */ DEFINE_DMA_UNMAP_ADDR(mapping); /* unmap kruft */ u32 error; /* 1 once we go to ERROR */ diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 584d443b5335..ead2627c06a8 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -314,7 +314,7 @@ MODULE_PARM_DESC(txselect, \ #define krp_serdesctrl KREG_IBPORT_IDX(IBSerdesCtrl) /* - * Per-context kernel registers. Acess only with qib_read_kreg_ctxt() + * Per-context kernel registers. 
Access only with qib_read_kreg_ctxt() * or qib_write_kreg_ctxt() */ #define krc_rcvhdraddr KREG_IDX(RcvHdrAddr0) diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 06ea8da95c62..6675bd7dde19 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -610,7 +610,7 @@ config TOUCHSCREEN_USB_ZYTRONIC config TOUCHSCREEN_USB_ETT_TC45USB default y - bool "ET&T USB series TC4UM/TC5UH touchscreen controler support" if EMBEDDED + bool "ET&T USB series TC4UM/TC5UH touchscreen controller support" if EMBEDDED depends on TOUCHSCREEN_USB_COMPOSITE config TOUCHSCREEN_USB_NEXIO diff --git a/drivers/isdn/gigaset/bas-gigaset.c b/drivers/isdn/gigaset/bas-gigaset.c index 178942a2ee61..8a3c5cfc4fea 100644 --- a/drivers/isdn/gigaset/bas-gigaset.c +++ b/drivers/isdn/gigaset/bas-gigaset.c @@ -2318,7 +2318,7 @@ static int gigaset_probe(struct usb_interface *interface, __func__, le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); - /* allocate memory for our device state and intialize it */ + /* allocate memory for our device state and initialize it */ cs = gigaset_initcs(driver, BAS_CHANNELS, 0, 0, cidmode, GIGASET_MODULENAME); if (!cs) @@ -2576,7 +2576,7 @@ static int __init bas_gigaset_init(void) { int result; - /* allocate memory for our driver state and intialize it */ + /* allocate memory for our driver state and initialize it */ driver = gigaset_initdriver(GIGASET_MINOR, GIGASET_MINORS, GIGASET_MODULENAME, GIGASET_DEVNAME, &gigops, THIS_MODULE); diff --git a/drivers/isdn/gigaset/ser-gigaset.c b/drivers/isdn/gigaset/ser-gigaset.c index d151dcbf770d..0ef09d0eb96b 100644 --- a/drivers/isdn/gigaset/ser-gigaset.c +++ b/drivers/isdn/gigaset/ser-gigaset.c @@ -513,7 +513,7 @@ gigaset_tty_open(struct tty_struct *tty) return -ENODEV; } - /* allocate memory for our device state and intialize it */ + /* allocate memory for our device state and initialize it */ cs = gigaset_initcs(driver, 1, 1, 0, cidmode, GIGASET_MODULENAME); if (!cs) goto error; @@ -771,7 +771,7 @@ static int __init ser_gigaset_init(void) return rc; } - /* allocate memory for our driver state and intialize it */ + /* allocate memory for our driver state and initialize it */ driver = gigaset_initdriver(GIGASET_MINOR, GIGASET_MINORS, GIGASET_MODULENAME, GIGASET_DEVNAME, &ops, THIS_MODULE); diff --git a/drivers/isdn/gigaset/usb-gigaset.c b/drivers/isdn/gigaset/usb-gigaset.c index 4a66338f4e7d..5e3300d8a2a5 100644 --- a/drivers/isdn/gigaset/usb-gigaset.c +++ b/drivers/isdn/gigaset/usb-gigaset.c @@ -695,7 +695,7 @@ static int gigaset_probe(struct usb_interface *interface, dev_info(&udev->dev, "%s: Device matched ... 
!\n", __func__); - /* allocate memory for our device state and intialize it */ + /* allocate memory for our device state and initialize it */ cs = gigaset_initcs(driver, 1, 1, 0, cidmode, GIGASET_MODULENAME); if (!cs) return -ENODEV; @@ -894,7 +894,7 @@ static int __init usb_gigaset_init(void) { int result; - /* allocate memory for our driver state and intialize it */ + /* allocate memory for our driver state and initialize it */ driver = gigaset_initdriver(GIGASET_MINOR, GIGASET_MINORS, GIGASET_MODULENAME, GIGASET_DEVNAME, &ops, THIS_MODULE); diff --git a/drivers/isdn/hardware/mISDN/ipac.h b/drivers/isdn/hardware/mISDN/ipac.h index 74a6ccf9065c..8121e046b739 100644 --- a/drivers/isdn/hardware/mISDN/ipac.h +++ b/drivers/isdn/hardware/mISDN/ipac.h @@ -29,7 +29,7 @@ struct isac_hw { u32 type; u32 off; /* offset to isac regs */ char *name; - spinlock_t *hwlock; /* lock HW acccess */ + spinlock_t *hwlock; /* lock HW access */ read_reg_func *read_reg; write_reg_func *write_reg; fifo_func *read_fifo; @@ -70,7 +70,7 @@ struct ipac_hw { struct hscx_hw hscx[2]; char *name; void *hw; - spinlock_t *hwlock; /* lock HW acccess */ + spinlock_t *hwlock; /* lock HW access */ struct module *owner; u32 type; read_reg_func *read_reg; diff --git a/drivers/isdn/hardware/mISDN/isar.h b/drivers/isdn/hardware/mISDN/isar.h index 4a134acd44d0..9962bdf699c7 100644 --- a/drivers/isdn/hardware/mISDN/isar.h +++ b/drivers/isdn/hardware/mISDN/isar.h @@ -44,7 +44,7 @@ struct isar_ch { struct isar_hw { struct isar_ch ch[2]; void *hw; - spinlock_t *hwlock; /* lock HW acccess */ + spinlock_t *hwlock; /* lock HW access */ char *name; struct module *owner; read_reg_func *read_reg; diff --git a/drivers/isdn/hisax/isar.c b/drivers/isdn/hisax/isar.c index 40b914bded8c..2e72227bd071 100644 --- a/drivers/isdn/hisax/isar.c +++ b/drivers/isdn/hisax/isar.c @@ -1427,8 +1427,8 @@ modeisar(struct BCState *bcs, int mode, int bc) &bcs->hw.isar.reg->Flags)) bcs->hw.isar.dpath = 1; else { - printk(KERN_WARNING"isar modeisar analog funktions only with DP1\n"); - debugl1(cs, "isar modeisar analog funktions only with DP1"); + printk(KERN_WARNING"isar modeisar analog functions only with DP1\n"); + debugl1(cs, "isar modeisar analog functions only with DP1"); return(1); } break; diff --git a/drivers/media/video/cx25840/cx25840-ir.c b/drivers/media/video/cx25840/cx25840-ir.c index 97a4e9b25fe4..a9d83b5d0e49 100644 --- a/drivers/media/video/cx25840/cx25840-ir.c +++ b/drivers/media/video/cx25840/cx25840-ir.c @@ -261,7 +261,7 @@ static u16 ns_to_pulse_width_count(u32 ns, u16 divider) u32 rem; /* - * The 2 lsb's of the pulse width timer count are not accessable, hence + * The 2 lsb's of the pulse width timer count are not accessible, hence * the (1 << 2) */ n = ((u64) ns) * CX25840_IR_REFCLK_FREQ / 1000000; /* millicycles */ diff --git a/drivers/media/video/davinci/vpif.h b/drivers/media/video/davinci/vpif.h index 188841b476e0..ebd5c4338ebb 100644 --- a/drivers/media/video/davinci/vpif.h +++ b/drivers/media/video/davinci/vpif.h @@ -33,7 +33,7 @@ extern spinlock_t vpif_lock; #define regr(reg) readl((reg) + vpif_base) #define regw(value, reg) writel(value, (reg + vpif_base)) -/* Register Addresss Offsets */ +/* Register Address Offsets */ #define VPIF_PID (0x0000) #define VPIF_CH0_CTRL (0x0004) #define VPIF_CH1_CTRL (0x0008) diff --git a/drivers/media/video/davinci/vpss.c b/drivers/media/video/davinci/vpss.c index 7918680917d0..3e5cf27ec2b2 100644 --- a/drivers/media/video/davinci/vpss.c +++ b/drivers/media/video/davinci/vpss.c @@ -85,7 +85,7 @@ enum 
vpss_platform_type { /* * vpss operations. Depends on platform. Not all functions are available * on all platforms. The api, first check if a functio is available before - * invoking it. In the probe, the function ptrs are intialized based on + * invoking it. In the probe, the function ptrs are initialized based on * vpss name. vpss name can be "dm355_vpss", "dm644x_vpss" etc. */ struct vpss_hw_ops { diff --git a/drivers/media/video/omap/omap_vout.c b/drivers/media/video/omap/omap_vout.c index 15f8793e325b..21d7c0d8f51f 100644 --- a/drivers/media/video/omap/omap_vout.c +++ b/drivers/media/video/omap/omap_vout.c @@ -1286,7 +1286,7 @@ static int omap_vout_release(struct file *file) videobuf_mmap_free(q); /* Even if apply changes fails we should continue - freeing allocated memeory */ + freeing allocated memory */ if (vout->streaming) { u32 mask = 0; diff --git a/drivers/media/video/saa7164/saa7164-core.c b/drivers/media/video/saa7164/saa7164-core.c index e1bac5051460..cdd8e348a543 100644 --- a/drivers/media/video/saa7164/saa7164-core.c +++ b/drivers/media/video/saa7164/saa7164-core.c @@ -653,8 +653,8 @@ static irqreturn_t saa7164_irq(int irq, void *dev_id) goto out; } - /* Check that the hardware is accessable. If the status bytes are - * 0xFF then the device is not accessable, the the IRQ belongs + /* Check that the hardware is accessible. If the status bytes are + * 0xFF then the device is not accessible, the the IRQ belongs * to another driver. * 4 x u32 interrupt registers. */ diff --git a/drivers/media/video/sn9c102/sn9c102_sensor.h b/drivers/media/video/sn9c102/sn9c102_sensor.h index 494957b10bac..7f38549715b6 100644 --- a/drivers/media/video/sn9c102/sn9c102_sensor.h +++ b/drivers/media/video/sn9c102/sn9c102_sensor.h @@ -147,7 +147,7 @@ enum sn9c102_i2c_interface { struct sn9c102_sensor { char name[32], /* sensor name */ - maintainer[64]; /* name of the mantainer */ + maintainer[64]; /* name of the maintainer */ enum sn9c102_bridge supported_bridge; /* supported SN9C1xx bridges */ diff --git a/drivers/media/video/zoran/zoran.h b/drivers/media/video/zoran/zoran.h index 37fe16181e3c..1f04438695dd 100644 --- a/drivers/media/video/zoran/zoran.h +++ b/drivers/media/video/zoran/zoran.h @@ -95,7 +95,7 @@ struct zoran_params { int quality; /* Measure for quality of compressed images. * Scales linearly with the size of the compressed images. - * Must be beetween 0 and 100, 100 is a compression + * Must be between 0 and 100, 100 is a compression * ratio of 1:4 */ int odd_even; /* Which field should come first ??? 
*/ diff --git a/drivers/message/fusion/lsi/mpi_log_sas.h b/drivers/message/fusion/lsi/mpi_log_sas.h index 691620dbedd2..8b04810df469 100644 --- a/drivers/message/fusion/lsi/mpi_log_sas.h +++ b/drivers/message/fusion/lsi/mpi_log_sas.h @@ -268,7 +268,7 @@ /* Compatibility Error : IR Disabled */ #define IR_LOGINFO_COMPAT_ERROR_RAID_DISABLED (0x00010030) -/* Compatibility Error : Inquiry Comand failed */ +/* Compatibility Error : Inquiry Command failed */ #define IR_LOGINFO_COMPAT_ERROR_INQUIRY_FAILED (0x00010031) /* Compatibility Error : Device not direct access device */ #define IR_LOGINFO_COMPAT_ERROR_NOT_DIRECT_ACCESS (0x00010032) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 3e57b61ca446..3358c0af3466 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -7977,7 +7977,7 @@ mpt_spi_log_info(MPT_ADAPTER *ioc, u32 log_info) NULL, /* 2Eh */ NULL, /* 2Fh */ "Compatibility Error: IR Disabled", /* 30h */ - "Compatibility Error: Inquiry Comand Failed", /* 31h */ + "Compatibility Error: Inquiry Command Failed", /* 31h */ "Compatibility Error: Device not Direct Access " "Device ", /* 32h */ "Compatibility Error: Removable Device Found", /* 33h */ diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 83a5115f0251..5651f77a8cc1 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1146,7 +1146,7 @@ mptsas_target_reset_queue(MPT_ADAPTER *ioc, * * This function will delete scheduled target reset from the list and * try to send next target reset. This will be called from completion - * context of any Task managment command. + * context of any Task management command. */ void diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index f87a9d405a5e..ae7cad185898 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -309,7 +309,7 @@ static inline void i2o_block_request_free(struct i2o_block_request *ireq) * @ireq: I2O block request * @mptr: message body pointer * - * Builds the SG list and map it to be accessable by the controller. + * Builds the SG list and map it to be accessible by the controller. * * Returns 0 on failure or 1 on success. 
*/ diff --git a/drivers/misc/arm-charlcd.c b/drivers/misc/arm-charlcd.c index 9e3879ef58f2..fe8616a8d287 100644 --- a/drivers/misc/arm-charlcd.c +++ b/drivers/misc/arm-charlcd.c @@ -313,7 +313,7 @@ static int __init charlcd_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&lcd->init_work, charlcd_init_work); schedule_delayed_work(&lcd->init_work, 0); - dev_info(&pdev->dev, "initalized ARM character LCD at %08x\n", + dev_info(&pdev->dev, "initialized ARM character LCD at %08x\n", lcd->phybase); return 0; diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 217f82037fc1..bfc8a8ae55df 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -257,7 +257,7 @@ static u32 get_card_status(struct mmc_card *card, struct request *req) cmd.flags = MMC_RSP_SPI_R2 | MMC_RSP_R1 | MMC_CMD_AC; err = mmc_wait_for_cmd(card->host, &cmd, 0); if (err) - printk(KERN_ERR "%s: error %d sending status comand", + printk(KERN_ERR "%s: error %d sending status command", req->rq_disk->disk_name, err); return cmd.resp[0]; } diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index d618e8673996..c99a1fcd97f6 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -462,7 +462,7 @@ config MMC_SH_MMCIF tristate "SuperH Internal MMCIF support" depends on MMC_BLOCK && (SUPERH || ARCH_SHMOBILE) help - This selects the MMC Host Interface controler (MMCIF). + This selects the MMC Host Interface controller (MMCIF). This driver supports MMCIF in sh7724/sh7757/sh7372. diff --git a/drivers/mmc/host/au1xmmc.c b/drivers/mmc/host/au1xmmc.c index 41e5a60493ad..ef72e874ca36 100644 --- a/drivers/mmc/host/au1xmmc.c +++ b/drivers/mmc/host/au1xmmc.c @@ -192,7 +192,7 @@ static inline void SEND_STOP(struct au1xmmc_host *host) au_writel(config2 | SD_CONFIG2_DF, HOST_CONFIG2(host)); au_sync(); - /* Send the stop commmand */ + /* Send the stop command */ au_writel(STOP_CMD, HOST_CMD(host)); } diff --git a/drivers/mmc/host/sdricoh_cs.c b/drivers/mmc/host/sdricoh_cs.c index f472c2714eb8..bbc298fd2a15 100644 --- a/drivers/mmc/host/sdricoh_cs.c +++ b/drivers/mmc/host/sdricoh_cs.c @@ -446,7 +446,7 @@ static int sdricoh_init_mmc(struct pci_dev *pci_dev, mmc->max_seg_size = 1024 * 512; mmc->max_blk_size = 512; - /* reset the controler */ + /* reset the controller */ if (sdricoh_reset(host)) { dev_dbg(dev, "could not reset\n"); result = -EIO; @@ -478,7 +478,7 @@ static int sdricoh_pcmcia_probe(struct pcmcia_device *pcmcia_dev) dev_info(&pcmcia_dev->dev, "Searching MMC controller for pcmcia device" " %s %s ...\n", pcmcia_dev->prod_id[0], pcmcia_dev->prod_id[1]); - /* search pci cardbus bridge that contains the mmc controler */ + /* search pci cardbus bridge that contains the mmc controller */ /* the io region is already claimed by yenta_socket... */ while ((pci_dev = pci_get_device(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_RL5C476, diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 1f75a1b1f7c3..31bf376b82a0 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -821,7 +821,7 @@ retry: * * Wait for command done. This is a helper function for nand_wait used when * we are in interrupt context. May happen when in panic and trying to write - * an oops trough mtdoops. + * an oops through mtdoops. 
*/ static void panic_nand_wait(struct mtd_info *mtd, struct nand_chip *chip, unsigned long timeo) diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c index e9ad16f00b56..4a6f0eac8b12 100644 --- a/drivers/net/bnx2x/bnx2x_main.c +++ b/drivers/net/bnx2x/bnx2x_main.c @@ -4947,7 +4947,7 @@ static int bnx2x_init_hw_common(struct bnx2x *bp, u32 load_code) memset(&ilt_cli, 0, sizeof(struct ilt_client_info)); memset(&ilt, 0, sizeof(struct bnx2x_ilt)); - /* initalize dummy TM client */ + /* initialize dummy TM client */ ilt_cli.start = 0; ilt_cli.end = ILT_NUM_PAGE_ENTRIES - 1; ilt_cli.client_num = ILT_CLIENT_TM; diff --git a/drivers/net/bnx2x/bnx2x_reg.h b/drivers/net/bnx2x/bnx2x_reg.h index 1cefe489a955..245220af9feb 100644 --- a/drivers/net/bnx2x/bnx2x_reg.h +++ b/drivers/net/bnx2x/bnx2x_reg.h @@ -1604,7 +1604,7 @@ (~misc_registers_sw_timer_cfg_4.sw_timer_cfg_4[1] ) is set */ #define MISC_REG_SW_TIMER_RELOAD_VAL_4 0xa2fc /* [RW 32] the value of the counter for sw timers1-8. there are 8 addresses - in this register. addres 0 - timer 1; address 1 - timer 2, ... address 7 - + in this register. address 0 - timer 1; address 1 - timer 2, ... address 7 - timer 8 */ #define MISC_REG_SW_TIMER_VAL 0xa5c0 /* [RW 1] Set by the MCP to remember if one or more of the drivers is/are diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 881914bc4e9c..106718c1be5b 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -840,7 +840,7 @@ static int ad_lacpdu_send(struct port *port) lacpdu_header = (struct lacpdu_header *)skb_put(skb, length); memcpy(lacpdu_header->hdr.h_dest, lacpdu_mcast_addr, ETH_ALEN); - /* Note: source addres is set to be the member's PERMANENT address, + /* Note: source address is set to be the member's PERMANENT address, because we use it to identify loopback lacpdus in receive. */ memcpy(lacpdu_header->hdr.h_source, slave->perm_hwaddr, ETH_ALEN); lacpdu_header->hdr.h_proto = PKT_TYPE_LACPDU; @@ -881,7 +881,7 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker) marker_header = (struct bond_marker_header *)skb_put(skb, length); memcpy(marker_header->hdr.h_dest, lacpdu_mcast_addr, ETH_ALEN); - /* Note: source addres is set to be the member's PERMANENT address, + /* Note: source address is set to be the member's PERMANENT address, because we use it to identify loopback MARKERs in receive. */ memcpy(marker_header->hdr.h_source, slave->perm_hwaddr, ETH_ALEN); marker_header->hdr.h_proto = PKT_TYPE_LACPDU; @@ -1916,7 +1916,7 @@ int bond_3ad_bind_slave(struct slave *slave) return -1; } - //check that the slave has not been intialized yet. + //check that the slave has not been initialized yet. if (SLAVE_AD_INFO(slave).port.slave != slave) { // port initialization diff --git a/drivers/net/chelsio/subr.c b/drivers/net/chelsio/subr.c index 63ebf76d2390..8a43c7e19701 100644 --- a/drivers/net/chelsio/subr.c +++ b/drivers/net/chelsio/subr.c @@ -556,7 +556,7 @@ struct chelsio_vpd_t { #define EEPROM_MAX_POLL 4 /* - * Read SEEPROM. A zero is written to the flag register when the addres is + * Read SEEPROM. A zero is written to the flag register when the address is * written to the Control register. The hardware device will set the flag to a * one when 4B have been transferred to the Data register. 
*/ diff --git a/drivers/net/cxgb3/mc5.c b/drivers/net/cxgb3/mc5.c index a8766fb2f9ab..e13b7fe9d082 100644 --- a/drivers/net/cxgb3/mc5.c +++ b/drivers/net/cxgb3/mc5.c @@ -318,7 +318,7 @@ static void mc5_dbgi_mode_disable(const struct mc5 *mc5) /* * Initialization that requires the OS and protocol layers to already - * be intialized goes here. + * be initialized goes here. */ int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, unsigned int nroutes) diff --git a/drivers/net/cxgb3/t3_hw.c b/drivers/net/cxgb3/t3_hw.c index 3a6adf0b3e9d..0b197043bc34 100644 --- a/drivers/net/cxgb3/t3_hw.c +++ b/drivers/net/cxgb3/t3_hw.c @@ -607,7 +607,7 @@ struct t3_vpd { * * Read a 32-bit word from a location in VPD EEPROM using the card's PCI * VPD ROM capability. A zero is written to the flag bit when the - * addres is written to the control register. The hardware device will + * address is written to the control register. The hardware device will * set the flag to 1 when 4 bytes have been read into the data register. */ int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data) diff --git a/drivers/net/e1000/e1000_hw.h b/drivers/net/e1000/e1000_hw.h index ecd9f6c6bcd5..5e820f4e68b9 100644 --- a/drivers/net/e1000/e1000_hw.h +++ b/drivers/net/e1000/e1000_hw.h @@ -41,7 +41,7 @@ struct e1000_hw; struct e1000_hw_stats; /* Enumerated types specific to the e1000 hardware */ -/* Media Access Controlers */ +/* Media Access Controllers */ typedef enum { e1000_undefined = 0, e1000_82542_rev2_0, diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 4686c3983fc3..1b8a43a29c01 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2194,7 +2194,7 @@ static void e1000_set_rx_mode(struct net_device *netdev) * addresses take precedence to avoid disabling unicast filtering * when possible. * - * RAR 0 is used for the station MAC adddress + * RAR 0 is used for the station MAC address * if there are not 14 addresses, go ahead and clear the filters */ i = 1; diff --git a/drivers/net/e1000e/82571.c b/drivers/net/e1000e/82571.c index 7236f1a53ba0..a655beb69320 100644 --- a/drivers/net/e1000e/82571.c +++ b/drivers/net/e1000e/82571.c @@ -300,7 +300,7 @@ static s32 e1000_init_mac_params_82571(struct e1000_adapter *adapter) /* * Ensure that the inter-port SWSM.SMBI lock bit is clear before - * first NVM or PHY acess. This should be done for single-port + * first NVM or PHY access. This should be done for single-port * devices, and for one port only on dual-port devices so that * for those devices we can still use the SMBI lock to synchronize * inter-port accesses to the PHY & NVM. diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c index e3374d9a2472..38c84ba3e3c1 100644 --- a/drivers/net/e1000e/ich8lan.c +++ b/drivers/net/e1000e/ich8lan.c @@ -321,7 +321,7 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw) } /* - * Reset the PHY before any acccess to it. Doing so, ensures that + * Reset the PHY before any access to it. Doing so, ensures that * the PHY is in a known good state before we read/write PHY registers. * The generic reset is sufficient here, because we haven't determined * the PHY type yet. 
diff --git a/drivers/net/e1000e/phy.c b/drivers/net/e1000e/phy.c index 3d3dc0c82355..b9bff5ba009f 100644 --- a/drivers/net/e1000e/phy.c +++ b/drivers/net/e1000e/phy.c @@ -2976,7 +2976,7 @@ s32 e1000_write_phy_reg_hv_locked(struct e1000_hw *hw, u32 offset, u16 data) } /** - * e1000_get_phy_addr_for_hv_page - Get PHY adrress based on page + * e1000_get_phy_addr_for_hv_page - Get PHY address based on page * @page: page to be accessed **/ static u32 e1000_get_phy_addr_for_hv_page(u32 page) diff --git a/drivers/net/eepro.c b/drivers/net/eepro.c index 7c826319ee5a..dbaec546c428 100644 --- a/drivers/net/eepro.c +++ b/drivers/net/eepro.c @@ -1760,7 +1760,7 @@ module_param_array(io, int, NULL, 0); module_param_array(irq, int, NULL, 0); module_param_array(mem, int, NULL, 0); module_param(autodetect, int, 0); -MODULE_PARM_DESC(io, "EtherExpress Pro/10 I/O base addres(es)"); +MODULE_PARM_DESC(io, "EtherExpress Pro/10 I/O base address(es)"); MODULE_PARM_DESC(irq, "EtherExpress Pro/10 IRQ number(s)"); MODULE_PARM_DESC(mem, "EtherExpress Pro/10 Rx buffer size(es) in kB (3-29)"); MODULE_PARM_DESC(autodetect, "EtherExpress Pro/10 force board(s) detection (0-1)"); diff --git a/drivers/net/irda/donauboe.h b/drivers/net/irda/donauboe.h index 4dc39e5f0156..77fcf4459161 100644 --- a/drivers/net/irda/donauboe.h +++ b/drivers/net/irda/donauboe.h @@ -30,7 +30,7 @@ * or the type-DO IR port. * * IrDA chip set list from Toshiba Computer Engineering Corp. - * model method maker controler Version + * model method maker controller Version * Portege 320CT FIR,SIR Toshiba Oboe(Triangle) * Portege 3010CT FIR,SIR Toshiba Oboe(Sydney) * Portege 3015CT FIR,SIR Toshiba Oboe(Sydney) diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c index 0bd8fbb5bfd0..05e6b8cafb39 100644 --- a/drivers/net/ixgbe/ixgbe_82599.c +++ b/drivers/net/ixgbe/ixgbe_82599.c @@ -1078,7 +1078,7 @@ s32 ixgbe_init_fdir_signature_82599(struct ixgbe_hw *hw, u32 pballoc) /* * The defaults in the HW for RX PB 1-7 are not zero and so should be - * intialized to zero for non DCB mode otherwise actual total RX PB + * initialized to zero for non DCB mode otherwise actual total RX PB * would be bigger than programmed and filter space would run into * the PB 0 region. */ @@ -1169,7 +1169,7 @@ s32 ixgbe_init_fdir_perfect_82599(struct ixgbe_hw *hw, u32 pballoc) /* * The defaults in the HW for RX PB 1-7 are not zero and so should be - * intialized to zero for non DCB mode otherwise actual total RX PB + * initialized to zero for non DCB mode otherwise actual total RX PB * would be bigger than programmed and filter space would run into * the PB 0 region. */ diff --git a/drivers/net/ll_temac_main.c b/drivers/net/ll_temac_main.c index 9f8e7027b0b3..661ed1ff4c2e 100644 --- a/drivers/net/ll_temac_main.c +++ b/drivers/net/ll_temac_main.c @@ -238,7 +238,7 @@ static int temac_dma_bd_init(struct net_device *ndev) goto out; } /* allocate the tx and rx ring buffer descriptors. */ - /* returns a virtual addres and a physical address. */ + /* returns a virtual address and a physical address. */ lp->tx_bd_v = dma_alloc_coherent(ndev->dev.parent, sizeof(*lp->tx_bd_v) * TX_BD_NUM, &lp->tx_bd_p, GFP_KERNEL); diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c index 581836867098..5976d1d51df1 100644 --- a/drivers/net/sis900.c +++ b/drivers/net/sis900.c @@ -36,7 +36,7 @@ Rev 1.07.06 Nov. 7 2000 Jeff Garzik some bug fix and cleaning Rev 1.07.05 Nov. 6 2000 metapirat contribute media type select by ifconfig Rev 1.07.04 Sep. 
6 2000 Lei-Chun Chang added ICS1893 PHY support - Rev 1.07.03 Aug. 24 2000 Lei-Chun Chang (lcchang@sis.com.tw) modified 630E eqaulizer workaround rule + Rev 1.07.03 Aug. 24 2000 Lei-Chun Chang (lcchang@sis.com.tw) modified 630E equalizer workaround rule Rev 1.07.01 Aug. 08 2000 Ollie Lho minor update for SiS 630E and SiS 630E A1 Rev 1.07 Mar. 07 2000 Ollie Lho bug fix in Rx buffer ring Rev 1.06.04 Feb. 11 2000 Jeff Garzik softnet and init for kernel 2.4 diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c index 8b3dc1eb4015..474652a2f70d 100644 --- a/drivers/net/tehuti.c +++ b/drivers/net/tehuti.c @@ -12,7 +12,7 @@ /* * RX HW/SW interaction overview * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * There are 2 types of RX communication channels betwean driver and NIC. + * There are 2 types of RX communication channels between driver and NIC. * 1) RX Free Fifo - RXF - holds descriptors of empty buffers to accept incoming * traffic. This Fifo is filled by SW and is readen by HW. Each descriptor holds * info about buffer's location, size and ID. An ID field is used to identify a @@ -821,7 +821,7 @@ static void bdx_setmulti(struct net_device *ndev) } /* use PMF to accept first MAC_MCST_NUM (15) addresses */ - /* TBD: sort addreses and write them in ascending order + /* TBD: sort addresses and write them in ascending order * into RX_MAC_MCST regs. we skip this phase now and accept ALL * multicast frames throu IMF */ /* accept the rest of addresses throu IMF */ @@ -1346,7 +1346,7 @@ static void print_rxfd(struct rxf_desc *rxfd) /* * TX HW/SW interaction overview * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * There are 2 types of TX communication channels betwean driver and NIC. + * There are 2 types of TX communication channels between driver and NIC. * 1) TX Free Fifo - TXF - holds ack descriptors for sent packets * 2) TX Data Fifo - TXD - holds descriptors of full buffers. * diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 55f3a3e667a9..e625f9530cf9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1309,7 +1309,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; case SIOCGIFHWADDR: - /* Get hw addres */ + /* Get hw address */ memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN); ifr.ifr_hwaddr.sa_family = tun->dev->type; if (copy_to_user(argp, &ifr, ifreq_len)) diff --git a/drivers/net/vxge/vxge-traffic.h b/drivers/net/vxge/vxge-traffic.h index 9890d4d596d0..654295816a39 100644 --- a/drivers/net/vxge/vxge-traffic.h +++ b/drivers/net/vxge/vxge-traffic.h @@ -1695,7 +1695,7 @@ struct vxge_hw_device_stats_sw_err { * struct vxge_hw_device_stats - Contains HW per-device statistics, * including hw. * @devh: HW device handle. - * @dma_addr: DMA addres of the %hw_info. Given to device to fill-in the stats. + * @dma_addr: DMA address of the %hw_info. Given to device to fill-in the stats. * @hw_info_dmah: DMA handle used to map hw statistics onto the device memory * space. 
* @hw_info_dma_acch: One more DMA handle used subsequently to free the diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index d45b08d1dbc9..94ff9b02e28e 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -125,7 +125,7 @@ static u32 dscc4_pci_config_store[16]; /* Module parameters */ MODULE_AUTHOR("Maintainer: Francois Romieu "); -MODULE_DESCRIPTION("Siemens PEB20534 PCI Controler"); +MODULE_DESCRIPTION("Siemens PEB20534 PCI Controller"); MODULE_LICENSE("GPL"); module_param(debug, int, 0); MODULE_PARM_DESC(debug,"Enable/disable extra messages"); diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index cdedab46ba21..bcb483fdc4d9 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -280,7 +280,7 @@ int i2400m_check_mac_addr(struct i2400m *i2400m) result); goto error; } - /* Extract MAC addresss */ + /* Extract MAC address */ ddi = (void *) skb->data; BUILD_BUG_ON(ETH_ALEN != sizeof(ddi->mac_address)); d_printf(2, dev, "GET DEVICE INFO: mac addr %pM\n", diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index 59ac7705e76e..8cde3a52d225 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h @@ -698,7 +698,7 @@ enum i2400m_bm_cmd_flags { * @I2400M_BRI_MAC_REINIT: We need to reinitialize the boot * rom after reading the MAC address. This is quite a dirty hack, * if you ask me -- the device requires the bootrom to be - * intialized after reading the MAC address. + * initialized after reading the MAC address. */ enum i2400m_bri { I2400M_BRI_SOFT = 1 << 1, diff --git a/drivers/net/wireless/ath/ath5k/reg.h b/drivers/net/wireless/ath/ath5k/reg.h index ca79ecd832fd..2a246d13b520 100644 --- a/drivers/net/wireless/ath/ath5k/reg.h +++ b/drivers/net/wireless/ath/ath5k/reg.h @@ -1063,7 +1063,7 @@ /* * EEPROM command register */ -#define AR5K_EEPROM_CMD 0x6008 /* Register Addres */ +#define AR5K_EEPROM_CMD 0x6008 /* Register Address */ #define AR5K_EEPROM_CMD_READ 0x00000001 /* EEPROM read */ #define AR5K_EEPROM_CMD_WRITE 0x00000002 /* EEPROM write */ #define AR5K_EEPROM_CMD_RESET 0x00000004 /* EEPROM reset */ @@ -1083,7 +1083,7 @@ /* * EEPROM config register */ -#define AR5K_EEPROM_CFG 0x6010 /* Register Addres */ +#define AR5K_EEPROM_CFG 0x6010 /* Register Address */ #define AR5K_EEPROM_CFG_SIZE 0x00000003 /* Size determination override */ #define AR5K_EEPROM_CFG_SIZE_AUTO 0 #define AR5K_EEPROM_CFG_SIZE_4KBIT 1 @@ -1125,7 +1125,7 @@ * Second station id register (Upper 16 bits of MAC address + PCU settings) */ #define AR5K_STA_ID1 0x8004 /* Register Address */ -#define AR5K_STA_ID1_ADDR_U16 0x0000ffff /* Upper 16 bits of MAC addres */ +#define AR5K_STA_ID1_ADDR_U16 0x0000ffff /* Upper 16 bits of MAC address */ #define AR5K_STA_ID1_AP 0x00010000 /* Set AP mode */ #define AR5K_STA_ID1_ADHOC 0x00020000 /* Set Ad-Hoc mode */ #define AR5K_STA_ID1_PWR_SV 0x00040000 /* Power save reporting */ diff --git a/drivers/net/wireless/b43/phy_g.c b/drivers/net/wireless/b43/phy_g.c index 0dc33b65e86b..be4828167012 100644 --- a/drivers/net/wireless/b43/phy_g.c +++ b/drivers/net/wireless/b43/phy_g.c @@ -1919,7 +1919,7 @@ static void b43_hardware_pctl_init_gphy(struct b43_wldev *dev) b43_hf_write(dev, b43_hf_read(dev) | B43_HF_HWPCTL); } -/* Intialize B/G PHY power control */ +/* Initialize B/G PHY power control */ static void b43_phy_init_pctl(struct b43_wldev *dev) { struct ssb_bus *bus = dev->dev->bus; diff --git a/drivers/net/wireless/b43legacy/phy.c 
b/drivers/net/wireless/b43legacy/phy.c index 35033dd342ce..28e477d01587 100644 --- a/drivers/net/wireless/b43legacy/phy.c +++ b/drivers/net/wireless/b43legacy/phy.c @@ -153,7 +153,7 @@ void b43legacy_phy_calibrate(struct b43legacy_wldev *dev) phy->calibrated = 1; } -/* intialize B PHY power control +/* initialize B PHY power control * as described in http://bcm-specs.sipsolutions.net/InitPowerControl */ static void b43legacy_phy_init_pctl(struct b43legacy_wldev *dev) diff --git a/drivers/net/wireless/iwlwifi/iwl-sta.c b/drivers/net/wireless/iwlwifi/iwl-sta.c index 7c7f7dcb1b1e..972b738c0e4a 100644 --- a/drivers/net/wireless/iwlwifi/iwl-sta.c +++ b/drivers/net/wireless/iwlwifi/iwl-sta.c @@ -107,7 +107,7 @@ static int iwl_process_add_sta_resp(struct iwl_priv *priv, /* * XXX: The MAC address in the command buffer is often changed from * the original sent to the device. That is, the MAC address - * written to the command buffer often is not the same MAC adress + * written to the command buffer often is not the same MAC address * read from the command buffer when the command returns. This * issue has not yet been resolved and this debugging is left to * observe the problem. diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c index 2c8cc954d1b6..ec2c75d77cea 100644 --- a/drivers/net/wireless/prism54/islpci_dev.c +++ b/drivers/net/wireless/prism54/islpci_dev.c @@ -630,7 +630,7 @@ islpci_alloc_memory(islpci_private *priv) printk(KERN_DEBUG "islpci_alloc_memory\n"); #endif - /* remap the PCI device base address to accessable */ + /* remap the PCI device base address to accessible */ if (!(priv->device_base = ioremap(pci_resource_start(priv->pdev, 0), ISL38XX_PCI_MEM_SIZE))) { @@ -709,7 +709,7 @@ islpci_alloc_memory(islpci_private *priv) PCI_DMA_FROMDEVICE); if (!priv->pci_map_rx_address[counter]) { /* error mapping the buffer to device - accessable memory address */ + accessible memory address */ printk(KERN_ERR "failed to map skb DMA'able\n"); goto out_free; } @@ -773,7 +773,7 @@ islpci_free_memory(islpci_private *priv) priv->data_low_rx[counter] = NULL; } - /* Free the acces control list and the WPA list */ + /* Free the access control list and the WPA list */ prism54_acl_clean(&priv->acl); prism54_wpa_bss_ie_clean(priv); mgt_clean(priv); diff --git a/drivers/net/wireless/prism54/islpci_eth.c b/drivers/net/wireless/prism54/islpci_eth.c index 2fc52bc2d7dd..d44f8e20cce0 100644 --- a/drivers/net/wireless/prism54/islpci_eth.c +++ b/drivers/net/wireless/prism54/islpci_eth.c @@ -450,7 +450,7 @@ islpci_eth_receive(islpci_private *priv) MAX_FRAGMENT_SIZE_RX + 2, PCI_DMA_FROMDEVICE); if (unlikely(!priv->pci_map_rx_address[index])) { - /* error mapping the buffer to device accessable memory address */ + /* error mapping the buffer to device accessible memory address */ DEBUG(SHOW_ERROR_MESSAGES, "Error mapping DMA address\n"); diff --git a/drivers/net/wireless/rt2x00/rt2x00mac.c b/drivers/net/wireless/rt2x00/rt2x00mac.c index c3c206a97d54..a7d3f4c3ee0d 100644 --- a/drivers/net/wireless/rt2x00/rt2x00mac.c +++ b/drivers/net/wireless/rt2x00/rt2x00mac.c @@ -274,7 +274,7 @@ int rt2x00mac_add_interface(struct ieee80211_hw *hw, intf->beacon = entry; /* - * The MAC adddress must be configured after the device + * The MAC address must be configured after the device * has been initialized. Otherwise the device can reset * the MAC registers. 
* The BSSID address must only be configured in AP mode, diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c index ee82df62e646..3e5befe4d03b 100644 --- a/drivers/net/wireless/wl3501_cs.c +++ b/drivers/net/wireless/wl3501_cs.c @@ -192,7 +192,7 @@ static inline void wl3501_switch_page(struct wl3501_card *this, u8 page) } /* - * Get Ethernet MAC addresss. + * Get Ethernet MAC address. * * WARNING: We switch to FPAGE0 and switc back again. * Making sure there is no other WL function beening called by ISR. diff --git a/drivers/pcmcia/m32r_cfc.h b/drivers/pcmcia/m32r_cfc.h index 8146e3bee2e8..f558e1adf954 100644 --- a/drivers/pcmcia/m32r_cfc.h +++ b/drivers/pcmcia/m32r_cfc.h @@ -9,7 +9,7 @@ #endif /* - * M32R PC Card Controler + * M32R PC Card Controller */ #define M32R_PCC0_BASE 0x00ef7000 #define M32R_PCC1_BASE 0x00ef7020 diff --git a/drivers/pcmcia/m32r_pcc.h b/drivers/pcmcia/m32r_pcc.h index e4fffe417ba9..f95c58563bc8 100644 --- a/drivers/pcmcia/m32r_pcc.h +++ b/drivers/pcmcia/m32r_pcc.h @@ -5,7 +5,7 @@ #define M32R_MAX_PCC 2 /* - * M32R PC Card Controler + * M32R PC Card Controller */ #define M32R_PCC0_BASE 0x00ef7000 #define M32R_PCC1_BASE 0x00ef7020 diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c index 99d4f23cb435..0db482771fb5 100644 --- a/drivers/pcmcia/m8xx_pcmcia.c +++ b/drivers/pcmcia/m8xx_pcmcia.c @@ -1198,7 +1198,7 @@ static int __init m8xx_probe(struct platform_device *ofdev, out_be32(M8XX_PGCRX(1), M8XX_PGCRX_CXOE | (mk_int_int_mask(hwirq) << 16)); - /* intialize the fixed memory windows */ + /* initialize the fixed memory windows */ for (i = 0; i < PCMCIA_SOCKETS_NO; i++) { for (m = 0; m < PCMCIA_MEM_WIN_NO; m++) { diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 2d61186ad5a2..304333faf441 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -7193,7 +7193,7 @@ static struct ibm_struct volume_driver_data = { * TPACPI_FAN_WR_ACPI_FANS (X31/X40/X41) * * FIRMWARE BUG: on some models, EC 0x2f might not be initialized at - * boot. Apparently the EC does not intialize it, so unless ACPI DSDT + * boot. Apparently the EC does not initialize it, so unless ACPI DSDT * does so, its initial value is meaningless (0x07). * * For firmware bugs, refer to: diff --git a/drivers/power/s3c_adc_battery.c b/drivers/power/s3c_adc_battery.c index fe16b482e912..4a8ae3935b3b 100644 --- a/drivers/power/s3c_adc_battery.c +++ b/drivers/power/s3c_adc_battery.c @@ -1,5 +1,5 @@ /* - * iPAQ h1930/h1940/rx1950 battery controler driver + * iPAQ h1930/h1940/rx1950 battery controller driver * Copyright (c) Vasily Khoruzhick * Based on h1940_battery.c by Arnaud Patard * @@ -427,5 +427,5 @@ static void __exit s3c_adc_bat_exit(void) module_exit(s3c_adc_bat_exit); MODULE_AUTHOR("Vasily Khoruzhick "); -MODULE_DESCRIPTION("iPAQ H1930/H1940/RX1950 battery controler driver"); +MODULE_DESCRIPTION("iPAQ H1930/H1940/RX1950 battery controller driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index 0f19d540b655..122382dc4684 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -840,7 +840,7 @@ lcs_notify_lancmd_waiters(struct lcs_card *card, struct lcs_cmd *cmd) } /** - * Emit buffer of a lan comand. + * Emit buffer of a lan command. 
*/ static void lcs_lancmd_timeout(unsigned long data) diff --git a/drivers/s390/scsi/zfcp_cfdc.c b/drivers/s390/scsi/zfcp_cfdc.c index d692e229ecba..4c09d03bcb32 100644 --- a/drivers/s390/scsi/zfcp_cfdc.c +++ b/drivers/s390/scsi/zfcp_cfdc.c @@ -317,7 +317,7 @@ static void zfcp_act_eval_err(struct zfcp_adapter *adapter, u32 table) /** * zfcp_cfdc_port_denied - Process "access denied" for port - * @port: The port where the acces has been denied + * @port: The port where the access has been denied * @qual: The FSF status qualifier for the access denied FSF status */ void zfcp_cfdc_port_denied(struct zfcp_port *port, diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c index dbbc601948e5..7cb575654e40 100644 --- a/drivers/scsi/a100u2w.c +++ b/drivers/scsi/a100u2w.c @@ -416,7 +416,7 @@ static u8 orc_load_firmware(struct orc_host * host) /* Go back and check they match */ outb(PRGMRST | DOWNLOAD, host->base + ORC_RISCCTL); /* Reset program count 0 */ - bios_addr -= 0x1000; /* Reset the BIOS adddress */ + bios_addr -= 0x1000; /* Reset the BIOS address */ for (i = 0, data32_ptr = (u8 *) & data32; /* Check the code */ i < 0x1000; /* Firmware code size = 4K */ i++, bios_addr++) { diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index afc9aeba5edb..060ac4bd5a14 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -91,7 +91,7 @@ void aac_fib_map_free(struct aac_dev *dev) * aac_fib_setup - setup the fibs * @dev: Adapter to set up * - * Allocate the PCI space for the fibs, map it and then intialise the + * Allocate the PCI space for the fibs, map it and then initialise the * fib area, the unmapped fib data and also the free list */ diff --git a/drivers/scsi/aic7xxx_old/aic7xxx.seq b/drivers/scsi/aic7xxx_old/aic7xxx.seq index 5997e7c3a191..1565be9ebd49 100644 --- a/drivers/scsi/aic7xxx_old/aic7xxx.seq +++ b/drivers/scsi/aic7xxx_old/aic7xxx.seq @@ -1178,7 +1178,7 @@ notFound: /* * Retrieve an SCB by SCBID first searching the disconnected list falling * back to DMA'ing the SCB down from the host. This routine assumes that - * ARG_1 is the SCBID of interrest and that SINDEX is the position in the + * ARG_1 is the SCBID of interest and that SINDEX is the position in the * disconnected list to start the search from. If SINDEX is SCB_LIST_NULL, * we go directly to the host for the SCB. */ diff --git a/drivers/scsi/aic94xx/aic94xx_reg_def.h b/drivers/scsi/aic94xx/aic94xx_reg_def.h index 28aaf349c111..40273a747d29 100644 --- a/drivers/scsi/aic94xx/aic94xx_reg_def.h +++ b/drivers/scsi/aic94xx/aic94xx_reg_def.h @@ -1689,7 +1689,7 @@ #define PHY_START_CAL 0x01 /* - * HST_PCIX2 Registers, Addresss Range: (0x00-0xFC) + * HST_PCIX2 Registers, Address Range: (0x00-0xFC) */ #define PCIX_REG_BASE_ADR 0xB8040000 @@ -1802,7 +1802,7 @@ #define PCIC_TP_CTRL 0xFC /* - * EXSI Registers, Addresss Range: (0x00-0xFC) + * EXSI Registers, Address Range: (0x00-0xFC) */ #define EXSI_REG_BASE_ADR REG_BASE_ADDR_EXSI diff --git a/drivers/scsi/aic94xx/aic94xx_seq.c b/drivers/scsi/aic94xx/aic94xx_seq.c index 74374618010c..390168f62a13 100644 --- a/drivers/scsi/aic94xx/aic94xx_seq.c +++ b/drivers/scsi/aic94xx/aic94xx_seq.c @@ -797,7 +797,7 @@ static void asd_init_lseq_mdp(struct asd_ha_struct *asd_ha, int lseq) int j; /* Start from Page 1 of Mode 0 and 1. */ moffs = LSEQ_PAGE_SIZE + i*LSEQ_MODE_SCRATCH_SIZE; - /* All the fields of page 1 can be intialized to 0. */ + /* All the fields of page 1 can be initialized to 0. 
*/ for (j = 0; j < LSEQ_PAGE_SIZE; j += 4) asd_write_reg_dword(asd_ha, LmSCRATCH(lseq)+moffs+j,0); } @@ -938,7 +938,7 @@ static void asd_init_cseq_cio(struct asd_ha_struct *asd_ha) asd_write_reg_dword(asd_ha, SCBPRO, 0); asd_write_reg_dword(asd_ha, CSEQCON, 0); - /* Intialize CSEQ Mode 11 Interrupt Vectors. + /* Initialize CSEQ Mode 11 Interrupt Vectors. * The addresses are 16 bit wide and in dword units. * The values of their macros are in byte units. * Thus we have to divide by 4. */ @@ -961,7 +961,7 @@ static void asd_init_cseq_cio(struct asd_ha_struct *asd_ha) asd_write_reg_word(asd_ha, CPRGMCNT, cseq_idle_loop); for (i = 0; i < 8; i++) { - /* Intialize Mode n Link m Interrupt Enable. */ + /* Initialize Mode n Link m Interrupt Enable. */ asd_write_reg_dword(asd_ha, CMnINTEN(i), EN_CMnRSPMBXF); /* Initialize Mode n Request Mailbox. */ asd_write_reg_dword(asd_ha, CMnREQMBX(i), 0); diff --git a/drivers/scsi/bfa/bfa_fcpim.c b/drivers/scsi/bfa/bfa_fcpim.c index 33c8dd51f474..aabbf3d3b394 100644 --- a/drivers/scsi/bfa/bfa_fcpim.c +++ b/drivers/scsi/bfa/bfa_fcpim.c @@ -2092,7 +2092,7 @@ bfa_ioim_sm_cleanup_qfull(struct bfa_ioim_s *ioim, enum bfa_ioim_event event) case BFA_IOIM_SM_ABORT: /** - * IO is alraedy being cleaned up implicitly + * IO is already being cleaned up implicitly */ ioim->io_cbfn = __bfa_cb_ioim_abort; break; diff --git a/drivers/scsi/bfa/bfa_fcs_lport.c b/drivers/scsi/bfa/bfa_fcs_lport.c index b522bf30247a..b7aa23a3b1e8 100644 --- a/drivers/scsi/bfa/bfa_fcs_lport.c +++ b/drivers/scsi/bfa/bfa_fcs_lport.c @@ -5742,7 +5742,7 @@ bfa_cb_lps_fdisc_comp(void *bfad, void *uarg, bfa_status_t status) switch (status) { case BFA_STATUS_OK: /* - * Initialiaze the V-Port fields + * Initialize the V-Port fields */ __vport_fcid(vport) = bfa_lps_get_pid(vport->lps); vport->vport_stats.fdisc_accepts++; diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index 54f50b07dac7..31bbfb244bc4 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c @@ -3795,7 +3795,7 @@ static struct DeviceCtlBlk *device_alloc(struct AdapterCtlBlk *acb, * adapter_add_device - Adds the device instance to the adaptor instance. * * @acb: The adapter device to be updated - * @dcb: A newly created and intialised device instance to add. + * @dcb: A newly created and initialised device instance to add. **/ static void adapter_add_device(struct AdapterCtlBlk *acb, struct DeviceCtlBlk *dcb) @@ -4497,7 +4497,7 @@ static void __devinit adapter_init_chip(struct AdapterCtlBlk *acb) * init_adapter - Grab the resource for the card, setup the adapter * information, set the card into a known state, create the various * tables etc etc. This basically gets all adapter information all up - * to date, intialised and gets the chip in sync with it. + * to date, initialised and gets the chip in sync with it. * * @host: This hosts adapter structure * @io_port: The base I/O port @@ -4788,7 +4788,7 @@ static void banner_display(void) * that it finds in the system. The pci_dev strcuture indicates which * instance we are being called from. * - * @dev: The PCI device to intialize. + * @dev: The PCI device to initialize. * @id: Looks like a pointer to the entry in our pci device table * that was actually matched by the PCI subsystem. * @@ -4859,7 +4859,7 @@ fail: * dc395x_remove_one - Called to remove a single instance of the * adapter. * - * @dev: The PCI device to intialize. + * @dev: The PCI device to initialize. 
**/ static void __devexit dc395x_remove_one(struct pci_dev *dev) { diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c index c797f6b48f05..54a6ec8e131d 100644 --- a/drivers/scsi/libfc/fc_fcp.c +++ b/drivers/scsi/libfc/fc_fcp.c @@ -1212,7 +1212,7 @@ static void fc_lun_reset_send(unsigned long data) /** * fc_lun_reset() - Send a LUN RESET command to a device * and wait for the reply - * @lport: The local port to sent the comand on + * @lport: The local port to sent the command on * @fsp: The FCP packet that identifies the LUN to be reset * @id: The SCSI command ID * @lun: The LUN ID to be reset diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index f681eea57730..0e4abb96d68e 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -1339,7 +1339,7 @@ lpfc_##attr##_show(struct device *dev, struct device_attribute *attr, \ } /** - * lpfc_param_init - Intializes a cfg attribute + * lpfc_param_init - Initializes a cfg attribute * * Description: * Macro that given an attr e.g. hba_queue_depth expands diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index a345dde16c86..2a6866e63aae 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -2614,7 +2614,7 @@ lpfc_mbx_process_link_up(struct lpfc_hba *phba, READ_LA_VAR *la) if (unlikely(!fcf_record)) { lpfc_printf_log(phba, KERN_ERR, LOG_MBOX | LOG_SLI, - "2554 Could not allocate memmory for " + "2554 Could not allocate memory for " "fcf record\n"); rc = -ENODEV; goto out; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 0d1e187b005d..c5614cfcc6e9 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -9619,7 +9619,7 @@ lpfc_sli4_intr_handler(int irq, void *dev_id) * lpfc_sli4_queue_free - free a queue structure and associated memory * @queue: The queue structure to free. * - * This function frees a queue structure and the DMAable memeory used for + * This function frees a queue structure and the DMAable memory used for * the host resident queue. This function must be called after destroying the * queue on the HBA. **/ diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h index 2b4a048cadf1..deb24ad1b8d2 100644 --- a/drivers/scsi/megaraid.h +++ b/drivers/scsi/megaraid.h @@ -13,7 +13,7 @@ */ /* - * Comand coalescing - This feature allows the driver to be able to combine + * Command coalescing - This feature allows the driver to be able to combine * two or more commands and issue as one command in order to boost I/O * performance. Useful if the nature of the I/O is sequential. It is not very * useful for random natured I/Os. 
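The megaraid.h comment above describes command coalescing only in prose. As a rough sketch of the idea (not the megaraid driver's actual code; the struct and function names below are made up for illustration), two requests can be fused whenever the second one starts at the block immediately following the first, which is exactly the sequential-I/O case the comment says benefits most:

	/*
	 * Hypothetical sketch of command coalescing: fuse two requests when the
	 * second begins exactly where the first ends (sequential I/O). Adjacent
	 * requests become one larger command; non-adjacent (random) I/O is left
	 * alone, matching the comment's note that coalescing helps little there.
	 */
	struct io_cmd {
		unsigned long start_lba;	/* first logical block of the transfer */
		unsigned int nr_blocks;		/* number of blocks to transfer */
	};

	static int try_coalesce(struct io_cmd *a, const struct io_cmd *b)
	{
		if (a->start_lba + a->nr_blocks != b->start_lba)
			return 0;		/* not adjacent: keep the commands separate */
		a->nr_blocks += b->nr_blocks;	/* adjacent: issue as a single command */
		return 1;
	}
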
diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index f8c86b28f03f..b95285f3383f 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -603,7 +603,7 @@ static u32 pm8001_request_irq(struct pm8001_hba_info *pm8001_ha) #endif intx: - /* intialize the INT-X interrupt */ + /* initialize the INT-X interrupt */ rc = request_irq(pdev->irq, irq_handler, IRQF_SHARED, DRV_NAME, SHOST_TO_SAS_HA(pm8001_ha->shost)); return rc; diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index d53e6503c6d5..a2ed201885ae 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -477,7 +477,7 @@ EXPORT_SYMBOL_GPL(scsi_nl_remove_driver); /** - * scsi_netlink_init - Called by SCSI subsystem to intialize + * scsi_netlink_init - Called by SCSI subsystem to initialize * the SCSI transport netlink interface * **/ diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c index 8b955b534a36..4afa81ad2caa 100644 --- a/drivers/scsi/sym53c8xx_2/sym_glue.c +++ b/drivers/scsi/sym53c8xx_2/sym_glue.c @@ -1864,7 +1864,7 @@ static pci_ers_result_t sym2_io_slot_dump(struct pci_dev *pdev) * * This routine is similar to sym_set_workarounds(), except * that, at this point, we already know that the device was - * successfully intialized at least once before, and so most + * successfully initialized at least once before, and so most * of the steps taken there are un-needed here. */ static void sym2_reset_workarounds(struct pci_dev *pdev) diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c index 154529aacc03..49596a18f953 100644 --- a/drivers/spi/atmel_spi.c +++ b/drivers/spi/atmel_spi.c @@ -341,9 +341,9 @@ static void atmel_spi_next_message(struct spi_master *master) /* * For DMA, tx_buf/tx_dma have the same relationship as rx_buf/rx_dma: * - The buffer is either valid for CPU access, else NULL - * - If the buffer is valid, so is its DMA addresss + * - If the buffer is valid, so is its DMA address * - * This driver manages the dma addresss unless message->is_dma_mapped. + * This driver manages the dma address unless message->is_dma_mapped. */ static int atmel_spi_dma_map_xfer(struct atmel_spi *as, struct spi_transfer *xfer) diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 4e6245e67995..603428213d21 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -38,7 +38,7 @@ /* - * This supports acccess to SPI devices using normal userspace I/O calls. + * This supports access to SPI devices using normal userspace I/O calls. * Note that while traditional UNIX/POSIX I/O semantics are half duplex, * and often mask message boundaries, full SPI support requires full duplex * transfers. There are several kinds of internal message boundaries to diff --git a/drivers/staging/stradis/stradis.c b/drivers/staging/stradis/stradis.c index a057824e7ebc..dd3874b78263 100644 --- a/drivers/staging/stradis/stradis.c +++ b/drivers/staging/stradis/stradis.c @@ -745,7 +745,7 @@ static void set_out_format(struct saa7146 *saa, int mode) } } -/* Intialize bitmangler to map from a byte value to the mangled word that +/* Initialize bitmangler to map from a byte value to the mangled word that * must be output to program the Xilinx part through the DEBI port. 
* Xilinx Data Bit->DEBI Bit: 0->15 1->7 2->6 3->12 4->11 5->2 6->1 7->0 * transfer FPGA code, init IBM chip, transfer IBM microcode diff --git a/drivers/usb/gadget/imx_udc.c b/drivers/usb/gadget/imx_udc.c index ed0266462c57..1f8084d59a43 100644 --- a/drivers/usb/gadget/imx_udc.c +++ b/drivers/usb/gadget/imx_udc.c @@ -1316,7 +1316,7 @@ static struct imx_udc_struct controller = { }; /******************************************************************************* - * USB gadged driver functions + * USB gadget driver functions ******************************************************************************* */ int usb_gadget_probe_driver(struct usb_gadget_driver *driver, diff --git a/drivers/usb/host/imx21-hcd.c b/drivers/usb/host/imx21-hcd.c index e49b75a78000..f90d003f2302 100644 --- a/drivers/usb/host/imx21-hcd.c +++ b/drivers/usb/host/imx21-hcd.c @@ -1658,7 +1658,7 @@ static int imx21_hc_reset(struct usb_hcd *hcd) spin_lock_irqsave(&imx21->lock, flags); - /* Reset the Host controler modules */ + /* Reset the Host controller modules */ writel(USBOTG_RST_RSTCTRL | USBOTG_RST_RSTRH | USBOTG_RST_RSTHSIE | USBOTG_RST_RSTHC, imx21->regs + USBOTG_RST_CTRL); diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c index 32149be4ad8e..e0cb12b573f9 100644 --- a/drivers/usb/host/oxu210hp-hcd.c +++ b/drivers/usb/host/oxu210hp-hcd.c @@ -3094,7 +3094,7 @@ static int oxu_hub_status_data(struct usb_hcd *hcd, char *buf) /* Some boards (mostly VIA?) report bogus overcurrent indications, * causing massive log spam unless we completely ignore them. It - * may be relevant that VIA VT8235 controlers, where PORT_POWER is + * may be relevant that VIA VT8235 controllers, where PORT_POWER is * always set, seem to clear PORT_OCC and PORT_CSC when writing to * PORT_POWER; that's surprising, but maybe within-spec. 
*/ diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c index 44f8b9225054..a6afd15f6a46 100644 --- a/drivers/usb/misc/adutux.c +++ b/drivers/usb/misc/adutux.c @@ -717,7 +717,7 @@ static int adu_probe(struct usb_interface *interface, goto exit; } - /* allocate memory for our device state and intialize it */ + /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(struct adu_device), GFP_KERNEL); if (dev == NULL) { dev_err(&interface->dev, "Out of memory\n"); diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 375664198776..7a10e48b78f9 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -768,7 +768,7 @@ static int iowarrior_probe(struct usb_interface *interface, int i; int retval = -ENOMEM; - /* allocate memory for our device state and intialize it */ + /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(struct iowarrior), GFP_KERNEL); if (dev == NULL) { dev_err(&interface->dev, "Out of memory\n"); diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c index edffef642337..eefb8275bb7e 100644 --- a/drivers/usb/misc/ldusb.c +++ b/drivers/usb/misc/ldusb.c @@ -642,7 +642,7 @@ static int ld_usb_probe(struct usb_interface *intf, const struct usb_device_id * int i; int retval = -ENOMEM; - /* allocate memory for our device state and intialize it */ + /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (dev == NULL) { diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c index 5d815049cbaa..1cbeaa17ffff 100644 --- a/drivers/usb/musb/musb_gadget.c +++ b/drivers/usb/musb/musb_gadget.c @@ -1636,7 +1636,7 @@ static inline void __init musb_g_init_endpoints(struct musb *musb) struct musb_hw_ep *hw_ep; unsigned count = 0; - /* intialize endpoint list just once */ + /* initialize endpoint list just once */ INIT_LIST_HEAD(&(musb->g.ep_list)); for (epnum = 0, hw_ep = musb->endpoints; @@ -1715,7 +1715,7 @@ void musb_gadget_cleanup(struct musb *musb) * * -EINVAL something went wrong (not driver) * -EBUSY another gadget is already using the controller - * -ENOMEM no memeory to perform the operation + * -ENOMEM no memory to perform the operation * * @param driver the gadget driver * @param bind the driver's bind function diff --git a/drivers/usb/wusbcore/wa-rpipe.c b/drivers/usb/wusbcore/wa-rpipe.c index c7b1d8108de9..8cb9d80207fa 100644 --- a/drivers/usb/wusbcore/wa-rpipe.c +++ b/drivers/usb/wusbcore/wa-rpipe.c @@ -49,7 +49,7 @@ * * USB Stack port number 4 (1 based) * WUSB code port index 3 (0 based) - * USB Addresss 5 (2 based -- 0 is for default, 1 for root hub) + * USB Address 5 (2 based -- 0 is for default, 1 for root hub) * * Now, because we don't use the concept as default address exactly * like the (wired) USB code does, we need to kind of skip it. So we diff --git a/drivers/video/sstfb.c b/drivers/video/sstfb.c index dee64c3b1e67..2ab704118c44 100644 --- a/drivers/video/sstfb.c +++ b/drivers/video/sstfb.c @@ -536,7 +536,7 @@ static int sstfb_set_par(struct fb_info *info) fbiinit2 = sst_read(FBIINIT2); fbiinit3 = sst_read(FBIINIT3); - /* everything is reset. we enable fbiinit2/3 remap : dac acces ok */ + /* everything is reset. 
we enable fbiinit2/3 remap : dac access ok */ pci_write_config_dword(sst_dev, PCI_INIT_ENABLE, PCI_EN_INIT_WR | PCI_REMAP_DAC ); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b5dd6369f82..47162de0b957 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -577,7 +577,7 @@ struct ext4_mount_options { #endif }; -/* Max physical block we can addres w/o extents */ +/* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0554c48cb1fd..966ecb0d8f86 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2825,14 +2825,14 @@ fix_extent_len: * to an uninitialized extent. * * Writing to an uninitized extent may result in splitting the uninitialized - * extent into multiple /intialized unintialized extents (up to three) + * extent into multiple /initialized uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * One of more index blocks maybe needed if the extent tree grow after - * the unintialized extent split. To prevent ENOSPC occur at the IO + * the uninitialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit * the IO. The uninitialized extent called at this time will be split * into three uninitialized extent(at most). After IO complete, the part diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 191616470466..4bc84b8adb7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3740,9 +3740,9 @@ retry: * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as unintialized + * For holes, we fallocate those blocks, mark them as uninitialized * If those blocks were preallocated, we mark sure they are splited, but - * still keep the range to write as unintialized. + * still keep the range to write as uninitialized. * * The unwrritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f935fd6600dd..4068c6c4c6f6 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, * #1 and #2 can be simply solved by never taking the lock * here for system files (which are the only type we read * during mount). It's a heavier approach, but our main - * concern is user-accesible files anyway. + * concern is user-accessible files anyway. * * #3 works itself out because we'll eventually take the * cluster lock before trusting anything anyway. diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 5fed60de7630..71998d4d61d5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used - * allocation group. This helps us mantain some + * allocation group. This helps us maintain some * contiguousness across allocations. 
*/ status = ocfs2_search_one_group(ac, handle, bits_wanted, min_bits, res, &bits_left); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f3a78fe6ae4..7465a7ffc4fd 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -938,7 +938,7 @@ out_reclaim: * Slab object creation initialisation for the XFS inode. * This covers only the idempotent fields in the XFS inode; * all other fields need to be initialised on allocation - * from the slab. This avoids the need to repeatedly intialise + * from the slab. This avoids the need to repeatedly initialise * fields in the xfs inode that left in the initialise state * when freeing the inode. */ diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index c637b75b9f3f..cd77aa75c962 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -119,7 +119,7 @@ struct acpi_whea_header { struct acpi_table_bert { struct acpi_table_header header; /* Common ACPI table header */ u32 region_length; /* Length of the boot error region */ - u64 address; /* Physical addresss of the error region */ + u64 address; /* Physical address of the error region */ }; /* Boot Error Region (not a subtable, pointed to by Address field above) */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed4ba111bc8d..ce104e33cd22 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -564,7 +564,7 @@ struct cgroup_iter { /* * To iterate across the tasks in a cgroup: * - * 1) call cgroup_iter_start to intialize an iterator + * 1) call cgroup_iter_start to initialize an iterator * * 2) call cgroup_iter_next() to retrieve member tasks until it * returns NULL or until you want to end the iteration diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 68c642d8843d..59ea406be7f6 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -273,7 +273,7 @@ struct fw_cdev_event_iso_interrupt { * @closure: See &fw_cdev_event_common; * set by %FW_CDEV_CREATE_ISO_CONTEXT ioctl * @type: %FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL - * @completed: Offset into the receive buffer; data before this offest is valid + * @completed: Offset into the receive buffer; data before this offset is valid * * This event is sent in multichannel contexts (context type * %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL) for &fw_cdev_iso_packet buffer diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index cb93d80aa642..5582ab3d3e48 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -39,7 +39,7 @@ struct mfd_cell { size_t data_size; /* - * This resources can be specified relatievly to the parent device. + * This resources can be specified relatively to the parent device. * For accessing device you should use resources from device */ int num_resources; diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h index 2b2769c5ca9f..2a128c8c2718 100644 --- a/include/net/sctp/user.h +++ b/include/net/sctp/user.h @@ -99,8 +99,8 @@ typedef __s32 sctp_assoc_t; #define SCTP_SOCKOPT_PEELOFF 102 /* peel off association. */ /* Options 104-106 are deprecated and removed. Do not use this space */ #define SCTP_SOCKOPT_CONNECTX_OLD 107 /* CONNECTX old requests. */ -#define SCTP_GET_PEER_ADDRS 108 /* Get all peer addresss. */ -#define SCTP_GET_LOCAL_ADDRS 109 /* Get all local addresss. */ +#define SCTP_GET_PEER_ADDRS 108 /* Get all peer address. */ +#define SCTP_GET_LOCAL_ADDRS 109 /* Get all local address. */ #define SCTP_SOCKOPT_CONNECTX 110 /* CONNECTX requests. 
*/ #define SCTP_SOCKOPT_CONNECTX3 111 /* CONNECTX requests (updated) */ diff --git a/include/scsi/fc/fc_fcp.h b/include/scsi/fc/fc_fcp.h index 8e9b222251c2..8a143ca79878 100644 --- a/include/scsi/fc/fc_fcp.h +++ b/include/scsi/fc/fc_fcp.h @@ -46,7 +46,7 @@ */ struct fcp_cmnd { __u8 fc_lun[8]; /* logical unit number */ - __u8 fc_cmdref; /* commmand reference number */ + __u8 fc_cmdref; /* command reference number */ __u8 fc_pri_ta; /* priority and task attribute */ __u8 fc_tm_flags; /* task management flags */ __u8 fc_flags; /* additional len & flags */ @@ -58,7 +58,7 @@ struct fcp_cmnd { struct fcp_cmnd32 { __u8 fc_lun[8]; /* logical unit number */ - __u8 fc_cmdref; /* commmand reference number */ + __u8 fc_cmdref; /* command reference number */ __u8 fc_pri_ta; /* priority and task attribute */ __u8 fc_tm_flags; /* task management flags */ __u8 fc_flags; /* additional len & flags */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 37755d621924..7242cc71bb7e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2913,7 +2913,7 @@ static void __init kdb_cmd_init(void) } } -/* Intialize kdb_printf, breakpoint tables and kdb state */ +/* Initialize kdb_printf, breakpoint tables and kdb state */ void __init kdb_init(int lvl) { static int kdb_init_lvl = KDB_NOT_INITIALIZED; diff --git a/kernel/kexec.c b/kernel/kexec.c index b55045bc7563..ec19b92c7ebd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure - * the destination addreses are page aligned. Too many + * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0e4a86ccf94..cd09c22de03d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -865,7 +865,7 @@ out_finish: /** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should - * be written into this memeory location + * be written into this memory location */ int swsusp_read(unsigned int *flags_p) diff --git a/kernel/sched.c b/kernel/sched.c index aa14a56f9d03..554c0d6c489e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2568,7 +2568,7 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not alredy there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. this_rq() stays locked over invocation. 
*/ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c5786064..d9c5fe4ff1b2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1193,7 +1193,7 @@ static ssize_t bin_dn_node_address(struct file *file, buf[result] = '\0'; - /* Convert the decnet addresss to binary */ + /* Convert the decnet address to binary */ result = -EIO; nodep = strchr(buf, '.') + 1; if (!nodep) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4b..ddbbaf2950ef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -678,7 +678,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* Intialize mult/shift and max_idle_ns */ + /* Initialize mult/shift and max_idle_ns */ __clocksource_updatefreq_scale(cs, scale, freq); /* Add clocksource to the clcoksource list */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e6..6cf223764be8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -53,7 +53,7 @@ */ /* - * Function trace entry - function address and parent function addres: + * Function trace entry - function address and parent function address: */ FTRACE_ENTRY(function, ftrace_entry, diff --git a/lib/nlattr.c b/lib/nlattr.c index c4706eb98d3d..18d158df6f0e 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -167,7 +167,7 @@ nla_policy_len(const struct nla_policy *p, int n) * @policy: validation policy * * Parses a stream of attributes and stores a pointer to each attribute in - * the tb array accessable via the attribute type. Attributes with a type + * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be silently ignored for backwards compatibility * reasons. policy may be set to NULL if no validation is required. * diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 7c06ee51a29a..c47bbe11b804 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -60,7 +60,7 @@ int swiotlb_force; static char *io_tlb_start, *io_tlb_end; /* - * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and + * The number of IO TLB blocks (in groups of 64) between io_tlb_start and * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. */ static unsigned long io_tlb_nslabs; diff --git a/mm/percpu.c b/mm/percpu.c index efe816856a9d..f715d01d5ba3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, /* * (Un)populated page region iterators. Iterate over (un)populated - * page regions betwen @start and @end in @chunk. @rs and @re should + * page regions between @start and @end in @chunk. @rs and @re should * be integer variables and will be set to start and end page index of * the current region. */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 29d6cbffb283..64b984091edb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -9,7 +9,7 @@ * * However, virtual mappings need a page table and TLBs. Many Linux * architectures already map their physical space using 1-1 mappings - * via TLBs. For those arches the virtual memmory map is essentially + * via TLBs. For those arches the virtual memory map is essentially * for free if we use the same page size as the 1-1 mappings. In that * case the overhead consists of a few additional pages that are * allocated to create a view of memory for vmemmap. 
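The sparse-vmemmap comment above is the whole trick: once the struct page array is mapped at one fixed virtual base, pfn/page conversion reduces to pointer arithmetic, which is why the map is "essentially for free" on architectures with 1-1 mappings. A minimal sketch of that arithmetic, simplified from the generic memory-model macros (the stand-in struct and the base address are illustrative, not taken from a particular architecture):

	/*
	 * Simplified sketch: with a virtually mapped memmap, pfn <-> struct page
	 * conversion is plain pointer arithmetic off a single fixed base.
	 * The struct below is a stand-in and VMEMMAP_BASE is an example value,
	 * roughly in the style of the x86-64 layout.
	 */
	struct page {
		unsigned long flags;		/* placeholder for the real struct page */
	};

	#define VMEMMAP_BASE	((struct page *)0xffffea0000000000UL)
	#define vmemmap		VMEMMAP_BASE

	#define pfn_to_page(pfn)	(vmemmap + (pfn))
	#define page_to_pfn(page)	((unsigned long)((page) - (vmemmap)))
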
diff --git a/net/core/dev.c b/net/core/dev.c index 35dfb8318483..89204e8c0e14 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6096,7 +6096,7 @@ static void __net_exit default_device_exit(struct net *net) static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network - * namespace. Do this in the reverse order of registeration. + * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. */ diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 4c409b46aa35..9b73e0b03e34 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1112,7 +1112,7 @@ static struct dn_dev *dn_dev_create(struct net_device *dev, int *err) /* * This processes a device up event. We only start up * the loopback device & ethernet devices with correct - * MAC addreses automatically. Others must be started + * MAC addresses automatically. Others must be started * specifically. * * FIXME: How should we configure the loopback address ? If we could dispense diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 05b1ecf36763..e96152207164 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1336,7 +1336,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, return 0; } -/* Intialize TSO state of a skb. +/* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 9df80114b47b..c06f1503aa8b 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -946,7 +946,7 @@ static void randomize_choice_values(struct symbol *csym) int cnt, def; /* - * If choice is mod then we may have more items slected + * If choice is mod then we may have more items selected * and if no then no-one. * In both cases stop. */ diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 33122ca04e7c..6619ed8843f9 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1614,7 +1614,7 @@ static void section_rel(const char *modname, struct elf_info *elf, * A module includes a number of sections that are discarded * either when loaded or when used as built-in. * For loaded modules all functions marked __init and all data - * marked __initdata will be discarded when the module has been intialized. + * marked __initdata will be discarded when the module has been initialized. * Likewise for modules used built-in the sections marked __exit * are discarded because __exit marked function are supposed to be called * only when a module is unloaded which never happens for built-in modules. diff --git a/security/apparmor/include/match.h b/security/apparmor/include/match.h index 734a6d35112c..e00c3c7c96be 100644 --- a/security/apparmor/include/match.h +++ b/security/apparmor/include/match.h @@ -27,7 +27,7 @@ * The format used for transition tables is based on the GNU flex table * file format (--tables-file option; see Table File Format in the flex * info pages and the flex sources for documentation). The magic number - * used in the header is 0x1B5E783D insted of 0xF13C57B1 though, because + * used in the header is 0x1B5E783D instead of 0xF13C57B1 though, because * the YY_ID_CHK (check) and YY_ID_DEF (default) tables are used * slightly differently (see the apparmor-parser package). 
*/ diff --git a/sound/core/init.c b/sound/core/init.c index 57b792e2439a..3e65da21a08c 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -642,7 +642,7 @@ static struct device_attribute card_number_attrs = * external accesses. Thus, you should call this function at the end * of the initialization of the card. * - * Returns zero otherwise a negative error code if the registrain failed. + * Returns zero otherwise a negative error code if the registration failed. */ int snd_card_register(struct snd_card *card) { diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 8bc7cb3db330..8e022fc70f55 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -985,7 +985,7 @@ static int snd_pcm_do_pause(struct snd_pcm_substream *substream, int push) if (push) snd_pcm_update_hw_ptr(substream); /* The jiffies check in snd_pcm_update_hw_ptr*() is done by - * a delta betwen the current jiffies, this gives a large enough + * a delta between the current jiffies, this gives a large enough * delta, effectively to skip the check once. */ substream->runtime->hw_ptr_jiffies = jiffies - HZ * 1000; diff --git a/sound/isa/opl3sa2.c b/sound/isa/opl3sa2.c index 265abcce9dba..9b915e27b5bd 100644 --- a/sound/isa/opl3sa2.c +++ b/sound/isa/opl3sa2.c @@ -264,7 +264,7 @@ static int __devinit snd_opl3sa2_detect(struct snd_card *card) snd_printd("OPL3-SA [0x%lx] detect (1) = 0x%x (0x%x)\n", port, tmp, tmp1); return -ENODEV; } - /* try if the MIC register is accesible */ + /* try if the MIC register is accessible */ tmp = snd_opl3sa2_read(chip, OPL3SA2_MIC); snd_opl3sa2_write(chip, OPL3SA2_MIC, 0x8a); if (((tmp1 = snd_opl3sa2_read(chip, OPL3SA2_MIC)) & 0x9f) != 0x8a) { diff --git a/sound/pci/ca0106/ca0106.h b/sound/pci/ca0106/ca0106.h index f19c11077255..fc53b9bca26d 100644 --- a/sound/pci/ca0106/ca0106.h +++ b/sound/pci/ca0106/ca0106.h @@ -188,7 +188,7 @@ #define PLAYBACK_LIST_PTR 0x02 /* Pointer to the current period being played */ /* PTR[5:0], Default: 0x0 */ #define PLAYBACK_UNKNOWN3 0x03 /* Not used ?? */ -#define PLAYBACK_DMA_ADDR 0x04 /* Playback DMA addresss */ +#define PLAYBACK_DMA_ADDR 0x04 /* Playback DMA address */ /* DMA[31:0], Default: 0x0 */ #define PLAYBACK_PERIOD_SIZE 0x05 /* Playback period size. win2000 uses 0x04000000 */ /* SIZE[31:16], Default: 0x0 */ diff --git a/sound/pci/emu10k1/emu10k1x.c b/sound/pci/emu10k1/emu10k1x.c index df47f738098d..0c701e4ec8a5 100644 --- a/sound/pci/emu10k1/emu10k1x.c +++ b/sound/pci/emu10k1/emu10k1x.c @@ -114,7 +114,7 @@ MODULE_PARM_DESC(enable, "Enable the EMU10K1X soundcard."); */ #define PLAYBACK_LIST_SIZE 0x01 /* Size of list in bytes << 16. E.g. 8 periods -> 0x00380000 */ #define PLAYBACK_LIST_PTR 0x02 /* Pointer to the current period being played */ -#define PLAYBACK_DMA_ADDR 0x04 /* Playback DMA addresss */ +#define PLAYBACK_DMA_ADDR 0x04 /* Playback DMA address */ #define PLAYBACK_PERIOD_SIZE 0x05 /* Playback period size */ #define PLAYBACK_POINTER 0x06 /* Playback period pointer. Sample currently in DAC */ #define PLAYBACK_UNKNOWN1 0x07 diff --git a/sound/pci/emu10k1/p16v.h b/sound/pci/emu10k1/p16v.h index 153214940336..00f4817533b1 100644 --- a/sound/pci/emu10k1/p16v.h +++ b/sound/pci/emu10k1/p16v.h @@ -96,7 +96,7 @@ #define PLAYBACK_LIST_SIZE 0x01 /* Size of list in bytes << 16. E.g. 
8 periods -> 0x00380000 */ #define PLAYBACK_LIST_PTR 0x02 /* Pointer to the current period being played */ #define PLAYBACK_UNKNOWN3 0x03 /* Not used */ -#define PLAYBACK_DMA_ADDR 0x04 /* Playback DMA addresss */ +#define PLAYBACK_DMA_ADDR 0x04 /* Playback DMA address */ #define PLAYBACK_PERIOD_SIZE 0x05 /* Playback period size. win2000 uses 0x04000000 */ #define PLAYBACK_POINTER 0x06 /* Playback period pointer. Used with PLAYBACK_LIST_PTR to determine buffer position currently in DAC */ #define PLAYBACK_FIFO_END_ADDRESS 0x07 /* Playback FIFO end address */ diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c index 23a58f0d6cb9..7c17f45d876d 100644 --- a/sound/pci/es1968.c +++ b/sound/pci/es1968.c @@ -220,7 +220,7 @@ MODULE_PARM_DESC(joystick, "Enable joystick."); #define RINGB_EN_2CODEC 0x0020 #define RINGB_SING_BIT_DUAL 0x0040 -/* ****Port Adresses**** */ +/* ****Port Addresses**** */ /* Write & Read */ #define ESM_INDEX 0x02 diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c index 0c98ef9156d8..f5eadfc0672a 100644 --- a/sound/pci/rme9652/hdspm.c +++ b/sound/pci/rme9652/hdspm.c @@ -487,7 +487,7 @@ struct hdspm { struct snd_kcontrol *playback_mixer_ctls[HDSPM_MAX_CHANNELS]; /* but input to much, so not used */ struct snd_kcontrol *input_mixer_ctls[HDSPM_MAX_CHANNELS]; - /* full mixer accessable over mixer ioctl or hwdep-device */ + /* full mixer accessible over mixer ioctl or hwdep-device */ struct hdspm_mixer *mixer; }; @@ -550,7 +550,7 @@ static inline int HDSPM_bit2freq(int n) return bit2freq_tab[n]; } -/* Write/read to/from HDSPM with Adresses in Bytes +/* Write/read to/from HDSPM with Addresses in Bytes not words but only 32Bit writes are allowed */ static inline void hdspm_write(struct hdspm * hdspm, unsigned int reg, @@ -2908,7 +2908,7 @@ static int snd_hdspm_create_controls(struct snd_card *card, struct hdspm * hdspm /* Channel playback mixer as default control Note: the whole matrix would be 128*HDSPM_MIXER_CHANNELS Faders, - thats too * big for any alsamixer they are accesible via special + thats too * big for any alsamixer they are accessible via special IOCTL on hwdep and the mixer 2dimensional mixer control */ diff --git a/sound/soc/codecs/max98088.c b/sound/soc/codecs/max98088.c index bc22ee93a75d..43316a70be05 100644 --- a/sound/soc/codecs/max98088.c +++ b/sound/soc/codecs/max98088.c @@ -1953,7 +1953,7 @@ static int max98088_probe(struct snd_soc_codec *codec) return ret; } - /* initalize private data */ + /* initialize private data */ max98088->sysclk = (unsigned)-1; max98088->eq_textcnt = 0; diff --git a/sound/soc/s3c24xx/smdk_spdif.c b/sound/soc/s3c24xx/smdk_spdif.c index f31d22ad7c88..11c88b13dd2e 100644 --- a/sound/soc/s3c24xx/smdk_spdif.c +++ b/sound/soc/s3c24xx/smdk_spdif.c @@ -61,7 +61,7 @@ static int set_audio_clock_heirachy(struct platform_device *pdev) goto out3; } - /* Set audio clock heirachy for S/PDIF */ + /* Set audio clock hierarchy for S/PDIF */ clk_set_parent(mout_epll, fout_epll); clk_set_parent(sclk_audio0, mout_epll); clk_set_parent(sclk_spdif, sclk_audio0); @@ -79,7 +79,7 @@ out1: /* We should haved to set clock directly on this part because of clock * scheme of Samsudng SoCs did not support to set rates from abstrct - * clock of it's heirachy. + * clock of it's hierarchy. 
*/ static int set_audio_clock_rate(unsigned long epll_rate, unsigned long audio_rate) @@ -197,7 +197,7 @@ static int __init smdk_init(void) if (ret) goto err1; - /* Set audio clock heirachy manually */ + /* Set audio clock hierarchy manually */ ret = set_audio_clock_heirachy(smdk_snd_spdif_device); if (ret) goto err1; -- cgit v1.2.3-71-gd317 From 4a92379bdfb48680a5e6775dd53a586df7b6b0b1 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 21 Oct 2010 10:29:19 +0100 Subject: slub tracing: move trace calls out of always inlined functions to reduce kernel code size Having the trace calls defined in the always inlined kmalloc functions in include/linux/slub_def.h causes a lot of code duplication as the trace functions get instantiated for each kmalloc call site. This can simply be removed by pushing the trace calls down into the functions in slub.c. On my x86_64 build this patch shrinks the code size of the kernel by approx 36K and also shrinks the code size of many modules -- too many to list here ;) size vmlinux (2.6.36) reports text data bss dec hex filename 5410611 743172 828928 6982711 6a8c37 vmlinux 5373738 744244 828928 6946910 6a005e vmlinux + patch The resulting kernel has had some testing & kmalloc trace still seems to work. This patch - moves trace_kmalloc out of the inlined kmalloc() and pushes it down into kmem_cache_alloc_trace() so that it only gets instantiated once. - renames kmem_cache_alloc_notrace() to kmem_cache_alloc_trace() to indicate that it now does have tracing. (Maybe this would be better called something like kmalloc_kmem_cache?) - adds a new function kmalloc_order() to handle allocation and tracing of large allocations of page order. - removes tracing from the inlined kmalloc_large(), replacing it with a call to kmalloc_order(); - moves tracing out of the inlined kmalloc_node() and pushes it down into kmem_cache_alloc_node_trace() - renames kmem_cache_alloc_node_notrace() to kmem_cache_alloc_node_trace() - removes the include of trace/events/kmem.h from slub_def.h. (A simplified sketch of the resulting pattern follows this list.)
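To make the reorganization above easier to follow, here is a minimal, self-contained user-space sketch of the pattern the patch applies: the always-inline wrapper in the header keeps only a tail call, and the single out-of-line *_trace() helper is the only place the trace hook is instantiated. All names in the sketch (my_alloc, my_alloc_trace, the printf standing in for trace_kmalloc) are illustrative only; the real functions are kmem_cache_alloc_trace(), kmalloc_order_trace() and kmem_cache_alloc_node_trace(), as shown in the diff below.

#include <stdio.h>
#include <stdlib.h>

/* Out-of-line helper, defined exactly once: the trace hook (a printf
 * here, trace_kmalloc() in the kernel) lives here and nowhere else. */
void *my_alloc_trace(size_t size)
{
	void *ret = malloc(size);
	printf("alloc %zu -> %p\n", size, ret);
	return ret;
}

/* Header-style always-inline wrapper: each call site now emits only a
 * tail call instead of its own copy of the tracing code. */
static inline __attribute__((always_inline)) void *my_alloc(size_t size)
{
	return my_alloc_trace(size);
}

int main(void)
{
	void *a = my_alloc(32);
	void *b = my_alloc(64);
	free(a);
	free(b);
	return 0;
}

This is why the vmlinux text size drops in the numbers above: the tracing body is no longer duplicated at every kmalloc() call site.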
v2 - keep kmalloc_order_trace inline when !CONFIG_TRACE Signed-off-by: Richard Kennedy Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 55 +++++++++++++++++++++++------------------------- mm/slub.c | 30 ++++++++++++++++++++------ 2 files changed, 49 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index e4f5ed180b9b..8b6e8ae5d5ca 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,9 +10,8 @@ #include #include #include -#include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -216,31 +215,40 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); +static __always_inline void * +kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + kmemleak_alloc(ret, size, 1, flags); + return ret; +} + #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags); +extern void * +kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size); +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); #else static __always_inline void * -kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { return kmem_cache_alloc(s, gfpflags); } + +static __always_inline void * +kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +{ + return kmalloc_order(size, flags, order); +} #endif static __always_inline void *kmalloc_large(size_t size, gfp_t flags) { unsigned int order = get_order(size); - void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); - - kmemleak_alloc(ret, size, 1, flags); - trace_kmalloc(_THIS_IP_, ret, size, PAGE_SIZE << order, flags); - - return ret; + return kmalloc_order_trace(size, flags, order); } static __always_inline void *kmalloc(size_t size, gfp_t flags) { - void *ret; - if (__builtin_constant_p(size)) { if (size > SLUB_MAX_SIZE) return kmalloc_large(size, flags); @@ -251,11 +259,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) if (!s) return ZERO_SIZE_PTR; - ret = kmem_cache_alloc_notrace(s, flags); - - trace_kmalloc(_THIS_IP_, ret, size, s->size, flags); - - return ret; + return kmem_cache_alloc_trace(s, flags, size); } } return __kmalloc(size, flags); @@ -266,14 +270,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, +extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node); + int node, size_t size); #else static __always_inline void * -kmem_cache_alloc_node_notrace(struct kmem_cache *s, +kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node) + int node, size_t size) { return kmem_cache_alloc_node(s, gfpflags, node); } @@ -281,8 +285,6 @@ kmem_cache_alloc_node_notrace(struct kmem_cache *s, static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { - void *ret; - if (__builtin_constant_p(size) && size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); @@ -290,12 +292,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) if (!s) return ZERO_SIZE_PTR; - ret = 
kmem_cache_alloc_node_notrace(s, flags, node); - - trace_kmalloc_node(_THIS_IP_, ret, - size, s->size, flags, node); - - return ret; + return kmem_cache_alloc_node_trace(s, flags, node, size); } return __kmalloc_node(size, flags, node); } diff --git a/mm/slub.c b/mm/slub.c index 8fd5401bb071..7e657aa19475 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -28,6 +28,8 @@ #include #include +#include + /* * Lock order: * 1. slab_lock(page) @@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_trace); + +void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { - return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = kmalloc_order(size, flags, order); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmalloc_order_trace); #endif #ifdef CONFIG_NUMA @@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node) + int node, size_t size) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif #endif -- cgit v1.2.3-71-gd317 From 074e61ec3751da9ab88ee66d3818574556c03489 Mon Sep 17 00:00:00 2001 From: James Morris Date: Wed, 10 Nov 2010 09:01:31 +1100 Subject: kernel: add roundup() code comment from akpm Add roundup() code comment from akpm. Signed-off-by: Andrew Morton Signed-off-by: James Morris --- include/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b526947bdf48..3f648d204c37 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -58,6 +58,8 @@ extern const char linux_proc_banner[]; #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) + +/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */ #define roundup(x, y) ( \ { \ const typeof(y) __y = y; \ -- cgit v1.2.3-71-gd317 From e09b457bdb7e8d23fc54dcef0930ac697d8de895 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:17 +0100 Subject: block: simplify holder symlink handling Code to manage symlinks in /sys/block/*/{holders|slaves} are overly complex with multiple holder considerations, redundant extra references to all involved kobjects, unused generic kobject holder support and unnecessary mixup with bd_claim/release functionalities. Strip it down to what's necessary (single gendisk holder) and make it use a separate interface. This is a step for cleaning up bd_claim/release. This patch makes dm-table slightly more complex but it will be simplified again with further changes. 
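For orientation, this is roughly what a holder driver's open/close sequence looks like after this patch, condensed from the open_dev()/close_dev() conversion in the dm-table hunk below. attach_slave()/detach_slave() are made-up wrapper names used purely for illustration, and the snippet assumes the usual in-kernel context (linux/fs.h, linux/genhd.h); only the call sequence is the point:

/* Open side: open the slave bdev, claim it, then create the
 * holders/slaves symlinks via the new helper. */
static int attach_slave(dev_t dev, fmode_t mode, void *holder,
			struct gendisk *disk, struct block_device **res)
{
	struct block_device *bdev = open_by_devnum(dev, mode);
	int r;

	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_claim(bdev, holder);
	if (r) {
		blkdev_put(bdev, mode);
		return r;
	}

	/* new helper; with bd_claim() above, replaces bd_claim_by_disk() */
	r = bd_link_disk_holder(bdev, disk);
	if (r) {
		bd_release(bdev);
		blkdev_put(bdev, mode);
		return r;
	}

	*res = bdev;
	return 0;
}

/* Close side undoes the three steps in reverse order. */
static void detach_slave(struct block_device *bdev, fmode_t mode)
{
	bd_unlink_disk_holder(bdev);	/* with bd_release(), replaces bd_release_from_disk() */
	bd_release(bdev);
	blkdev_put(bdev, mode);
}

The follow-up patch further down folds the claim into blkdev_get()/blkdev_put() via FMODE_EXCL and removes the symlinks automatically on the final exclusive put, which is why these call sites shrink again there.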
Signed-off-by: Tejun Heo Acked-by: Neil Brown Acked-by: Mike Snitzer Cc: dm-devel@redhat.com --- drivers/md/dm-table.c | 23 +++- drivers/md/md.c | 4 +- fs/block_dev.c | 322 +++++++------------------------------------------- include/linux/fs.h | 16 ++- 4 files changed, 74 insertions(+), 291 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 90267f8d64ee..2c876ffc63df 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -328,12 +328,22 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, bdev = open_by_devnum(dev, d->dm_dev.mode); if (IS_ERR(bdev)) return PTR_ERR(bdev); - r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); - if (r) + + r = bd_claim(bdev, _claim_ptr); + if (r) { blkdev_put(bdev, d->dm_dev.mode); - else - d->dm_dev.bdev = bdev; - return r; + return r; + } + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + bd_release(bdev); + blkdev_put(bdev, d->dm_dev.mode); + return r; + } + + d->dm_dev.bdev = bdev; + return 0; } /* @@ -344,7 +354,8 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) if (!d->dm_dev.bdev) return; - bd_release_from_disk(d->dm_dev.bdev, dm_disk(md)); + bd_unlink_disk_holder(d->dm_dev.bdev); + bd_release(d->dm_dev.bdev); blkdev_put(d->dm_dev.bdev, d->dm_dev.mode); d->dm_dev.bdev = NULL; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e957f3140a8..c47644fca1a1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1880,7 +1880,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); list_add_rcu(&rdev->same_set, &mddev->disks); - bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); + bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; @@ -1907,7 +1907,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } - bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); + bd_unlink_disk_holder(rdev->bdev); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; diff --git a/fs/block_dev.c b/fs/block_dev.c index 06e8ff12b97c..9329068684d2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -426,9 +426,6 @@ static void init_once(void *foo) mutex_init(&bdev->bd_mutex); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_list); -#endif inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -881,314 +878,83 @@ void bd_release(struct block_device *bdev) EXPORT_SYMBOL(bd_release); #ifdef CONFIG_SYSFS -/* - * Functions for bd_claim_by_kobject / bd_release_from_kobject - * - * If a kobject is passed to bd_claim_by_kobject() - * and the kobject has a parent directory, - * following symlinks are created: - * o from the kobject to the claimed bdev - * o from "holders" directory of the bdev to the parent of the kobject - * bd_release_from_kobject() removes these symlinks. 
- * - * Example: - * If /dev/dm-0 maps to /dev/sda, kobject corresponding to - * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - */ - static int add_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return 0; return sysfs_create_link(from, to, kobject_name(to)); } static void del_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return; sysfs_remove_link(from, kobject_name(to)); } -/* - * 'struct bd_holder' contains pointers to kobjects symlinked by - * bd_claim_by_kobject. - * It's connected to bd_holder_list which is protected by bdev->bd_sem. - */ -struct bd_holder { - struct list_head list; /* chain of holders of the bdev */ - int count; /* references from the holder */ - struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ - struct kobject *hdev; /* e.g. "/block/dm-0" */ - struct kobject *hdir; /* e.g. "/block/sda/holders" */ - struct kobject *sdev; /* e.g. "/block/sda" */ -}; - -/* - * Get references of related kobjects at once. - * Returns 1 on success. 0 on failure. - * - * Should call bd_holder_release_dirs() after successful use. - */ -static int bd_holder_grab_dirs(struct block_device *bdev, - struct bd_holder *bo) -{ - if (!bdev || !bo) - return 0; - - bo->sdir = kobject_get(bo->sdir); - if (!bo->sdir) - return 0; - - bo->hdev = kobject_get(bo->sdir->parent); - if (!bo->hdev) - goto fail_put_sdir; - - bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); - if (!bo->sdev) - goto fail_put_hdev; - - bo->hdir = kobject_get(bdev->bd_part->holder_dir); - if (!bo->hdir) - goto fail_put_sdev; - - return 1; - -fail_put_sdev: - kobject_put(bo->sdev); -fail_put_hdev: - kobject_put(bo->hdev); -fail_put_sdir: - kobject_put(bo->sdir); - - return 0; -} - -/* Put references of related kobjects at once. */ -static void bd_holder_release_dirs(struct bd_holder *bo) -{ - kobject_put(bo->hdir); - kobject_put(bo->sdev); - kobject_put(bo->hdev); - kobject_put(bo->sdir); -} - -static struct bd_holder *alloc_bd_holder(struct kobject *kobj) -{ - struct bd_holder *bo; - - bo = kzalloc(sizeof(*bo), GFP_KERNEL); - if (!bo) - return NULL; - - bo->count = 1; - bo->sdir = kobj; - - return bo; -} - -static void free_bd_holder(struct bd_holder *bo) -{ - kfree(bo); -} - /** - * find_bd_holder - find matching struct bd_holder from the block device + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk * - * @bdev: struct block device to be searched - * @bo: target struct bd_holder + * This functions creates the following sysfs symlinks. * - * Returns matching entry with @bo in @bdev->bd_holder_list. - * If found, increment the reference count and return the pointer. - * If not found, returns NULL. 
- */ -static struct bd_holder *find_bd_holder(struct block_device *bdev, - struct bd_holder *bo) -{ - struct bd_holder *tmp; - - list_for_each_entry(tmp, &bdev->bd_holder_list, list) - if (tmp->sdir == bo->sdir) { - tmp->count++; - return tmp; - } - - return NULL; -} - -/** - * add_bd_holder - create sysfs symlinks for bd_claim() relationship + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk * - * @bdev: block device to be bd_claimed - * @bo: preallocated and initialized by alloc_bd_holder() + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: * - * Add @bo to @bdev->bd_holder_list, create symlinks. + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 * - * Returns 0 if symlinks are created. - * Returns -ve if something fails. - */ -static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) -{ - int err; - - if (!bo) - return -EINVAL; - - if (!bd_holder_grab_dirs(bdev, bo)) - return -EBUSY; - - err = add_symlink(bo->sdir, bo->sdev); - if (err) - return err; - - err = add_symlink(bo->hdir, bo->hdev); - if (err) { - del_symlink(bo->sdir, bo->sdev); - return err; - } - - list_add_tail(&bo->list, &bdev->bd_holder_list); - return 0; -} - -/** - * del_bd_holder - delete sysfs symlinks for bd_claim() relationship - * - * @bdev: block device to be bd_claimed - * @kobj: holder's kobject - * - * If there is matching entry with @kobj in @bdev->bd_holder_list - * and no other bd_claim() from the same kobject, - * remove the struct bd_holder from the list, delete symlinks for it. - * - * Returns a pointer to the struct bd_holder when it's removed from the list - * and ready to be freed. - * Returns NULL if matching claim isn't found or there is other bd_claim() - * by the same kobject. - */ -static struct bd_holder *del_bd_holder(struct block_device *bdev, - struct kobject *kobj) -{ - struct bd_holder *bo; - - list_for_each_entry(bo, &bdev->bd_holder_list, list) { - if (bo->sdir == kobj) { - bo->count--; - BUG_ON(bo->count < 0); - if (!bo->count) { - list_del(&bo->list); - del_symlink(bo->sdir, bo->sdev); - del_symlink(bo->hdir, bo->hdev); - bd_holder_release_dirs(bo); - return bo; - } - break; - } - } - - return NULL; -} - -/** - * bd_claim_by_kobject - bd_claim() with additional kobject signature - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @kobj: holder's kobject + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. * - * Do bd_claim() and if it succeeds, create sysfs symlinks between - * the bdev and the holder's kobject. - * Use bd_release_from_kobject() when relesing the claimed bdev. + * CONTEXT: + * Might sleep. * - * Returns 0 on success. (same as bd_claim()) - * Returns errno on failure. + * RETURNS: + * 0 on success, -errno on failure. 
*/ -static int bd_claim_by_kobject(struct block_device *bdev, void *holder, - struct kobject *kobj) +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { - int err; - struct bd_holder *bo, *found; - - if (!kobj) - return -EINVAL; - - bo = alloc_bd_holder(kobj); - if (!bo) - return -ENOMEM; + int ret = 0; mutex_lock(&bdev->bd_mutex); - err = bd_claim(bdev, holder); - if (err) - goto fail; + WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk); - found = find_bd_holder(bdev, bo); - if (found) - goto fail; + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + goto out_unlock; - err = add_bd_holder(bdev, bo); - if (err) - bd_release(bdev); - else - bo = NULL; -fail: - mutex_unlock(&bdev->bd_mutex); - free_bd_holder(bo); - return err; -} + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_unlock; -/** - * bd_release_from_kobject - bd_release() with additional kobject signature - * - * @bdev: block device to be released - * @kobj: holder's kobject - * - * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). - */ -static void bd_release_from_kobject(struct block_device *bdev, - struct kobject *kobj) -{ - if (!kobj) - return; + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + goto out_unlock; + } - mutex_lock(&bdev->bd_mutex); - bd_release(bdev); - free_bd_holder(del_bd_holder(bdev, kobj)); + bdev->bd_holder_disk = disk; +out_unlock: mutex_unlock(&bdev->bd_mutex); + return ret; } +EXPORT_SYMBOL_GPL(bd_link_disk_holder); -/** - * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @disk: holder's gendisk - * - * Call bd_claim_by_kobject() with getting @disk->slave_dir. - */ -int bd_claim_by_disk(struct block_device *bdev, void *holder, - struct gendisk *disk) +void bd_unlink_disk_holder(struct block_device *bdev) { - return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); -} -EXPORT_SYMBOL_GPL(bd_claim_by_disk); + struct gendisk *disk = bdev->bd_holder_disk; -/** - * bd_release_from_disk - wrapper function for bd_release_from_kobject() - * - * @bdev: block device to be claimed - * @disk: holder's gendisk - * - * Call bd_release_from_kobject() and put @disk->slave_dir. 
- */ -void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) -{ - bd_release_from_kobject(bdev, disk->slave_dir); - kobject_put(disk->slave_dir); + bdev->bd_holder_disk = NULL; + if (!disk) + return; + + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); } -EXPORT_SYMBOL_GPL(bd_release_from_disk); +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 334d68a17108..66b7f2c5d7e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -663,7 +663,7 @@ struct block_device { void * bd_holder; int bd_holders; #ifdef CONFIG_SYSFS - struct list_head bd_holder_list; + struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ #endif struct block_device * bd_contains; unsigned bd_block_size; @@ -2042,11 +2042,17 @@ extern int blkdev_put(struct block_device *, fmode_t); extern int bd_claim(struct block_device *, void *); extern void bd_release(struct block_device *); #ifdef CONFIG_SYSFS -extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); -extern void bd_release_from_disk(struct block_device *, struct gendisk *); +extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +extern void bd_unlink_disk_holder(struct block_device *bdev); #else -#define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) -#define bd_release_from_disk(bdev, disk) bd_release(bdev) +static inline int bd_link_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + return 0; +} +static inline void bd_unlink_disk_holder(struct block_device *bdev) +{ +} #endif #endif -- cgit v1.2.3-71-gd317 From e525fd89d380c4a94c0d63913a1dd1a593ed25e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:17 +0100 Subject: block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). 
* open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. Signed-off-by: Tejun Heo Acked-by: Neil Brown Acked-by: Ryusuke Konishi Acked-by: Mike Snitzer Acked-by: Philipp Reisner Cc: Peter Osterlund Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joel Becker Cc: Alex Elder Cc: Christoph Hellwig Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen Cc: Scott Branden Cc: Chris Mason Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro --- block/ioctl.c | 5 +- drivers/block/drbd/drbd_int.h | 2 - drivers/block/drbd/drbd_main.c | 7 +- drivers/block/drbd/drbd_nl.c | 103 ++++++++++----------------- drivers/block/pktcdvd.c | 22 +++--- drivers/char/raw.c | 14 ++-- drivers/md/dm-table.c | 15 +--- drivers/md/md.c | 14 +--- drivers/mtd/devices/block2mtd.c | 17 ++--- drivers/s390/block/dasd_genhd.c | 2 +- fs/block_dev.c | 149 ++++++++++++++-------------------------- fs/btrfs/volumes.c | 14 ++-- fs/ext3/super.c | 12 +--- fs/ext4/super.c | 12 +--- fs/gfs2/ops_fstype.c | 4 +- fs/jfs/jfs_logmgr.c | 17 ++--- fs/logfs/dev_bdev.c | 4 +- fs/nilfs2/super.c | 4 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/partitions/check.c | 2 +- fs/reiserfs/journal.c | 17 ++--- fs/super.c | 14 ++-- fs/xfs/linux-2.6/xfs_super.c | 2 +- include/linux/fs.h | 14 ++-- kernel/power/swap.c | 5 +- mm/swapfile.c | 7 +- 26 files changed, 162 insertions(+), 318 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index d724ceb1d465..cc46d499fd27 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EINVAL; if (get_user(n, (int __user *) arg)) return -EFAULT; - if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) + if (!(mode & FMODE_EXCL) && + blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) return -EBUSY; ret = set_blocksize(bdev, n); if (!(mode & FMODE_EXCL)) - bd_release(bdev); + blkdev_put(bdev, mode | FMODE_EXCL); return ret; case BLKPG: ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9bdcf4393c0a..0590b9f67ec6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -923,8 +923,6 @@ struct drbd_md { struct drbd_backing_dev { struct block_device *backing_bdev; struct block_device *md_bdev; - struct file *lo_file; - struct file *md_file; struct drbd_md md; struct disk_conf dc; /* The user provided config... 
*/ sector_t known_size; /* last known size of that backing device */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 25c7a73c5062..7ec1a82064a9 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3361,11 +3361,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) if (ldev == NULL) return; - bd_release(ldev->backing_bdev); - bd_release(ldev->md_bdev); - - fput(ldev->lo_file); - fput(ldev->md_file); + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(ldev); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 87925e97e613..fd0346090289 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -855,7 +855,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp sector_t max_possible_sectors; sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ - struct inode *inode, *inode2; + struct block_device *bdev; struct lru_cache *resync_lru = NULL; union drbd_state ns, os; unsigned int max_seg_s; @@ -902,46 +902,40 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } } - nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); - if (IS_ERR(nbc->lo_file)) { + bdev = open_bdev_exclusive(nbc->dc.backing_dev, + FMODE_READ | FMODE_WRITE, mdev); + if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, - PTR_ERR(nbc->lo_file)); - nbc->lo_file = NULL; + PTR_ERR(bdev)); retcode = ERR_OPEN_DISK; goto fail; } + nbc->backing_bdev = bdev; - inode = nbc->lo_file->f_dentry->d_inode; - - if (!S_ISBLK(inode->i_mode)) { - retcode = ERR_DISK_NOT_BDEV; - goto fail; - } - - nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); - if (IS_ERR(nbc->md_file)) { + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = open_bdev_exclusive(nbc->dc.meta_dev, + FMODE_READ | FMODE_WRITE, + (nbc->dc.meta_dev_idx < 0) ? 
+ (void *)mdev : (void *)drbd_m_holder); + if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, - PTR_ERR(nbc->md_file)); - nbc->md_file = NULL; + PTR_ERR(bdev)); retcode = ERR_OPEN_MD_DISK; goto fail; } + nbc->md_bdev = bdev; - inode2 = nbc->md_file->f_dentry->d_inode; - - if (!S_ISBLK(inode2->i_mode)) { - retcode = ERR_MD_NOT_BDEV; - goto fail; - } - - nbc->backing_bdev = inode->i_bdev; - if (bd_claim(nbc->backing_bdev, mdev)) { - printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", - nbc->backing_bdev, mdev, - nbc->backing_bdev->bd_holder, - nbc->backing_bdev->bd_contains->bd_holder, - nbc->backing_bdev->bd_holders); - retcode = ERR_BDCLAIM_DISK; + if ((nbc->backing_bdev == nbc->md_bdev) != + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; goto fail; } @@ -950,28 +944,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp offsetof(struct bm_extent, lce)); if (!resync_lru) { retcode = ERR_NOMEM; - goto release_bdev_fail; - } - - /* meta_dev_idx >= 0: external fixed size, - * possibly multiple drbd sharing one meta device. - * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is - * not yet used by some other drbd minor! - * (if you use drbd.conf + drbdadm, - * that should check it for you already; but if you don't, or someone - * fooled it, we need to double check here) */ - nbc->md_bdev = inode2->i_bdev; - if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev - : (void *) drbd_m_holder)) { - retcode = ERR_BDCLAIM_MD_DISK; - goto release_bdev_fail; - } - - if ((nbc->backing_bdev == nbc->md_bdev) != - (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || - nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { - retcode = ERR_MD_IDX_INVALID; - goto release_bdev2_fail; + goto fail; } /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ @@ -982,7 +955,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); retcode = ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + goto fail; } if (nbc->dc.meta_dev_idx < 0) { @@ -999,7 +972,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); - goto release_bdev2_fail; + goto fail; } /* Make sure the new disk is big enough @@ -1007,7 +980,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { retcode = ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + goto fail; } nbc->known_size = drbd_get_capacity(nbc->backing_bdev); @@ -1030,7 +1003,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); drbd_resume_io(mdev); if (retcode < SS_SUCCESS) - goto release_bdev2_fail; + goto fail; if (!get_ldev_if_state(mdev, D_ATTACHING)) goto force_diskless; @@ -1264,18 +1237,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp force_diskless: drbd_force_state(mdev, NS(disk, D_DISKLESS)); drbd_md_sync(mdev); - release_bdev2_fail: - if (nbc) - bd_release(nbc->md_bdev); - release_bdev_fail: - if (nbc) - bd_release(nbc->backing_bdev); fail: if (nbc) { - if 
(nbc->lo_file) - fput(nbc->lo_file); - if (nbc->md_file) - fput(nbc->md_file); + if (nbc->backing_bdev) + blkdev_put(nbc->backing_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (nbc->md_bdev) + blkdev_put(nbc->md_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(nbc); } lc_destroy(resync_lru); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 19b3568e9326..77d70eebb6b2 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2296,15 +2296,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * so bdget() can't fail. */ bdget(pd->bdev->bd_dev); - if ((ret = blkdev_get(pd->bdev, FMODE_READ))) + if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd))) goto out; - if ((ret = bd_claim(pd->bdev, pd))) - goto out_putdev; - if ((ret = pkt_get_last_written(pd, &lba))) { printk(DRIVER_NAME": pkt_get_last_written failed\n"); - goto out_unclaim; + goto out_putdev; } set_capacity(pd->disk, lba << 2); @@ -2314,7 +2311,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) q = bdev_get_queue(pd->bdev); if (write) { if ((ret = pkt_open_write(pd))) - goto out_unclaim; + goto out_putdev; /* * Some CDRW drives can not handle writes larger than one packet, * even if the size is a multiple of the packet size. @@ -2329,23 +2326,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) } if ((ret = pkt_set_segment_merging(pd, q))) - goto out_unclaim; + goto out_putdev; if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { printk(DRIVER_NAME": not enough memory for buffers\n"); ret = -ENOMEM; - goto out_unclaim; + goto out_putdev; } printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); } return 0; -out_unclaim: - bd_release(pd->bdev); out_putdev: - blkdev_put(pd->bdev, FMODE_READ); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); out: return ret; } @@ -2362,8 +2357,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_lock_door(pd, 0); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - bd_release(pd->bdev); - blkdev_put(pd->bdev, FMODE_READ); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); pkt_shrink_pktlist(pd); } @@ -2733,7 +2727,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) bdev = bdget(dev); if (!bdev) return -ENOMEM; - ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY); + ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); if (ret) return ret; diff --git a/drivers/char/raw.c b/drivers/char/raw.c index bfe25ea9766b..b4b9d5a47885 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -65,15 +65,12 @@ static int raw_open(struct inode *inode, struct file *filp) if (!bdev) goto out; igrab(bdev->bd_inode); - err = blkdev_get(bdev, filp->f_mode); + err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open); if (err) goto out; - err = bd_claim(bdev, raw_open); - if (err) - goto out1; err = set_blocksize(bdev, bdev_logical_block_size(bdev)); if (err) - goto out2; + goto out1; filp->f_flags |= O_DIRECT; filp->f_mapping = bdev->bd_inode->i_mapping; if (++raw_devices[minor].inuse == 1) @@ -83,10 +80,8 @@ static int raw_open(struct inode *inode, struct file *filp) mutex_unlock(&raw_mutex); return 0; -out2: - bd_release(bdev); out1: - blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode | FMODE_EXCL); out: mutex_unlock(&raw_mutex); return err; @@ -110,8 +105,7 @@ static int raw_release(struct inode *inode, struct file *filp) } mutex_unlock(&raw_mutex); - bd_release(bdev); - blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode | 
FMODE_EXCL); return 0; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2c876ffc63df..9e88ca0c55e9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -325,20 +325,13 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, BUG_ON(d->dm_dev.bdev); - bdev = open_by_devnum(dev, d->dm_dev.mode); + bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); if (IS_ERR(bdev)) return PTR_ERR(bdev); - r = bd_claim(bdev, _claim_ptr); - if (r) { - blkdev_put(bdev, d->dm_dev.mode); - return r; - } - r = bd_link_disk_holder(bdev, dm_disk(md)); if (r) { - bd_release(bdev); - blkdev_put(bdev, d->dm_dev.mode); + blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); return r; } @@ -354,9 +347,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) if (!d->dm_dev.bdev) return; - bd_unlink_disk_holder(d->dm_dev.bdev); - bd_release(d->dm_dev.bdev); - blkdev_put(d->dm_dev.bdev, d->dm_dev.mode); + blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); d->dm_dev.bdev = NULL; } diff --git a/drivers/md/md.c b/drivers/md/md.c index c47644fca1a1..6af951ffe0bb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1907,7 +1907,6 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } - bd_unlink_disk_holder(rdev->bdev); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; @@ -1935,19 +1934,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (mdk_rdev_t *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); - if (err) { - printk(KERN_ERR "md: could not bd_claim %s.\n", - bdevname(bdev, b)); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return err; - } if (!shared) set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; @@ -1960,8 +1953,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) rdev->bdev = NULL; if (!bdev) MD_BUG(); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } void md_autodetect_dev(dev_t dev); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index a9e2d3b38aeb..aa557beb8f51 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -224,7 +224,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) if (dev->blkdev) { invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1); - close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE); + blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } kfree(dev); @@ -234,7 +234,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) /* FIXME: ensure that mtd->size % erase_size == 0 */ static struct block2mtd_dev *add_device(char *devname, int erase_size) { - const fmode_t mode = FMODE_READ | FMODE_WRITE; + const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; struct block_device *bdev; struct block2mtd_dev *dev; char *name; @@ -255,17 +255,8 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) to resolve the device name by other means. 
*/ dev_t devt = name_to_dev_t(devname); - if (devt) { - bdev = open_by_devnum(devt, mode); - if (!IS_ERR(bdev)) { - int ret; - ret = bd_claim(bdev, dev); - if (ret) { - blkdev_put(bdev, mode); - bdev = ERR_PTR(ret); - } - } - } + if (devt) + bdev = open_by_devnum(devt, mode, dev); } #endif diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 30a1ca3d08b7..5505bc07e1e7 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -103,7 +103,7 @@ int dasd_scan_partitions(struct dasd_block *block) struct block_device *bdev; bdev = bdget_disk(block->gdp, 0); - if (!bdev || blkdev_get(bdev, FMODE_READ) < 0) + if (!bdev || blkdev_get(bdev, FMODE_READ, NULL) < 0) return -ENODEV; /* * See fs/partition/check.c:register_disk,rescan_partitions diff --git a/fs/block_dev.c b/fs/block_dev.c index 9329068684d2..fc48912354d1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -660,7 +660,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, else if (bdev->bd_contains == bdev) return true; /* is a whole device which isn't held */ - else if (whole->bd_holder == bd_claim) + else if (whole->bd_holder == bd_may_claim) return true; /* is a partition of a device that is being partitioned */ else if (whole->bd_holder != NULL) return false; /* is a partition of a held device */ @@ -807,10 +807,10 @@ static void __bd_claim(struct block_device *bdev, struct block_device *whole, { /* note that for a whole device bd_holders * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder + * be set to bd_may_claim before being set to holder */ whole->bd_holders++; - whole->bd_holder = bd_claim; + whole->bd_holder = bd_may_claim; bdev->bd_holders++; bdev->bd_holder = holder; } @@ -835,37 +835,7 @@ static void bd_finish_claiming(struct block_device *bdev, __bd_abort_claiming(whole, holder); /* not actually an abort */ } -/** - * bd_claim - claim a block device - * @bdev: block device to claim - * @holder: holder trying to claim @bdev - * - * Try to claim @bdev which must have been opened successfully. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 if successful, -EBUSY if @bdev is already claimed. 
- */ -int bd_claim(struct block_device *bdev, void *holder) -{ - struct block_device *whole = bdev->bd_contains; - int res; - - might_sleep(); - - spin_lock(&bdev_lock); - res = bd_prepare_to_claim(bdev, whole, holder); - if (res == 0) - __bd_claim(bdev, whole, holder); - spin_unlock(&bdev_lock); - - return res; -} -EXPORT_SYMBOL(bd_claim); - -void bd_release(struct block_device *bdev) +static void bd_release(struct block_device *bdev) { spin_lock(&bdev_lock); if (!--bdev->bd_contains->bd_holders) @@ -875,8 +845,6 @@ void bd_release(struct block_device *bdev) spin_unlock(&bdev_lock); } -EXPORT_SYMBOL(bd_release); - #ifdef CONFIG_SYSFS static int add_symlink(struct kobject *from, struct kobject *to) { @@ -943,7 +911,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -void bd_unlink_disk_holder(struct block_device *bdev) +static void bd_unlink_disk_holder(struct block_device *bdev) { struct gendisk *disk = bdev->bd_holder_disk; @@ -954,7 +922,9 @@ void bd_unlink_disk_holder(struct block_device *bdev) del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); } -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#else +static inline void bd_unlink_disk_holder(struct block_device *bdev) +{ } #endif /* @@ -964,12 +934,12 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * to be used for internal purposes. If you ever need it - reconsider * your API. */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode) +struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder) { struct block_device *bdev = bdget(dev); int err = -ENOMEM; if (bdev) - err = blkdev_get(bdev, mode); + err = blkdev_get(bdev, mode, holder); return err ? ERR_PTR(err) : bdev; } @@ -1235,17 +1205,37 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } -int blkdev_get(struct block_device *bdev, fmode_t mode) +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - return __blkdev_get(bdev, mode, 0); + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + if (whole) { + if (res == 0) + bd_finish_claiming(bdev, whole, holder); + else + bd_abort_claiming(whole, holder); + } + + return res; } EXPORT_SYMBOL(blkdev_get); static int blkdev_open(struct inode * inode, struct file * filp) { - struct block_device *whole = NULL; struct block_device *bdev; - int res; /* * Preserve backwards compatibility and allow large file access @@ -1266,26 +1256,9 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; - if (filp->f_mode & FMODE_EXCL) { - whole = bd_start_claiming(bdev, filp); - if (IS_ERR(whole)) { - bdput(bdev); - return PTR_ERR(whole); - } - } - filp->f_mapping = bdev->bd_inode->i_mapping; - res = blkdev_get(bdev, filp->f_mode); - - if (whole) { - if (res == 0) - bd_finish_claiming(bdev, whole, filp); - else - bd_abort_claiming(whole, filp); - } - - return res; + return blkdev_get(bdev, filp->f_mode, filp); } static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -1329,6 +1302,13 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + if (mode & FMODE_EXCL) { + mutex_lock(&bdev->bd_mutex); + 
bd_release(bdev); + if (!bdev->bd_holders) + bd_unlink_disk_holder(bdev); + mutex_unlock(&bdev->bd_mutex); + } return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); @@ -1336,8 +1316,7 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - if (bdev->bd_holder == filp) - bd_release(bdev); + return blkdev_put(bdev, filp->f_mode); } @@ -1494,55 +1473,27 @@ EXPORT_SYMBOL(lookup_bdev); */ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) { - struct block_device *bdev, *whole; + struct block_device *bdev; int error; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; - whole = bd_start_claiming(bdev, holder); - if (IS_ERR(whole)) { - bdput(bdev); - return whole; - } - - error = blkdev_get(bdev, mode); + error = blkdev_get(bdev, mode | FMODE_EXCL, holder); if (error) - goto out_abort_claiming; + return ERR_PTR(error); - error = -EACCES; - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto out_blkdev_put; + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } - bd_finish_claiming(bdev, whole, holder); return bdev; - -out_blkdev_put: - blkdev_put(bdev, mode); -out_abort_claiming: - bd_abort_claiming(whole, holder); - return ERR_PTR(error); } EXPORT_SYMBOL(open_bdev_exclusive); -/** - * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() - * - * @bdev: blockdevice to close - * @mode: mode, must match that used to open. - * - * This is the counterpart to open_bdev_exclusive(). - */ -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) -{ - bd_release(bdev); - blkdev_put(bdev, mode); -} - -EXPORT_SYMBOL(close_bdev_exclusive); - int __invalidate_device(struct block_device *bdev) { struct super_block *sb = get_super(bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d39596224d21..f1b729d3b883 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -489,7 +489,7 @@ again: continue; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode | FMODE_EXCL); device->bdev = NULL; fs_devices->open_devices--; } @@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode | FMODE_EXCL); fs_devices->open_devices--; } if (device->writeable) { @@ -638,7 +638,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags | FMODE_EXCL); error: continue; } @@ -716,7 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags | FMODE_EXCL); error: mutex_unlock(&uuid_mutex); return ret; @@ -1244,7 +1244,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode | FMODE_EXCL); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1287,7 +1287,7 @@ error_brelse: brelse(bh); error_close: if (bdev) - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: 
mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); @@ -1565,7 +1565,7 @@ out: mutex_unlock(&root->fs_info->volume_mutex); return ret; error: - close_bdev_exclusive(bdev, 0); + blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2fedaf8b5012..23e7513dba9c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -364,8 +364,7 @@ fail: */ static int ext3_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext3_blkdev_remove(struct ext3_sb_info *sbi) @@ -2136,13 +2135,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext3_msg(sb, KERN_ERR, - "error: failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 61182fe6254e..5dd0b3e76fa8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -663,8 +663,7 @@ fail: */ static int ext4_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext4_blkdev_remove(struct ext4_sb_info *sbi) @@ -3758,13 +3757,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext4_msg(sb, KERN_ERR, - "failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3eb1393f7b81..c1f0763a022b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, goto error_bdev; if (s->s_root) - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1342,7 +1342,7 @@ error_super: deactivate_locked_super(s); return ERR_PTR(error); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); return ERR_PTR(error); } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e1b8493b9aaa..5a290f22dcc3 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); if (IS_ERR(bdev)) { rc = 
-PTR_ERR(bdev); goto free; } - if ((rc = bd_claim(bdev, log))) { - goto close; - } - log->bdev = bdev; memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); @@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb) * initialize log: */ if ((rc = lmLogInit(log))) - goto unclaim; + goto close; list_add(&log->journal_list, &jfs_external_logs); @@ -1163,11 +1160,8 @@ journal_found: list_del(&log->journal_list); lbmLogShutdown(log); - unclaim: - bd_release(bdev); - close: /* close external log device */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb) bdev = log->bdev; rc = lmLogShutdown(log); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); kfree(log); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 92ca6fbe09bd..734b9025858e 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page) static void bdev_put_device(struct logfs_super *s) { - close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int bdev_can_write_buf(struct super_block *sb, u64 ofs) @@ -331,7 +331,7 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { int mtdnr = MINOR(bdev->bd_dev); - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); return logfs_get_sb_mtd(p, mtdnr); } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f804d41ec9d3..756a6798d7c8 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode | FMODE_EXCL); return root_dentry; @@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode | FMODE_EXCL); return ERR_PTR(err); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 52c7557f3e25..d0a2721eaceb 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1674,7 +1674,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; reg->hr_bdev = I_BDEV(filp->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); if (ret) { reg->hr_bdev = NULL; goto out; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 0a8b0ad0c7e2..2e6501d034ab 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -549,7 +549,7 @@ void register_disk(struct gendisk *disk) goto exit; bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ); + err = blkdev_get(bdev, FMODE_READ, NULL); if (err < 0) goto exit; blkdev_put(bdev, FMODE_READ); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 076c8b194682..b488136f5ace 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2552,8 +2552,6 @@ static int release_journal_dev(struct super_block *super, result = 0; if (journal->j_dev_bd != NULL) { - if (journal->j_dev_bd->bd_dev != super->s_dev) - bd_release(journal->j_dev_bd); result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); journal->j_dev_bd = 
NULL; } @@ -2571,7 +2569,7 @@ static int journal_init_dev(struct super_block *super, { int result; dev_t jdev; - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; char b[BDEVNAME_SIZE]; result = 0; @@ -2585,7 +2583,9 @@ static int journal_init_dev(struct super_block *super, /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2594,15 +2594,8 @@ static int journal_init_dev(struct super_block *super, "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; - } else if (jdev != super->s_dev) { - result = bd_claim(journal->j_dev_bd, journal); - if (result) { - blkdev_put(journal->j_dev_bd, blkdev_mode); - return result; - } - + } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); - } return 0; } diff --git a/fs/super.c b/fs/super.c index ca696155cd9a..22374bf0ba87 100644 --- a/fs/super.c +++ b/fs/super.c @@ -801,13 +801,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, /* * s_umount nests inside bd_mutex during - * __invalidate_device(). close_bdev_exclusive() - * acquires bd_mutex and can't be called under - * s_umount. Drop s_umount temporarily. This is safe - * as we're holding an active reference. + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. */ up_write(&s->s_umount); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; @@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); error: return ERR_PTR(error); } @@ -862,7 +862,7 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); } EXPORT_SYMBOL(kill_block_super); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f3a78fe6ae4..a1a6e5ceea67 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -623,7 +623,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 66b7f2c5d7e9..1a033e8ebe4c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2006,7 +2006,8 @@ extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); -extern struct block_device *open_by_devnum(dev_t, fmode_t); +extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode, + void *holder); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); @@ -2037,22 +2038,16 @@ extern const struct file_operations def_fifo_fops; 
extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -extern int blkdev_get(struct block_device *, fmode_t); -extern int blkdev_put(struct block_device *, fmode_t); -extern int bd_claim(struct block_device *, void *); -extern void bd_release(struct block_device *); +extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); -extern void bd_unlink_disk_holder(struct block_device *bdev); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } -static inline void bd_unlink_disk_holder(struct block_device *bdev) -{ -} #endif #endif @@ -2089,7 +2084,6 @@ extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); -extern void close_bdev_exclusive(struct block_device *, fmode_t); extern void blkdev_show(struct seq_file *,off_t); #else diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0e4a86ccf94..513a77f1a0b3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -223,7 +223,7 @@ static int swsusp_swap_check(void) return res; root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); if (res) return res; @@ -907,7 +907,8 @@ int swsusp_check(void) { int error; - hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + hib_resume_bdev = open_by_devnum(swsusp_resume_device, + FMODE_READ, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c74..b6adcfbf6f48 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1677,7 +1677,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; @@ -1939,7 +1939,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { bdev = I_BDEV(inode); - error = bd_claim(bdev, sys_swapon); + error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); if (error < 0) { bdev = NULL; error = -EINVAL; @@ -2136,7 +2137,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) bad_swap: if (bdev) { set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); swap_cgroup_swapoff(type); -- cgit v1.2.3-71-gd317 From d4d77629953eabd3c14f6fa5746f6b28babfc55f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:18 +0100 Subject: block: clean up blkdev_get() wrappers and their users After recent blkdev_get() modifications, open_by_devnum() and open_bdev_exclusive() are simple wrappers around blkdev_get(). Replace them with blkdev_get_by_dev() and blkdev_get_by_path(). 
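Purely for illustration (not part of the patch), a minimal sketch of the resulting calling convention. The function name, the holder argument and the "/dev/sdX" path are made-up placeholders; the blkdev_get_by_path()/blkdev_put() signatures are the ones introduced below.

    #include <linux/fs.h>
    #include <linux/err.h>

    /* Hypothetical caller: claim a block device by path, then release it. */
    static int example_claim_bdev(void *holder)
    {
            const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
            struct block_device *bdev;

            /* Unlike open_bdev_exclusive(), FMODE_EXCL is not added implicitly. */
            bdev = blkdev_get_by_path("/dev/sdX", mode, holder);
            if (IS_ERR(bdev))
                    return PTR_ERR(bdev);

            /* ... use bdev ... */

            /* Release with the same mode, including FMODE_EXCL. */
            blkdev_put(bdev, mode);
            return 0;
    }

blkdev_get_by_dev() follows the same pattern but takes a dev_t instead of a path.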
blkdev_get_by_dev() is identical to open_by_devnum(). blkdev_get_by_path() is slightly different in that it doesn't automatically add %FMODE_EXCL to @mode. All users are converted. Most conversions are mechanical and don't introduce any behavior difference. There are several exceptions. * btrfs now sets FMODE_EXCL in btrfs_device->mode, so there's no reason to OR it explicitly on blkdev_put(). * gfs2, nilfs2 and the generic mount_bdev() now set FMODE_EXCL in sb->s_mode. * With the above changes, sb->s_mode now always should contain FMODE_EXCL. WARN_ON_ONCE() added to kill_block_super() to detect errors. The new blkdev_get_*() functions are with proper docbook comments. While at it, add function description to blkdev_get() too. Signed-off-by: Tejun Heo Cc: Philipp Reisner Cc: Neil Brown Cc: Mike Snitzer Cc: Joern Engel Cc: Chris Mason Cc: Jan Kara Cc: "Theodore Ts'o" Cc: KONISHI Ryusuke Cc: reiserfs-devel@vger.kernel.org Cc: xfs-masters@oss.sgi.com Cc: Alexander Viro --- drivers/block/drbd/drbd_nl.c | 12 ++-- drivers/md/dm-table.c | 2 +- drivers/md/md.c | 4 +- drivers/mtd/devices/block2mtd.c | 4 +- fs/block_dev.c | 139 +++++++++++++++++++++++++++------------- fs/btrfs/volumes.c | 24 ++++--- fs/btrfs/volumes.h | 2 +- fs/ext3/super.c | 2 +- fs/ext4/super.c | 2 +- fs/gfs2/ops_fstype.c | 8 +-- fs/jfs/jfs_logmgr.c | 4 +- fs/logfs/dev_bdev.c | 3 +- fs/nilfs2/super.c | 8 +-- fs/reiserfs/journal.c | 6 +- fs/super.c | 9 +-- fs/xfs/linux-2.6/xfs_super.c | 3 +- include/linux/fs.h | 7 +- kernel/power/swap.c | 4 +- 18 files changed, 149 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index fd0346090289..650e43ba4f7c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -902,8 +902,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } } - bdev = open_bdev_exclusive(nbc->dc.backing_dev, - FMODE_READ | FMODE_WRITE, mdev); + bdev = blkdev_get_by_path(nbc->dc.backing_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, PTR_ERR(bdev)); @@ -920,10 +920,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * should check it for you already; but if you don't, or * someone fooled it, we need to double check here) */ - bdev = open_bdev_exclusive(nbc->dc.meta_dev, - FMODE_READ | FMODE_WRITE, - (nbc->dc.meta_dev_idx < 0) ? - (void *)mdev : (void *)drbd_m_holder); + bdev = blkdev_get_by_path(nbc->dc.meta_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + (nbc->dc.meta_dev_idx < 0) ? 
+ (void *)mdev : (void *)drbd_m_holder); if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, PTR_ERR(bdev)); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9e88ca0c55e9..67150c32986c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -325,7 +325,7 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, BUG_ON(d->dm_dev.bdev); - bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); + bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 6af951ffe0bb..5aaa6bfbe638 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1934,8 +1934,8 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - shared ? (mdk_rdev_t *)lock_rdev : rdev); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (mdk_rdev_t *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index aa557beb8f51..f29a6f9df6e7 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -247,7 +247,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) return NULL; /* Get a handle on the device */ - bdev = open_bdev_exclusive(devname, mode, dev); + bdev = blkdev_get_by_path(devname, mode, dev); #ifndef MODULE if (IS_ERR(bdev)) { @@ -256,7 +256,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) dev_t devt = name_to_dev_t(devname); if (devt) - bdev = open_by_devnum(devt, mode, dev); + bdev = blkdev_get_by_dev(devt, mode, dev); } #endif diff --git a/fs/block_dev.c b/fs/block_dev.c index 606a5259f87f..c1c1b8c3fb99 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -854,24 +854,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev) { } #endif -/* - * Tries to open block device by device number. Use it ONLY if you - * really do not have anything better - i.e. when you are behind a - * truly sucky interface and all you are given is a device number. _Never_ - * to be used for internal purposes. If you ever need it - reconsider - * your API. - */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, mode, holder); - return err ? ERR_PTR(err) : bdev; -} - -EXPORT_SYMBOL(open_by_devnum); - /** * flush_disk - invalidates all buffer-cache entries on a disk * @@ -1132,6 +1114,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { struct block_device *whole = NULL; @@ -1186,6 +1187,80 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) } EXPORT_SYMBOL(blkdev_get); +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. @mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; @@ -1436,34 +1511,6 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -/** - * open_bdev_exclusive - open a block device by name and set it up for use - * - * @path: special file representing the block device - * @mode: FMODE_... combination to pass be used - * @holder: owner for exclusion - * - * Open the blockdevice described by the special file at @path, claim it - * for the @holder. 
- */ -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) -{ - struct block_device *bdev; - int error; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; - - error = blkdev_get(bdev, mode | FMODE_EXCL, holder); - if (error) - return ERR_PTR(error); - - return bdev; -} - -EXPORT_SYMBOL(open_bdev_exclusive); - int __invalidate_device(struct block_device *bdev) { struct super_block *sb = get_super(bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1b729d3b883..95324e9f9280 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -489,7 +489,7 @@ again: continue; if (device->bdev) { - blkdev_put(device->bdev, device->mode | FMODE_EXCL); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; fs_devices->open_devices--; } @@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - blkdev_put(device->bdev, device->mode | FMODE_EXCL); + blkdev_put(device->bdev, device->mode); fs_devices->open_devices--; } if (device->writeable) { @@ -580,13 +580,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; + flags |= FMODE_EXCL; + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) continue; - bdev = open_bdev_exclusive(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name, flags, holder); if (IS_ERR(bdev)) { printk(KERN_INFO "open %s failed\n", device->name); goto error; @@ -638,7 +640,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - blkdev_put(bdev, flags | FMODE_EXCL); + blkdev_put(bdev, flags); error: continue; } @@ -684,7 +686,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); - bdev = open_bdev_exclusive(path, flags, holder); + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); @@ -716,7 +719,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - blkdev_put(bdev, flags | FMODE_EXCL); + blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); return ret; @@ -1179,8 +1182,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = open_bdev_exclusive(device_path, FMODE_READ, - root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -1244,7 +1247,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - blkdev_put(device->bdev, device->mode | FMODE_EXCL); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1439,7 +1442,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2b638b6e4eea..856e75770304 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -49,7 +49,7 @@ struct btrfs_device { struct block_device *bdev; - 
/* the mode sent to open_bdev_exclusive */ + /* the mode sent to blkdev_get */ fmode_t mode; char *name; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 23e7513dba9c..123720ba786d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5dd0b3e76fa8..bd63e6927219 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c1f0763a022b..bc56ccf98ffd 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error; struct gfs2_args args; struct gfs2_sbd *sdp; @@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, goto error_bdev; if (s->s_root) - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1342,7 +1342,7 @@ error_super: deactivate_locked_super(s); return ERR_PTR(error); error_bdev: - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); return ERR_PTR(error); } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 5a290f22dcc3..278e3fb40b71 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1120,8 +1120,8 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - log); + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); if (IS_ERR(bdev)) { rc = -PTR_ERR(bdev); goto free; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 734b9025858e..723bc5bca09a 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -325,7 +325,8 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, { struct block_device *bdev; - bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + type); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 756a6798d7c8..0030640e2d72 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1147,14 +1147,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; struct dentry *root_dentry; int err, s_new = false; if 
(!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return ERR_CAST(sd.bdev); @@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - blkdev_put(sd.bdev, mode | FMODE_EXCL); + blkdev_put(sd.bdev, mode); return root_dentry; @@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - blkdev_put(sd.bdev, mode | FMODE_EXCL); + blkdev_put(sd.bdev, mode); return ERR_PTR(err); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index b488136f5ace..e2fce519c0f2 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2585,7 +2585,8 @@ static int journal_init_dev(struct super_block *super, if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) blkdev_mode &= ~FMODE_EXCL; - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2601,8 +2602,7 @@ static int journal_init_dev(struct super_block *super, } journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = open_bdev_exclusive(jdev_name, - blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/super.c b/fs/super.c index 22374bf0ba87..5d9a4497849a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -766,13 +766,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -807,7 +807,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, * holding an active reference. 
*/ up_write(&s->s_umount); - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; @@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); error: return ERR_PTR(error); } @@ -862,6 +862,7 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); blkdev_put(bdev, mode | FMODE_EXCL); } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a1a6e5ceea67..9209cd199c47 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -609,7 +609,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid device [%s], error=%d\n", name, error); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a033e8ebe4c..f48501563917 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2006,8 +2006,6 @@ extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); -extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode, - void *holder); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); @@ -2039,6 +2037,10 @@ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder); +extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, + void *holder); extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); @@ -2083,7 +2085,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name) extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); -extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); extern void blkdev_show(struct seq_file *,off_t); #else diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 513a77f1a0b3..b019609d1b45 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -907,8 +907,8 @@ int swsusp_check(void) { int error; - hib_resume_bdev = open_by_devnum(swsusp_resume_device, - FMODE_READ, NULL); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); -- cgit v1.2.3-71-gd317 From 073ef1f6e508688392580e4f35dcad9aabd1e100 Mon Sep 17 00:00:00 2001 From: Lionel Debroux Date: Tue, 9 Nov 2010 21:48:49 +0100 Subject: hibernation: constify platform_hibernation_ops Patch against mainline. 
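As an illustrative aside (not part of the patch), the const qualifier has to propagate through the whole chain, hibernation_set_ops() included, or a read-only ops table could not be registered without a cast. A placeholder sketch; the table and function names are invented, and a real table must populate the mandatory callbacks checked in hibernate.c below.

    #include <linux/suspend.h>

    /* Placeholder only -- an all-zero table would be rejected at runtime,
     * but it is enough to show the type relationship. */
    static const struct platform_hibernation_ops example_hibernation_ops;

    static void example_register(void)
    {
            /*
             * A table living in read-only data can only be handed over if
             * hibernation_set_ops() accepts a const pointer, which is what
             * this patch changes.
             */
            hibernation_set_ops(&example_hibernation_ops);
    }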
Changes since v1: added one hunk; no longer adding "const" qualifier to pointers in platform_hibernation_ops after seeing b4144e4f6e3b448d322095ca08af393682a69e33. Signed-off-by: Jiri Kosina --- drivers/acpi/sleep.c | 4 ++-- include/linux/suspend.h | 4 ++-- kernel/power/hibernate.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 721d93b3ceee..5149c9bf7015 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -498,7 +498,7 @@ static void acpi_pm_thaw(void) acpi_enable_all_runtime_gpes(); } -static struct platform_hibernation_ops acpi_hibernation_ops = { +static const struct platform_hibernation_ops acpi_hibernation_ops = { .begin = acpi_hibernation_begin, .end = acpi_pm_end, .pre_snapshot = acpi_pm_prepare, @@ -541,7 +541,7 @@ static int acpi_hibernation_begin_old(void) * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has * been requested. */ -static struct platform_hibernation_ops acpi_hibernation_ops_old = { +static const struct platform_hibernation_ops acpi_hibernation_ops_old = { .begin = acpi_hibernation_begin_old, .end = acpi_pm_end, .pre_snapshot = acpi_pm_pre_suspend, diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 26697514c5ec..40ead943fd6a 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -245,7 +245,7 @@ extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); extern unsigned long get_safe_page(gfp_t gfp_mask); -extern void hibernation_set_ops(struct platform_hibernation_ops *ops); +extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); #else /* CONFIG_HIBERNATION */ @@ -253,7 +253,7 @@ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} -static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} +static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 657272e91d0a..491b81a27111 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -51,14 +51,14 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -static struct platform_hibernation_ops *hibernation_ops; +static const struct platform_hibernation_ops *hibernation_ops; /** * hibernation_set_ops - set the global hibernate operations * @ops: the hibernation operations to use in subsequent hibernation transitions */ -void hibernation_set_ops(struct platform_hibernation_ops *ops) +void hibernation_set_ops(const struct platform_hibernation_ops *ops) { if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore -- cgit v1.2.3-71-gd317 From 2f55ac072f5344519348c0c94b3d2f4cca46847b Mon Sep 17 00:00:00 2001 From: Lionel Debroux Date: Tue, 16 Nov 2010 14:14:02 +0100 Subject: suspend: constify platform_suspend_ops While at it, fix two checkpatch errors. 
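The driver-facing pattern on the suspend side is the same; as a hedged sketch only (names invented, callback signatures matching the drivers converted below), a minimal platform registration now looks like this:

    #include <linux/init.h>
    #include <linux/suspend.h>

    /* Hypothetical platform: nothing special to do on suspend entry. */
    static int example_suspend_enter(suspend_state_t state)
    {
            return 0;
    }

    static const struct platform_suspend_ops example_suspend_ops = {
            .valid  = suspend_valid_only_mem,
            .enter  = example_suspend_enter,
    };

    static int __init example_pm_init(void)
    {
            /* suspend_set_ops() now takes a const pointer, so the table
             * above can live in read-only data. */
            suspend_set_ops(&example_suspend_ops);
            return 0;
    }
    late_initcall(example_pm_init);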
Several non-const struct instances constified by this patch were added after the introduction of platform_suspend_ops in checkpatch.pl's list of "should be const" structs (79404849e90a41ea2109bd0e2f7c7164b0c4ce73). Patch against mainline. Inspired by hunks of the grsecurity patch, updated for newer kernels. Signed-off-by: Lionel Debroux Acked-by: Ingo Molnar Signed-off-by: Jiri Kosina --- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-davinci/pm.c | 2 +- arch/arm/mach-imx/pm-imx27.c | 2 +- arch/arm/mach-lpc32xx/pm.c | 2 +- arch/arm/mach-omap1/pm.c | 2 +- arch/arm/mach-omap2/pm24xx.c | 2 +- arch/arm/mach-omap2/pm34xx.c | 2 +- arch/arm/mach-omap2/pm44xx.c | 2 +- arch/arm/mach-pnx4008/pm.c | 2 +- arch/arm/mach-pxa/pm.c | 2 +- arch/arm/mach-pxa/sharpsl_pm.c | 2 +- arch/arm/mach-sa1100/pm.c | 2 +- arch/arm/plat-samsung/pm.c | 2 +- arch/avr32/mach-at32ap/pm.c | 2 +- arch/blackfin/mach-common/pm.c | 2 +- arch/mips/alchemy/devboards/pm.c | 2 +- arch/mips/jz4740/pm.c | 2 +- arch/mips/loongson/common/pm.c | 2 +- arch/powerpc/platforms/52xx/lite5200_pm.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_pm.c | 2 +- arch/powerpc/platforms/83xx/suspend.c | 2 +- arch/powerpc/platforms/pseries/suspend.c | 2 +- arch/powerpc/sysdev/fsl_pmc.c | 2 +- arch/sh/boards/mach-hp6xx/pm.c | 2 +- arch/sh/kernel/cpu/shmobile/pm.c | 2 +- drivers/acpi/sleep.c | 4 ++-- drivers/macintosh/via-pmu.c | 2 +- include/linux/suspend.h | 4 ++-- kernel/power/suspend.c | 4 ++-- 29 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index dafbacc25eb1..ea53f4d9b283 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -301,7 +301,7 @@ static void at91_pm_end(void) } -static struct platform_suspend_ops at91_pm_ops ={ +static const struct platform_suspend_ops at91_pm_ops = { .valid = at91_pm_valid_state, .begin = at91_pm_begin, .enter = at91_pm_enter, diff --git a/arch/arm/mach-davinci/pm.c b/arch/arm/mach-davinci/pm.c index fab953b43dea..1bd73a04be20 100644 --- a/arch/arm/mach-davinci/pm.c +++ b/arch/arm/mach-davinci/pm.c @@ -110,7 +110,7 @@ static int davinci_pm_enter(suspend_state_t state) return ret; } -static struct platform_suspend_ops davinci_pm_ops = { +static const struct platform_suspend_ops davinci_pm_ops = { .enter = davinci_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/arm/mach-imx/pm-imx27.c b/arch/arm/mach-imx/pm-imx27.c index afc17ce0bb54..2153ca71bb82 100644 --- a/arch/arm/mach-imx/pm-imx27.c +++ b/arch/arm/mach-imx/pm-imx27.c @@ -32,7 +32,7 @@ static int mx27_suspend_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops mx27_suspend_ops = { +static const struct platform_suspend_ops mx27_suspend_ops = { .enter = mx27_suspend_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/arm/mach-lpc32xx/pm.c b/arch/arm/mach-lpc32xx/pm.c index a6e2aed9a49f..e76d41bb7056 100644 --- a/arch/arm/mach-lpc32xx/pm.c +++ b/arch/arm/mach-lpc32xx/pm.c @@ -123,7 +123,7 @@ static int lpc32xx_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops lpc32xx_pm_ops = { +static const struct platform_suspend_ops lpc32xx_pm_ops = { .valid = suspend_valid_only_mem, .enter = lpc32xx_pm_enter, }; diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index b1d3f9fade23..4cf3b67dd998 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -647,7 +647,7 @@ static struct irqaction omap_wakeup_irq = { -static struct platform_suspend_ops omap_pm_ops ={ +static 
const struct platform_suspend_ops omap_pm_ops = { .prepare = omap_pm_prepare, .enter = omap_pm_enter, .finish = omap_pm_finish, diff --git a/arch/arm/mach-omap2/pm24xx.c b/arch/arm/mach-omap2/pm24xx.c index a40457d81927..aa9764eeb941 100644 --- a/arch/arm/mach-omap2/pm24xx.c +++ b/arch/arm/mach-omap2/pm24xx.c @@ -326,7 +326,7 @@ static void omap2_pm_finish(void) enable_hlt(); } -static struct platform_suspend_ops omap_pm_ops = { +static const struct platform_suspend_ops omap_pm_ops = { .prepare = omap2_pm_prepare, .enter = omap2_pm_enter, .finish = omap2_pm_finish, diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c index 75c0cd13ad8e..4000c3c8bbd5 100644 --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -594,7 +594,7 @@ static void omap3_pm_end(void) return; } -static struct platform_suspend_ops omap_pm_ops = { +static const struct platform_suspend_ops omap_pm_ops = { .begin = omap3_pm_begin, .end = omap3_pm_end, .prepare = omap3_pm_prepare, diff --git a/arch/arm/mach-omap2/pm44xx.c b/arch/arm/mach-omap2/pm44xx.c index 54544b4fc76b..dc8b1ef9f84c 100644 --- a/arch/arm/mach-omap2/pm44xx.c +++ b/arch/arm/mach-omap2/pm44xx.c @@ -75,7 +75,7 @@ static void omap4_pm_end(void) return; } -static struct platform_suspend_ops omap_pm_ops = { +static const struct platform_suspend_ops omap_pm_ops = { .begin = omap4_pm_begin, .end = omap4_pm_end, .prepare = omap4_pm_prepare, diff --git a/arch/arm/mach-pnx4008/pm.c b/arch/arm/mach-pnx4008/pm.c index ee3c29c57ae3..f3e60a049f98 100644 --- a/arch/arm/mach-pnx4008/pm.c +++ b/arch/arm/mach-pnx4008/pm.c @@ -119,7 +119,7 @@ static int pnx4008_pm_valid(suspend_state_t state) (state == PM_SUSPEND_MEM); } -static struct platform_suspend_ops pnx4008_pm_ops = { +static const struct platform_suspend_ops pnx4008_pm_ops = { .enter = pnx4008_pm_enter, .valid = pnx4008_pm_valid, }; diff --git a/arch/arm/mach-pxa/pm.c b/arch/arm/mach-pxa/pm.c index 166c15f62916..978e1b289544 100644 --- a/arch/arm/mach-pxa/pm.c +++ b/arch/arm/mach-pxa/pm.c @@ -96,7 +96,7 @@ void pxa_pm_finish(void) pxa_cpu_pm_fns->finish(); } -static struct platform_suspend_ops pxa_pm_ops = { +static const struct platform_suspend_ops pxa_pm_ops = { .valid = pxa_pm_valid, .enter = pxa_pm_enter, .prepare = pxa_pm_prepare, diff --git a/arch/arm/mach-pxa/sharpsl_pm.c b/arch/arm/mach-pxa/sharpsl_pm.c index 8fed027b12dc..02874e96ec72 100644 --- a/arch/arm/mach-pxa/sharpsl_pm.c +++ b/arch/arm/mach-pxa/sharpsl_pm.c @@ -868,7 +868,7 @@ static void sharpsl_apm_get_power_status(struct apm_power_info *info) } #ifdef CONFIG_PM -static struct platform_suspend_ops sharpsl_pm_ops = { +static const struct platform_suspend_ops sharpsl_pm_ops = { .prepare = pxa_pm_prepare, .finish = pxa_pm_finish, .enter = corgi_pxa_pm_enter, diff --git a/arch/arm/mach-sa1100/pm.c b/arch/arm/mach-sa1100/pm.c index c83fdc80edfd..ab9fc4470d36 100644 --- a/arch/arm/mach-sa1100/pm.c +++ b/arch/arm/mach-sa1100/pm.c @@ -120,7 +120,7 @@ unsigned long sleep_phys_sp(void *sp) return virt_to_phys(sp); } -static struct platform_suspend_ops sa11x0_pm_ops = { +static const struct platform_suspend_ops sa11x0_pm_ops = { .enter = sa11x0_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/arm/plat-samsung/pm.c b/arch/arm/plat-samsung/pm.c index 27cfca597699..5bf3f2f09e74 100644 --- a/arch/arm/plat-samsung/pm.c +++ b/arch/arm/plat-samsung/pm.c @@ -355,7 +355,7 @@ static void s3c_pm_finish(void) s3c_pm_check_cleanup(); } -static struct platform_suspend_ops s3c_pm_ops = { +static const struct 
platform_suspend_ops s3c_pm_ops = { .enter = s3c_pm_enter, .prepare = s3c_pm_prepare, .finish = s3c_pm_finish, diff --git a/arch/avr32/mach-at32ap/pm.c b/arch/avr32/mach-at32ap/pm.c index f021edfeaab0..32d680eb6f48 100644 --- a/arch/avr32/mach-at32ap/pm.c +++ b/arch/avr32/mach-at32ap/pm.c @@ -176,7 +176,7 @@ out: return 0; } -static struct platform_suspend_ops avr32_pm_ops = { +static const struct platform_suspend_ops avr32_pm_ops = { .valid = avr32_pm_valid_state, .enter = avr32_pm_enter, }; diff --git a/arch/blackfin/mach-common/pm.c b/arch/blackfin/mach-common/pm.c index 80884b136a0c..745af2d96553 100644 --- a/arch/blackfin/mach-common/pm.c +++ b/arch/blackfin/mach-common/pm.c @@ -233,7 +233,7 @@ static int bfin_pm_enter(suspend_state_t state) return 0; } -struct platform_suspend_ops bfin_pm_ops = { +static const struct platform_suspend_ops bfin_pm_ops = { .enter = bfin_pm_enter, .valid = bfin_pm_valid, }; diff --git a/arch/mips/alchemy/devboards/pm.c b/arch/mips/alchemy/devboards/pm.c index 4bbd3133e451..acaf91b5e461 100644 --- a/arch/mips/alchemy/devboards/pm.c +++ b/arch/mips/alchemy/devboards/pm.c @@ -110,7 +110,7 @@ static void db1x_pm_end(void) } -static struct platform_suspend_ops db1x_pm_ops = { +static const struct platform_suspend_ops db1x_pm_ops = { .valid = suspend_valid_only_mem, .begin = db1x_pm_begin, .enter = db1x_pm_enter, diff --git a/arch/mips/jz4740/pm.c b/arch/mips/jz4740/pm.c index a9994585424d..902d5b50124c 100644 --- a/arch/mips/jz4740/pm.c +++ b/arch/mips/jz4740/pm.c @@ -42,7 +42,7 @@ static int jz4740_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops jz4740_pm_ops = { +static const struct platform_suspend_ops jz4740_pm_ops = { .valid = suspend_valid_only_mem, .enter = jz4740_pm_enter, }; diff --git a/arch/mips/loongson/common/pm.c b/arch/mips/loongson/common/pm.c index 6c1fd9001712..f55e07aee071 100644 --- a/arch/mips/loongson/common/pm.c +++ b/arch/mips/loongson/common/pm.c @@ -147,7 +147,7 @@ static int loongson_pm_valid_state(suspend_state_t state) } } -static struct platform_suspend_ops loongson_pm_ops = { +static const struct platform_suspend_ops loongson_pm_ops = { .valid = loongson_pm_valid_state, .enter = loongson_pm_enter, }; diff --git a/arch/powerpc/platforms/52xx/lite5200_pm.c b/arch/powerpc/platforms/52xx/lite5200_pm.c index 80234e5921f5..eda0fc2a3914 100644 --- a/arch/powerpc/platforms/52xx/lite5200_pm.c +++ b/arch/powerpc/platforms/52xx/lite5200_pm.c @@ -232,7 +232,7 @@ static void lite5200_pm_end(void) lite5200_pm_target_state = PM_SUSPEND_ON; } -static struct platform_suspend_ops lite5200_pm_ops = { +static const struct platform_suspend_ops lite5200_pm_ops = { .valid = lite5200_pm_valid, .begin = lite5200_pm_begin, .prepare = lite5200_pm_prepare, diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pm.c b/arch/powerpc/platforms/52xx/mpc52xx_pm.c index 568cef636275..8310e8b5b57f 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pm.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pm.c @@ -186,7 +186,7 @@ void mpc52xx_pm_finish(void) iounmap(mbar); } -static struct platform_suspend_ops mpc52xx_pm_ops = { +static const struct platform_suspend_ops mpc52xx_pm_ops = { .valid = mpc52xx_pm_valid, .prepare = mpc52xx_pm_prepare, .enter = mpc52xx_pm_enter, diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index 75ae77f1af6a..fd4f2f2f19e6 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -311,7 +311,7 @@ static int mpc83xx_is_pci_agent(void) return 
ret; } -static struct platform_suspend_ops mpc83xx_suspend_ops = { +static const struct platform_suspend_ops mpc83xx_suspend_ops = { .valid = mpc83xx_suspend_valid, .begin = mpc83xx_suspend_begin, .enter = mpc83xx_suspend_enter, diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index ed72098bb4e3..a8ca289ff267 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -153,7 +153,7 @@ static struct sysdev_class suspend_sysdev_class = { .name = "power", }; -static struct platform_suspend_ops pseries_suspend_ops = { +static const struct platform_suspend_ops pseries_suspend_ops = { .valid = suspend_valid_only_mem, .begin = pseries_suspend_begin, .prepare_late = pseries_prepare_late, diff --git a/arch/powerpc/sysdev/fsl_pmc.c b/arch/powerpc/sysdev/fsl_pmc.c index 44de8559c975..e9381bfefb21 100644 --- a/arch/powerpc/sysdev/fsl_pmc.c +++ b/arch/powerpc/sysdev/fsl_pmc.c @@ -53,7 +53,7 @@ static int pmc_suspend_valid(suspend_state_t state) return 1; } -static struct platform_suspend_ops pmc_suspend_ops = { +static const struct platform_suspend_ops pmc_suspend_ops = { .valid = pmc_suspend_valid, .enter = pmc_suspend_enter, }; diff --git a/arch/sh/boards/mach-hp6xx/pm.c b/arch/sh/boards/mach-hp6xx/pm.c index 4499a3749d40..adc9b4bba828 100644 --- a/arch/sh/boards/mach-hp6xx/pm.c +++ b/arch/sh/boards/mach-hp6xx/pm.c @@ -143,7 +143,7 @@ static int hp6x0_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops hp6x0_pm_ops = { +static const struct platform_suspend_ops hp6x0_pm_ops = { .enter = hp6x0_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/sh/kernel/cpu/shmobile/pm.c b/arch/sh/kernel/cpu/shmobile/pm.c index e55968712706..a6f95ae4aae7 100644 --- a/arch/sh/kernel/cpu/shmobile/pm.c +++ b/arch/sh/kernel/cpu/shmobile/pm.c @@ -141,7 +141,7 @@ static int sh_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops sh_pm_ops = { +static const struct platform_suspend_ops sh_pm_ops = { .enter = sh_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 5149c9bf7015..ba1caf724a16 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -319,7 +319,7 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state) } } -static struct platform_suspend_ops acpi_suspend_ops = { +static const struct platform_suspend_ops acpi_suspend_ops = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin, .prepare_late = acpi_pm_prepare, @@ -347,7 +347,7 @@ static int acpi_suspend_begin_old(suspend_state_t pm_state) * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has * been requested. 
*/ -static struct platform_suspend_ops acpi_suspend_ops_old = { +static const struct platform_suspend_ops acpi_suspend_ops_old = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin_old, .prepare_late = acpi_pm_pre_suspend, diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index cd29c8248386..8b021eb0d48c 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2257,7 +2257,7 @@ static int pmu_sleep_valid(suspend_state_t state) && (pmac_call_feature(PMAC_FTR_SLEEP_STATE, NULL, 0, -1) >= 0); } -static struct platform_suspend_ops pmu_pm_ops = { +static const struct platform_suspend_ops pmu_pm_ops = { .enter = powerbook_sleep, .valid = pmu_sleep_valid, }; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 40ead943fd6a..f45f3ccfdfa9 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -122,7 +122,7 @@ struct platform_suspend_ops { * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. */ -extern void suspend_set_ops(struct platform_suspend_ops *ops); +extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); /** @@ -147,7 +147,7 @@ extern int pm_suspend(suspend_state_t state); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL -static inline void suspend_set_ops(struct platform_suspend_ops *ops) {} +static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee473..80051bdde6f1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,13 +30,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_MEM] = "mem", }; -static struct platform_suspend_ops *suspend_ops; +static const struct platform_suspend_ops *suspend_ops; /** * suspend_set_ops - Set the global suspend method table. * @ops: Pointer to ops structure. */ -void suspend_set_ops(struct platform_suspend_ops *ops) +void suspend_set_ops(const struct platform_suspend_ops *ops) { mutex_lock(&pm_mutex); suspend_ops = ops; -- cgit v1.2.3-71-gd317 From 6b6e39a6a8da7234c538d14c43d3583da8875f9c Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 15 Nov 2010 23:13:18 +0100 Subject: driver-core: merge private parts of class and bus As classes and busses are pretty much the same thing, and we want to merge them together into a 'subsystem' in the future, let us share the same private data parts to make that merge easier. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 62 ++++++++++++++++++++------------------------------ drivers/base/bus.c | 13 +++++------ drivers/base/class.c | 38 +++++++++++++++---------------- drivers/base/core.c | 22 +++++++++--------- include/linux/device.h | 7 +++--- 5 files changed, 64 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 2ca7f5b7b824..19f49e41ce5d 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -1,31 +1,46 @@ /** - * struct bus_type_private - structure to hold the private to the driver core portions of the bus_type structure. + * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. * - * @subsys - the struct kset that defines this bus. 
This is the main kobject - * @drivers_kset - the list of drivers associated with this bus - * @devices_kset - the list of devices associated with this bus + * @subsys - the struct kset that defines this subsystem + * @devices_kset - the list of devices associated + * + * @drivers_kset - the list of drivers associated * @klist_devices - the klist to iterate over the @devices_kset * @klist_drivers - the klist to iterate over the @drivers_kset * @bus_notifier - the bus notifier list for anything that cares about things - * on this bus. + * on this bus. * @bus - pointer back to the struct bus_type that this structure is associated - * with. + * with. + * + * @class_interfaces - list of class_interfaces associated + * @glue_dirs - "glue" directory to put in-between the parent device to + * avoid namespace conflicts + * @class_mutex - mutex to protect the children, devices, and interfaces lists. + * @class - pointer back to the struct class that this structure is associated + * with. * * This structure is the one that is the actual kobject allowing struct - * bus_type to be statically allocated safely. Nothing outside of the driver - * core should ever touch these fields. + * bus_type/class to be statically allocated safely. Nothing outside of the + * driver core should ever touch these fields. */ -struct bus_type_private { +struct subsys_private { struct kset subsys; - struct kset *drivers_kset; struct kset *devices_kset; + + struct kset *drivers_kset; struct klist klist_devices; struct klist klist_drivers; struct blocking_notifier_head bus_notifier; unsigned int drivers_autoprobe:1; struct bus_type *bus; + + struct list_head class_interfaces; + struct kset glue_dirs; + struct mutex class_mutex; + struct class *class; }; +#define to_subsys_private(obj) container_of(obj, struct subsys_private, subsys.kobj) struct driver_private { struct kobject kobj; @@ -36,33 +51,6 @@ struct driver_private { }; #define to_driver(obj) container_of(obj, struct driver_private, kobj) - -/** - * struct class_private - structure to hold the private to the driver core portions of the class structure. - * - * @class_subsys - the struct kset that defines this class. This is the main kobject - * @class_devices - list of devices associated with this class - * @class_interfaces - list of class_interfaces associated with this class - * @class_dirs - "glue" directory for virtual devices associated with this class - * @class_mutex - mutex to protect the children, devices, and interfaces lists. - * @class - pointer back to the struct class that this structure is associated - * with. - * - * This structure is the one that is the actual kobject allowing struct - * class to be statically allocated safely. Nothing outside of the driver - * core should ever touch these fields. - */ -struct class_private { - struct kset class_subsys; - struct klist class_devices; - struct list_head class_interfaces; - struct kset class_dirs; - struct mutex class_mutex; - struct class *class; -}; -#define to_class(obj) \ - container_of(obj, struct class_private, class_subsys.kobj) - /** * struct device_private - structure to hold the private to the driver core portions of the device structure. 
* diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 33c270a64db7..e243bd49764b 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -20,7 +20,6 @@ #include "power/power.h" #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) -#define to_bus(obj) container_of(obj, struct bus_type_private, subsys.kobj) /* * sysfs bindings for drivers @@ -96,11 +95,11 @@ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct bus_attribute *bus_attr = to_bus_attr(attr); - struct bus_type_private *bus_priv = to_bus(kobj); + struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->show) - ret = bus_attr->show(bus_priv->bus, buf); + ret = bus_attr->show(subsys_priv->bus, buf); return ret; } @@ -108,11 +107,11 @@ static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); - struct bus_type_private *bus_priv = to_bus(kobj); + struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->store) - ret = bus_attr->store(bus_priv->bus, buf, count); + ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } @@ -858,9 +857,9 @@ static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store); int bus_register(struct bus_type *bus) { int retval; - struct bus_type_private *priv; + struct subsys_private *priv; - priv = kzalloc(sizeof(struct bus_type_private), GFP_KERNEL); + priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; diff --git a/drivers/base/class.c b/drivers/base/class.c index 7975a52bdf5b..4f1df2e8fd74 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -27,7 +27,7 @@ static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *class_attr = to_class_attr(attr); - struct class_private *cp = to_class(kobj); + struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->show) @@ -39,7 +39,7 @@ static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *class_attr = to_class_attr(attr); - struct class_private *cp = to_class(kobj); + struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->store) @@ -49,7 +49,7 @@ static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, static void class_release(struct kobject *kobj) { - struct class_private *cp = to_class(kobj); + struct subsys_private *cp = to_subsys_private(kobj); struct class *class = cp->class; pr_debug("class '%s': release.\n", class->name); @@ -65,7 +65,7 @@ static void class_release(struct kobject *kobj) static const struct kobj_ns_type_operations *class_child_ns_type(struct kobject *kobj) { - struct class_private *cp = to_class(kobj); + struct subsys_private *cp = to_subsys_private(kobj); struct class *class = cp->class; return class->ns_type; @@ -82,7 +82,7 @@ static struct kobj_type class_ktype = { .child_ns_type = class_child_ns_type, }; -/* Hotplug events for classes go to the class class_subsys */ +/* Hotplug events for classes go to the class subsys */ static struct kset *class_kset; @@ -90,7 +90,7 @@ int class_create_file(struct class *cls, const struct class_attribute *attr) { int error; if (cls) - error = sysfs_create_file(&cls->p->class_subsys.kobj, + error = sysfs_create_file(&cls->p->subsys.kobj, &attr->attr); else error = -EINVAL; @@ -100,20 +100,20 @@ 
int class_create_file(struct class *cls, const struct class_attribute *attr) void class_remove_file(struct class *cls, const struct class_attribute *attr) { if (cls) - sysfs_remove_file(&cls->p->class_subsys.kobj, &attr->attr); + sysfs_remove_file(&cls->p->subsys.kobj, &attr->attr); } static struct class *class_get(struct class *cls) { if (cls) - kset_get(&cls->p->class_subsys); + kset_get(&cls->p->subsys); return cls; } static void class_put(struct class *cls) { if (cls) - kset_put(&cls->p->class_subsys); + kset_put(&cls->p->subsys); } static int add_class_attrs(struct class *cls) @@ -162,7 +162,7 @@ static void klist_class_dev_put(struct klist_node *n) int __class_register(struct class *cls, struct lock_class_key *key) { - struct class_private *cp; + struct subsys_private *cp; int error; pr_debug("device class '%s': registering\n", cls->name); @@ -170,11 +170,11 @@ int __class_register(struct class *cls, struct lock_class_key *key) cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) return -ENOMEM; - klist_init(&cp->class_devices, klist_class_dev_get, klist_class_dev_put); + klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); INIT_LIST_HEAD(&cp->class_interfaces); - kset_init(&cp->class_dirs); + kset_init(&cp->glue_dirs); __mutex_init(&cp->class_mutex, "struct class mutex", key); - error = kobject_set_name(&cp->class_subsys.kobj, "%s", cls->name); + error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); if (error) { kfree(cp); return error; @@ -187,15 +187,15 @@ int __class_register(struct class *cls, struct lock_class_key *key) #if defined(CONFIG_BLOCK) /* let the block class directory show up in the root of sysfs */ if (!sysfs_deprecated || cls != &block_class) - cp->class_subsys.kobj.kset = class_kset; + cp->subsys.kobj.kset = class_kset; #else - cp->class_subsys.kobj.kset = class_kset; + cp->subsys.kobj.kset = class_kset; #endif - cp->class_subsys.kobj.ktype = &class_ktype; + cp->subsys.kobj.ktype = &class_ktype; cp->class = cls; cls->p = cp; - error = kset_register(&cp->class_subsys); + error = kset_register(&cp->subsys); if (error) { kfree(cp); return error; @@ -210,7 +210,7 @@ void class_unregister(struct class *cls) { pr_debug("device class '%s': unregistering\n", cls->name); remove_class_attrs(cls); - kset_unregister(&cls->p->class_subsys); + kset_unregister(&cls->p->subsys); } static void class_create_release(struct class *cls) @@ -295,7 +295,7 @@ void class_dev_iter_init(struct class_dev_iter *iter, struct class *class, if (start) start_knode = &start->knode_class; - klist_iter_init_node(&class->p->class_devices, &iter->ki, start_knode); + klist_iter_init_node(&class->p->klist_devices, &iter->ki, start_knode); iter->type = type; } EXPORT_SYMBOL_GPL(class_dev_iter_init); diff --git a/drivers/base/core.c b/drivers/base/core.c index 6ed645411c40..46ff6c251932 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -610,7 +610,7 @@ class_dir_create_and_add(struct class *class, struct kobject *parent_kobj) dir->class = class; kobject_init(&dir->kobj, &class_dir_ktype); - dir->kobj.kset = &class->p->class_dirs; + dir->kobj.kset = &class->p->glue_dirs; retval = kobject_add(&dir->kobj, parent_kobj, "%s", class->name); if (retval < 0) { @@ -635,7 +635,7 @@ static struct kobject *get_device_parent(struct device *dev, if (sysfs_deprecated && dev->class == &block_class) { if (parent && parent->class == &block_class) return &parent->kobj; - return &block_class.p->class_subsys.kobj; + return &block_class.p->subsys.kobj; } #endif @@ -654,13 +654,13 @@ static 
struct kobject *get_device_parent(struct device *dev, mutex_lock(&gdp_mutex); /* find our class-directory at the parent and reference it */ - spin_lock(&dev->class->p->class_dirs.list_lock); - list_for_each_entry(k, &dev->class->p->class_dirs.list, entry) + spin_lock(&dev->class->p->glue_dirs.list_lock); + list_for_each_entry(k, &dev->class->p->glue_dirs.list, entry) if (k->parent == parent_kobj) { kobj = kobject_get(k); break; } - spin_unlock(&dev->class->p->class_dirs.list_lock); + spin_unlock(&dev->class->p->glue_dirs.list_lock); if (kobj) { mutex_unlock(&gdp_mutex); return kobj; @@ -682,7 +682,7 @@ static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir) { /* see if we live in a "glue" directory */ if (!glue_dir || !dev->class || - glue_dir->kset != &dev->class->p->class_dirs) + glue_dir->kset != &dev->class->p->glue_dirs) return; kobject_put(glue_dir); @@ -709,7 +709,7 @@ static int device_add_class_symlinks(struct device *dev) return 0; error = sysfs_create_link(&dev->kobj, - &dev->class->p->class_subsys.kobj, + &dev->class->p->subsys.kobj, "subsystem"); if (error) goto out; @@ -728,7 +728,7 @@ static int device_add_class_symlinks(struct device *dev) #endif /* link in the class directory pointing to the device */ - error = sysfs_create_link(&dev->class->p->class_subsys.kobj, + error = sysfs_create_link(&dev->class->p->subsys.kobj, &dev->kobj, dev_name(dev)); if (error) goto out_device; @@ -756,7 +756,7 @@ static void device_remove_class_symlinks(struct device *dev) if (sysfs_deprecated && dev->class == &block_class) return; #endif - sysfs_delete_link(&dev->class->p->class_subsys.kobj, &dev->kobj, dev_name(dev)); + sysfs_delete_link(&dev->class->p->subsys.kobj, &dev->kobj, dev_name(dev)); } /** @@ -947,7 +947,7 @@ int device_add(struct device *dev) mutex_lock(&dev->class->p->class_mutex); /* tie the class to the device */ klist_add_tail(&dev->knode_class, - &dev->class->p->class_devices); + &dev->class->p->klist_devices); /* notify any interfaces that the device is here */ list_for_each_entry(class_intf, @@ -1535,7 +1535,7 @@ int device_rename(struct device *dev, const char *new_name) } if (dev->class) { - error = sysfs_rename_link(&dev->class->p->class_subsys.kobj, + error = sysfs_rename_link(&dev->class->p->subsys.kobj, &dev->kobj, old_device_name, new_name); if (error) goto out; diff --git a/include/linux/device.h b/include/linux/device.h index dd4895313468..1e2d335ab683 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -30,9 +30,8 @@ struct device_private; struct device_driver; struct driver_private; struct class; -struct class_private; +struct subsys_private; struct bus_type; -struct bus_type_private; struct device_node; struct bus_attribute { @@ -65,7 +64,7 @@ struct bus_type { const struct dev_pm_ops *pm; - struct bus_type_private *p; + struct subsys_private *p; }; extern int __must_check bus_register(struct bus_type *bus); @@ -213,7 +212,7 @@ struct class { const struct dev_pm_ops *pm; - struct class_private *p; + struct subsys_private *p; }; struct class_dev_iter { -- cgit v1.2.3-71-gd317 From 5070437cd99511f69ae561f2ab417142a47a85ec Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 21 Oct 2010 17:55:01 +0200 Subject: power_supply: Add gpio charger driver This patch adds a simple driver for chargers indicating their online status through a GPIO pin. 
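As a minimal sketch of how this would be wired up from board code (not part of this patch; the device instance, battery name, GPIO number and supply type below are illustrative assumptions), the platform data introduced further down might be registered like this:

#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/power_supply.h>
#include <linux/power/gpio-charger.h>

/* Hypothetical battery fed by this charger. */
static char *board_batteries[] = { "main-battery" };

static struct gpio_charger_platform_data board_charger_pdata = {
	.name			= "usb-charger",	/* power_supply device name */
	.type			= POWER_SUPPLY_TYPE_USB,
	.gpio			= 42,			/* hypothetical status GPIO */
	.gpio_active_low	= 0,			/* GPIO high == charger online */
	.supplied_to		= board_batteries,
	.num_supplicants	= ARRAY_SIZE(board_batteries),
};

static struct platform_device board_charger_device = {
	.name	= "gpio-charger",	/* matches the platform driver name */
	.id	= -1,
	.dev	= {
		.platform_data	= &board_charger_pdata,
	},
};

/* Registered from the board's init code, e.g.:
 *	platform_device_register(&board_charger_device);
 */

The driver then reports POWER_SUPPLY_PROP_ONLINE by reading that GPIO and re-evaluates it on every edge via the requested interrupt.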
Signed-off-by: Lars-Peter Clausen Signed-off-by: Anton Vorontsov --- drivers/power/Kconfig | 10 ++ drivers/power/Makefile | 1 + drivers/power/gpio-charger.c | 185 +++++++++++++++++++++++++++++++++++++ include/linux/power/gpio-charger.h | 41 ++++++++ 4 files changed, 237 insertions(+) create mode 100644 drivers/power/gpio-charger.c create mode 100644 include/linux/power/gpio-charger.h (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 60d83d983a36..32165295cb1b 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -185,4 +185,14 @@ config CHARGER_TWL4030 help Say Y here to enable support for TWL4030 Battery Charge Interface. +config CHARGER_GPIO + tristate "GPIO charger" + depends on GPIOLIB + help + Say Y to include support for chargers which report their online status + through a GPIO pin. + + This driver can be build as a module. If so, the module will be + called gpio-charger. + endif # POWER_SUPPLY diff --git a/drivers/power/Makefile b/drivers/power/Makefile index c75772eb157c..545459f9f356 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -32,3 +32,4 @@ obj-$(CONFIG_BATTERY_JZ4740) += jz4740-battery.o obj-$(CONFIG_BATTERY_INTEL_MID) += intel_mid_battery.o obj-$(CONFIG_CHARGER_ISP1704) += isp1704_charger.o obj-$(CONFIG_CHARGER_TWL4030) += twl4030_charger.o +obj-$(CONFIG_CHARGER_GPIO) += gpio-charger.o diff --git a/drivers/power/gpio-charger.c b/drivers/power/gpio-charger.c new file mode 100644 index 000000000000..fccbe99b619c --- /dev/null +++ b/drivers/power/gpio-charger.c @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2010, Lars-Peter Clausen + * Driver for chargers which report their online status through a GPIO pin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct gpio_charger { + const struct gpio_charger_platform_data *pdata; + unsigned int irq; + + struct power_supply charger; +}; + +static irqreturn_t gpio_charger_irq(int irq, void *devid) +{ + struct power_supply *charger = devid; + + power_supply_changed(charger); + + return IRQ_HANDLED; +} + +static inline struct gpio_charger *psy_to_gpio_charger(struct power_supply *psy) +{ + return container_of(psy, struct gpio_charger, charger); +} + +static int gpio_charger_get_property(struct power_supply *psy, + enum power_supply_property psp, union power_supply_propval *val) +{ + struct gpio_charger *gpio_charger = psy_to_gpio_charger(psy); + const struct gpio_charger_platform_data *pdata = gpio_charger->pdata; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + val->intval = gpio_get_value(pdata->gpio); + val->intval ^= pdata->gpio_active_low; + break; + default: + return -EINVAL; + } + + return 0; +} + +static enum power_supply_property gpio_charger_properties[] = { + POWER_SUPPLY_PROP_ONLINE, +}; + +static int __devinit gpio_charger_probe(struct platform_device *pdev) +{ + const struct gpio_charger_platform_data *pdata = pdev->dev.platform_data; + struct gpio_charger *gpio_charger; + struct power_supply *charger; + int ret; + int irq; + + if (!pdata) { + dev_err(&pdev->dev, "No platform data\n"); + return -EINVAL; + } + + if (!gpio_is_valid(pdata->gpio)) { + dev_err(&pdev->dev, "Invalid gpio pin\n"); + return -EINVAL; + } + + gpio_charger = kzalloc(sizeof(*gpio_charger), GFP_KERNEL); + + charger = &gpio_charger->charger; + + charger->name = pdata->name; + charger->type = pdata->type; + charger->properties = gpio_charger_properties; + charger->num_properties = ARRAY_SIZE(gpio_charger_properties); + charger->get_property = gpio_charger_get_property; + charger->supplied_to = pdata->supplied_to; + charger->num_supplicants = pdata->num_supplicants; + + ret = gpio_request(pdata->gpio, dev_name(&pdev->dev)); + if (ret) { + dev_err(&pdev->dev, "Failed to request gpio pin: %d\n", ret); + goto err_free; + } + ret = gpio_direction_input(pdata->gpio); + if (ret) { + dev_err(&pdev->dev, "Failed to set gpio to input: %d\n", ret); + goto err_gpio_free; + } + + irq = gpio_to_irq(pdata->gpio); + if (irq > 0) { + ret = request_irq(irq, gpio_charger_irq, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + dev_name(&pdev->dev), charger); + if (ret) + dev_warn(&pdev->dev, "Failed to request irq: %d\n", ret); + else + gpio_charger->irq = irq; + } + + gpio_charger->pdata = pdata; + + ret = power_supply_register(&pdev->dev, charger); + if (ret < 0) { + dev_err(&pdev->dev, "Failed to register power supply: %d\n", ret); + goto err_irq_free; + } + + platform_set_drvdata(pdev, gpio_charger); + + return 0; + +err_irq_free: + if (gpio_charger->irq) + free_irq(gpio_charger->irq, charger); +err_gpio_free: + gpio_free(pdata->gpio); +err_free: + kfree(gpio_charger); + return ret; +} + +static int __devexit gpio_charger_remove(struct platform_device *pdev) +{ + struct gpio_charger *gpio_charger = platform_get_drvdata(pdev); + + power_supply_unregister(&gpio_charger->charger); + + if (gpio_charger->irq) + free_irq(gpio_charger->irq, &gpio_charger->charger); + gpio_free(gpio_charger->pdata->gpio); + + platform_set_drvdata(pdev, NULL); + kfree(gpio_charger); + + return 0; +} + +static struct platform_driver gpio_charger_driver = { + .probe = gpio_charger_probe, + .remove = __devexit_p(gpio_charger_remove), + .driver = 
{ + .name = "gpio-charger", + .owner = THIS_MODULE, + }, +}; + +static int __init gpio_charger_init(void) +{ + return platform_driver_register(&gpio_charger_driver); +} +module_init(gpio_charger_init); + +static void __exit gpio_charger_exit(void) +{ + platform_driver_unregister(&gpio_charger_driver); +} +module_exit(gpio_charger_exit); + +MODULE_AUTHOR("Lars-Peter Clausen "); +MODULE_DESCRIPTION("Driver for chargers which report their online status through a GPIO"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:gpio-charger"); diff --git a/include/linux/power/gpio-charger.h b/include/linux/power/gpio-charger.h new file mode 100644 index 000000000000..de1dfe09a03d --- /dev/null +++ b/include/linux/power/gpio-charger.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2010, Lars-Peter Clausen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LINUX_POWER_GPIO_CHARGER_H__ +#define __LINUX_POWER_GPIO_CHARGER_H__ + +#include +#include + +/** + * struct gpio_charger_platform_data - platform_data for gpio_charger devices + * @name: Name for the chargers power_supply device + * @type: Type of the charger + * @gpio: GPIO which is used to indicate the chargers status + * @gpio_active_low: Should be set to 1 if the GPIO is active low otherwise 0 + * @supplied_to: Array of battery names to which this chargers supplies power + * @num_supplicants: Number of entries in the supplied_to array + */ +struct gpio_charger_platform_data { + const char *name; + enum power_supply_type type; + + int gpio; + int gpio_active_low; + + char **supplied_to; + size_t num_supplicants; +}; + +#endif -- cgit v1.2.3-71-gd317 From c2f9bff5ace07fbea03a53c6c3253f6c3a81e9f9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 19 Oct 2010 21:04:42 +0800 Subject: net - Add AF_ALG macros This patch adds the socket family/level macros for the yet-to-be-born AF_ALG family. The AF_ALG family provides the user-space interface for the kernel crypto API. Signed-off-by: Herbert Xu Acked-by: David S. Miller --- include/linux/socket.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 5146b50202ce..ebc081b18da6 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -193,7 +193,8 @@ struct ucred { #define AF_PHONET 35 /* Phonet sockets */ #define AF_IEEE802154 36 /* IEEE802154 sockets */ #define AF_CAIF 37 /* CAIF sockets */ -#define AF_MAX 38 /* For now.. */ +#define AF_ALG 38 /* Algorithm sockets */ +#define AF_MAX 39 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -234,6 +235,7 @@ struct ucred { #define PF_PHONET AF_PHONET #define PF_IEEE802154 AF_IEEE802154 #define PF_CAIF AF_CAIF +#define PF_ALG AF_ALG #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. 
*/ @@ -307,6 +309,7 @@ struct ucred { #define SOL_RDS 276 #define SOL_IUCV 277 #define SOL_CAIF 278 +#define SOL_ALG 279 /* IPX options */ #define IPX_TYPE 1 -- cgit v1.2.3-71-gd317 From 03c8efc1ffeb6b82a22c1af8dd908af349563314 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 19 Oct 2010 21:12:39 +0800 Subject: crypto: af_alg - User-space interface for Crypto API This patch creates the backbone of the user-space interface for the Crypto API, through a new socket family AF_ALG. Each session corresponds to one or more connections obtained from that socket. The number depends on the number of inputs/outputs of that particular type of operation. For most types there will be a s ingle connection/file descriptor that is used for both input and output. AEAD is one of the few that require two inputs. Each algorithm type will provide its own implementation that plugs into af_alg. They're keyed using a string such as "skcipher" or "hash". IOW this patch only contains the boring bits that is required to hold everything together. Thakns to Miloslav Trmac for reviewing this and contributing fixes and improvements. Signed-off-by: Herbert Xu Acked-by: David S. Miller Tested-by: Martin Willi --- crypto/Kconfig | 3 + crypto/Makefile | 1 + crypto/af_alg.c | 482 ++++++++++++++++++++++++++++++++++++++++++++++++ include/crypto/if_alg.h | 92 +++++++++ include/linux/if_alg.h | 40 ++++ 5 files changed, 618 insertions(+) create mode 100644 crypto/af_alg.c create mode 100644 include/crypto/if_alg.h create mode 100644 include/linux/if_alg.h (limited to 'include/linux') diff --git a/crypto/Kconfig b/crypto/Kconfig index e4bac29a32e7..357e3caf4cbe 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -841,6 +841,9 @@ config CRYPTO_ANSI_CPRNG ANSI X9.31 A.2.4. Note that this option must be enabled if CRYPTO_FIPS is selected +config CRYPTO_USER_API + tristate + source "drivers/crypto/Kconfig" endif # if CRYPTO diff --git a/crypto/Makefile b/crypto/Makefile index 423b7de61f93..0b1319734bc0 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_CRYPTO_RNG2) += krng.o obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o +obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o # # generic algorithms and the async_tx api diff --git a/crypto/af_alg.c b/crypto/af_alg.c new file mode 100644 index 000000000000..cabed0e9c9d8 --- /dev/null +++ b/crypto/af_alg.c @@ -0,0 +1,482 @@ +/* + * af_alg: User-space algorithm interface + * + * This file provides the user-space API for algorithms. + * + * Copyright (c) 2010 Herbert Xu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct alg_type_list { + const struct af_alg_type *type; + struct list_head list; +}; + +static atomic_t alg_memory_allocated; + +static struct proto alg_proto = { + .name = "ALG", + .owner = THIS_MODULE, + .memory_allocated = &alg_memory_allocated, + .obj_size = sizeof(struct alg_sock), +}; + +static LIST_HEAD(alg_types); +static DECLARE_RWSEM(alg_types_sem); + +static const struct af_alg_type *alg_get_type(const char *name) +{ + const struct af_alg_type *type = ERR_PTR(-ENOENT); + struct alg_type_list *node; + + down_read(&alg_types_sem); + list_for_each_entry(node, &alg_types, list) { + if (strcmp(node->type->name, name)) + continue; + + if (try_module_get(node->type->owner)) + type = node->type; + break; + } + up_read(&alg_types_sem); + + return type; +} + +int af_alg_register_type(const struct af_alg_type *type) +{ + struct alg_type_list *node; + int err = -EEXIST; + + down_write(&alg_types_sem); + list_for_each_entry(node, &alg_types, list) { + if (!strcmp(node->type->name, type->name)) + goto unlock; + } + + node = kmalloc(sizeof(*node), GFP_KERNEL); + err = -ENOMEM; + if (!node) + goto unlock; + + type->ops->owner = THIS_MODULE; + node->type = type; + list_add(&node->list, &alg_types); + err = 0; + +unlock: + up_write(&alg_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(af_alg_register_type); + +int af_alg_unregister_type(const struct af_alg_type *type) +{ + struct alg_type_list *node; + int err = -ENOENT; + + down_write(&alg_types_sem); + list_for_each_entry(node, &alg_types, list) { + if (strcmp(node->type->name, type->name)) + continue; + + list_del(&node->list); + kfree(node); + err = 0; + break; + } + up_write(&alg_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(af_alg_unregister_type); + +static void alg_do_release(const struct af_alg_type *type, void *private) +{ + if (!type) + return; + + type->release(private); + module_put(type->owner); +} + +int af_alg_release(struct socket *sock) +{ + if (sock->sk) + sock_put(sock->sk); + return 0; +} +EXPORT_SYMBOL_GPL(af_alg_release); + +static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + struct sockaddr_alg *sa = (void *)uaddr; + const struct af_alg_type *type; + void *private; + + if (sock->state == SS_CONNECTED) + return -EINVAL; + + if (addr_len != sizeof(*sa)) + return -EINVAL; + + sa->salg_type[sizeof(sa->salg_type) - 1] = 0; + sa->salg_name[sizeof(sa->salg_name) - 1] = 0; + + type = alg_get_type(sa->salg_type); + if (IS_ERR(type) && PTR_ERR(type) == -ENOENT) { + request_module("algif-%s", sa->salg_type); + type = alg_get_type(sa->salg_type); + } + + if (IS_ERR(type)) + return PTR_ERR(type); + + private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask); + if (IS_ERR(private)) { + module_put(type->owner); + return PTR_ERR(private); + } + + lock_sock(sk); + + swap(ask->type, type); + swap(ask->private, private); + + release_sock(sk); + + alg_do_release(type, private); + + return 0; +} + +static int alg_setkey(struct sock *sk, char __user *ukey, + unsigned int keylen) +{ + struct alg_sock *ask = alg_sk(sk); + const struct af_alg_type *type = ask->type; + u8 *key; + int err; + + key = sock_kmalloc(sk, keylen, GFP_KERNEL); + if (!key) + return -ENOMEM; + + err = -EFAULT; + if (copy_from_user(key, ukey, keylen)) + goto out; + + err = type->setkey(ask->private, key, keylen); + +out: + sock_kfree_s(sk, key, keylen); + + 
return err; +} + +static int alg_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + const struct af_alg_type *type; + int err = -ENOPROTOOPT; + + lock_sock(sk); + type = ask->type; + + if (level != SOL_ALG || !type) + goto unlock; + + switch (optname) { + case ALG_SET_KEY: + if (sock->state == SS_CONNECTED) + goto unlock; + if (!type->setkey) + goto unlock; + + err = alg_setkey(sk, optval, optlen); + } + +unlock: + release_sock(sk); + + return err; +} + +int af_alg_accept(struct sock *sk, struct socket *newsock) +{ + struct alg_sock *ask = alg_sk(sk); + const struct af_alg_type *type; + struct sock *sk2; + int err; + + lock_sock(sk); + type = ask->type; + + err = -EINVAL; + if (!type) + goto unlock; + + sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto); + err = -ENOMEM; + if (!sk2) + goto unlock; + + sock_init_data(newsock, sk2); + + err = type->accept(ask->private, sk2); + if (err) { + sk_free(sk2); + goto unlock; + } + + sk2->sk_family = PF_ALG; + + sock_hold(sk); + alg_sk(sk2)->parent = sk; + alg_sk(sk2)->type = type; + + newsock->ops = type->ops; + newsock->state = SS_CONNECTED; + + err = 0; + +unlock: + release_sock(sk); + + return err; +} +EXPORT_SYMBOL_GPL(af_alg_accept); + +static int alg_accept(struct socket *sock, struct socket *newsock, int flags) +{ + return af_alg_accept(sock->sk, newsock); +} + +static const struct proto_ops alg_proto_ops = { + .family = PF_ALG, + .owner = THIS_MODULE, + + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .poll = sock_no_poll, + + .bind = alg_bind, + .release = af_alg_release, + .setsockopt = alg_setsockopt, + .accept = alg_accept, +}; + +static void alg_sock_destruct(struct sock *sk) +{ + struct alg_sock *ask = alg_sk(sk); + + alg_do_release(ask->type, ask->private); +} + +static int alg_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + int err; + + if (sock->type != SOCK_SEQPACKET) + return -ESOCKTNOSUPPORT; + if (protocol != 0) + return -EPROTONOSUPPORT; + + err = -ENOMEM; + sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto); + if (!sk) + goto out; + + sock->ops = &alg_proto_ops; + sock_init_data(sock, sk); + + sk->sk_family = PF_ALG; + sk->sk_destruct = alg_sock_destruct; + + return 0; +out: + return err; +} + +static const struct net_proto_family alg_family = { + .family = PF_ALG, + .create = alg_create, + .owner = THIS_MODULE, +}; + +int af_alg_make_sg(struct af_alg_sgl *sgl, void __user *addr, int len, + int write) +{ + unsigned long from = (unsigned long)addr; + unsigned long npages; + unsigned off; + int err; + int i; + + err = -EFAULT; + if (!access_ok(write ? 
VERIFY_READ : VERIFY_WRITE, addr, len)) + goto out; + + off = from & ~PAGE_MASK; + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > ALG_MAX_PAGES) + npages = ALG_MAX_PAGES; + + err = get_user_pages_fast(from, npages, write, sgl->pages); + if (err < 0) + goto out; + + npages = err; + err = -EINVAL; + if (WARN_ON(npages == 0)) + goto out; + + err = 0; + + sg_init_table(sgl->sg, npages); + + for (i = 0; i < npages; i++) { + int plen = min_t(int, len, PAGE_SIZE - off); + + sg_set_page(sgl->sg + i, sgl->pages[i], plen, off); + + off = 0; + len -= plen; + err += plen; + } + +out: + return err; +} +EXPORT_SYMBOL_GPL(af_alg_make_sg); + +void af_alg_free_sg(struct af_alg_sgl *sgl) +{ + int i; + + i = 0; + do { + put_page(sgl->pages[i]); + } while (!sg_is_last(sgl->sg + (i++))); +} +EXPORT_SYMBOL_GPL(af_alg_free_sg); + +int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con) +{ + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_ALG) + continue; + + switch(cmsg->cmsg_type) { + case ALG_SET_IV: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*con->iv))) + return -EINVAL; + con->iv = (void *)CMSG_DATA(cmsg); + if (cmsg->cmsg_len < CMSG_LEN(con->iv->ivlen + + sizeof(*con->iv))) + return -EINVAL; + break; + + case ALG_SET_OP: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32))) + return -EINVAL; + con->op = *(u32 *)CMSG_DATA(cmsg); + break; + + default: + return -EINVAL; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(af_alg_cmsg_send); + +int af_alg_wait_for_completion(int err, struct af_alg_completion *completion) +{ + switch (err) { + case -EINPROGRESS: + case -EBUSY: + wait_for_completion(&completion->completion); + INIT_COMPLETION(completion->completion); + err = completion->err; + break; + }; + + return err; +} +EXPORT_SYMBOL_GPL(af_alg_wait_for_completion); + +void af_alg_complete(struct crypto_async_request *req, int err) +{ + struct af_alg_completion *completion = req->data; + + completion->err = err; + complete(&completion->completion); +} +EXPORT_SYMBOL_GPL(af_alg_complete); + +static int __init af_alg_init(void) +{ + int err = proto_register(&alg_proto, 0); + + if (err) + goto out; + + err = sock_register(&alg_family); + if (err != 0) + goto out_unregister_proto; + +out: + return err; + +out_unregister_proto: + proto_unregister(&alg_proto); + goto out; +} + +static void __exit af_alg_exit(void) +{ + sock_unregister(PF_ALG); + proto_unregister(&alg_proto); +} + +module_init(af_alg_init); +module_exit(af_alg_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(AF_ALG); diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h new file mode 100644 index 000000000000..c5813c87de06 --- /dev/null +++ b/include/crypto/if_alg.h @@ -0,0 +1,92 @@ +/* + * if_alg: User-space algorithm interface + * + * Copyright (c) 2010 Herbert Xu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#ifndef _CRYPTO_IF_ALG_H +#define _CRYPTO_IF_ALG_H + +#include +#include +#include +#include +#include + +#define ALG_MAX_PAGES 16 + +struct crypto_async_request; + +struct alg_sock { + /* struct sock must be the first member of struct alg_sock */ + struct sock sk; + + struct sock *parent; + + const struct af_alg_type *type; + void *private; +}; + +struct af_alg_completion { + struct completion completion; + int err; +}; + +struct af_alg_control { + struct af_alg_iv *iv; + int op; +}; + +struct af_alg_type { + void *(*bind)(const char *name, u32 type, u32 mask); + void (*release)(void *private); + int (*setkey)(void *private, const u8 *key, unsigned int keylen); + int (*accept)(void *private, struct sock *sk); + + struct proto_ops *ops; + struct module *owner; + char name[14]; +}; + +struct af_alg_sgl { + struct scatterlist sg[ALG_MAX_PAGES]; + struct page *pages[ALG_MAX_PAGES]; +}; + +int af_alg_register_type(const struct af_alg_type *type); +int af_alg_unregister_type(const struct af_alg_type *type); + +int af_alg_release(struct socket *sock); +int af_alg_accept(struct sock *sk, struct socket *newsock); + +int af_alg_make_sg(struct af_alg_sgl *sgl, void __user *addr, int len, + int write); +void af_alg_free_sg(struct af_alg_sgl *sgl); + +int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con); + +int af_alg_wait_for_completion(int err, struct af_alg_completion *completion); +void af_alg_complete(struct crypto_async_request *req, int err); + +static inline struct alg_sock *alg_sk(struct sock *sk) +{ + return (struct alg_sock *)sk; +} + +static inline void af_alg_release_parent(struct sock *sk) +{ + sock_put(alg_sk(sk)->parent); +} + +static inline void af_alg_init_completion(struct af_alg_completion *completion) +{ + init_completion(&completion->completion); +} + +#endif /* _CRYPTO_IF_ALG_H */ diff --git a/include/linux/if_alg.h b/include/linux/if_alg.h new file mode 100644 index 000000000000..0f9acce5b1ff --- /dev/null +++ b/include/linux/if_alg.h @@ -0,0 +1,40 @@ +/* + * if_alg: User-space algorithm interface + * + * Copyright (c) 2010 Herbert Xu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#ifndef _LINUX_IF_ALG_H +#define _LINUX_IF_ALG_H + +#include + +struct sockaddr_alg { + __u16 salg_family; + __u8 salg_type[14]; + __u32 salg_feat; + __u32 salg_mask; + __u8 salg_name[64]; +}; + +struct af_alg_iv { + __u32 ivlen; + __u8 iv[0]; +}; + +/* Socket options */ +#define ALG_SET_KEY 1 +#define ALG_SET_IV 2 +#define ALG_SET_OP 3 + +/* Operations */ +#define ALG_OP_DECRYPT 0 +#define ALG_OP_ENCRYPT 1 + +#endif /* _LINUX_IF_ALG_H */ -- cgit v1.2.3-71-gd317 From ecf7ace9a8450303a987aa8364e53860cd50e554 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Tue, 16 Nov 2010 15:21:07 +0100 Subject: kref: Add a kref_sub function Makes it possible to optimize batched multiple unrefs. Initial user will be drivers/gpu/ttm which accumulates unrefs to be processed outside of atomic code. 
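For illustration (not part of this patch; the object type and release function are made up), a caller that has accumulated several dropped references outside of atomic context can release them with a single atomic operation:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct my_obj {				/* hypothetical refcounted object */
	struct kref kref;
	/* ... payload ... */
};

static void my_obj_release(struct kref *kref)
{
	struct my_obj *obj = container_of(kref, struct my_obj, kref);

	kfree(obj);
}

/* Equivalent to calling kref_put() 'count' times, but costs only one
 * atomic_sub_and_test() on the refcount. */
static void my_obj_unref_many(struct my_obj *obj, unsigned int count)
{
	kref_sub(&obj->kref, count, my_obj_release);
}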
Signed-off-by: Thomas Hellstrom Signed-off-by: Dave Airlie --- include/linux/kref.h | 2 ++ lib/kref.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kref.h b/include/linux/kref.h index 6cc38fc07ab7..d4a62ab2ee5e 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -24,5 +24,7 @@ struct kref { void kref_init(struct kref *kref); void kref_get(struct kref *kref); int kref_put(struct kref *kref, void (*release) (struct kref *kref)); +int kref_sub(struct kref *kref, unsigned int count, + void (*release) (struct kref *kref)); #endif /* _KREF_H_ */ diff --git a/lib/kref.c b/lib/kref.c index d3d227a08a4b..3efb882b11db 100644 --- a/lib/kref.c +++ b/lib/kref.c @@ -62,6 +62,36 @@ int kref_put(struct kref *kref, void (*release)(struct kref *kref)) return 0; } + +/** + * kref_sub - subtract a number of refcounts for object. + * @kref: object. + * @count: Number of recounts to subtract. + * @release: pointer to the function that will clean up the object when the + * last reference to the object is released. + * This pointer is required, and it is not acceptable to pass kfree + * in as this function. + * + * Subtract @count from the refcount, and if 0, call release(). + * Return 1 if the object was removed, otherwise return 0. Beware, if this + * function returns 0, you still can not count on the kref from remaining in + * memory. Only use the return value if you want to see if the kref is now + * gone, not present. + */ +int kref_sub(struct kref *kref, unsigned int count, + void (*release)(struct kref *kref)) +{ + WARN_ON(release == NULL); + WARN_ON(release == (void (*)(struct kref *))kfree); + + if (atomic_sub_and_test((int) count, &kref->refcount)) { + release(kref); + return 1; + } + return 0; +} + EXPORT_SYMBOL(kref_init); EXPORT_SYMBOL(kref_get); EXPORT_SYMBOL(kref_put); +EXPORT_SYMBOL(kref_sub); -- cgit v1.2.3-71-gd317 From 23ed992a5ebe6964ebe312b54142fbc5e8185cdc Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 5 Nov 2010 18:04:52 +0100 Subject: drm/i915|intel-gtt: consolidate intel-gtt.h headers ... and a few other defines. 
Signed-off-by: Daniel Vetter Signed-off-by: Chris Wilson --- drivers/char/agp/intel-gtt.c | 5 ----- drivers/gpu/drm/i915/i915_gem.c | 1 - include/drm/intel-gtt.h | 12 ++++++++++++ include/linux/intel-gtt.h | 20 -------------------- 4 files changed, 12 insertions(+), 26 deletions(-) delete mode 100644 include/linux/intel-gtt.h (limited to 'include/linux') diff --git a/drivers/char/agp/intel-gtt.c b/drivers/char/agp/intel-gtt.c index 72267c801637..291ac5113576 100644 --- a/drivers/char/agp/intel-gtt.c +++ b/drivers/char/agp/intel-gtt.c @@ -24,7 +24,6 @@ #include #include "agp.h" #include "intel-agp.h" -#include #include /* @@ -39,10 +38,6 @@ #define USE_PCI_DMA_API 0 #endif -#define AGP_DCACHE_MEMORY 1 -#define AGP_PHYS_MEMORY 2 -#define INTEL_AGP_CACHED_MEMORY 3 - struct intel_gtt_driver { unsigned int gen : 8; unsigned int is_g33 : 1; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index bf05ac414b1f..68492357658c 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -34,7 +34,6 @@ #include #include #include -#include struct change_domains { uint32_t invalidate_domains; diff --git a/include/drm/intel-gtt.h b/include/drm/intel-gtt.h index 020f8aab7e5b..9f91cbe35157 100644 --- a/include/drm/intel-gtt.h +++ b/include/drm/intel-gtt.h @@ -13,5 +13,17 @@ const struct intel_gtt { unsigned int gtt_mappable_entries; } *intel_gtt_get(void); + +/* Special gtt memory types */ +#define AGP_DCACHE_MEMORY 1 +#define AGP_PHYS_MEMORY 2 + +/* New caching attributes for gen6/sandybridge */ +#define AGP_USER_CACHED_MEMORY_LLC_MLC (AGP_USER_TYPES + 2) +#define AGP_USER_UNCACHED_MEMORY (AGP_USER_TYPES + 4) + +/* flag for GFDT type */ +#define AGP_USER_CACHED_MEMORY_GFDT (1 << 3) + #endif diff --git a/include/linux/intel-gtt.h b/include/linux/intel-gtt.h deleted file mode 100644 index 1d19ab2afa39..000000000000 --- a/include/linux/intel-gtt.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Common Intel AGPGART and GTT definitions. - */ -#ifndef _INTEL_GTT_H -#define _INTEL_GTT_H - -#include - -/* This is for Intel only GTT controls. - * - * Sandybridge: AGP_USER_CACHED_MEMORY default to LLC only - */ - -#define AGP_USER_CACHED_MEMORY_LLC_MLC (AGP_USER_TYPES + 2) -#define AGP_USER_UNCACHED_MEMORY (AGP_USER_TYPES + 4) - -/* flag for GFDT type */ -#define AGP_USER_CACHED_MEMORY_GFDT (1 << 3) - -#endif -- cgit v1.2.3-71-gd317 From f050a8abbda0efcd597c6b1825e3b9ce4d613383 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 5 Nov 2010 18:40:56 +0100 Subject: agp: kill agp_flush_chipset and corresponding ioctl The intel drm calls the chipset functions now directly. Userspace never called the corresponding ioctl, hence it can be killed, too. 
Cc: Dave Airlie Signed-off-by: Daniel Vetter Signed-off-by: Chris Wilson --- drivers/char/agp/agp.h | 1 - drivers/char/agp/compat_ioctl.c | 1 - drivers/char/agp/compat_ioctl.h | 1 - drivers/char/agp/frontend.c | 8 -------- drivers/char/agp/generic.c | 7 ------- drivers/char/agp/intel-gtt.c | 6 ------ include/linux/agp_backend.h | 1 - 7 files changed, 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 5259065f3c79..3e67ddde9e16 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -120,7 +120,6 @@ struct agp_bridge_driver { void (*agp_destroy_page)(struct page *, int flags); void (*agp_destroy_pages)(struct agp_memory *); int (*agp_type_to_mask_type) (struct agp_bridge_data *, int); - void (*chipset_flush)(struct agp_bridge_data *); }; struct agp_bridge_data { diff --git a/drivers/char/agp/compat_ioctl.c b/drivers/char/agp/compat_ioctl.c index 9d2c97a69cdd..a48e05b31593 100644 --- a/drivers/char/agp/compat_ioctl.c +++ b/drivers/char/agp/compat_ioctl.c @@ -276,7 +276,6 @@ long compat_agp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case AGPIOC_CHIPSET_FLUSH32: - ret_val = agpioc_chipset_flush_wrap(curr_priv); break; } diff --git a/drivers/char/agp/compat_ioctl.h b/drivers/char/agp/compat_ioctl.h index 0c9678ac0371..f30e0fd97963 100644 --- a/drivers/char/agp/compat_ioctl.h +++ b/drivers/char/agp/compat_ioctl.h @@ -102,6 +102,5 @@ void agp_free_memory_wrap(struct agp_memory *memory); struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type); struct agp_memory *agp_find_mem_by_key(int key); struct agp_client *agp_find_client_by_pid(pid_t id); -int agpioc_chipset_flush_wrap(struct agp_file_private *priv); #endif /* _AGP_COMPAT_H */ diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c index 3cb4539a98b2..2e044338753c 100644 --- a/drivers/char/agp/frontend.c +++ b/drivers/char/agp/frontend.c @@ -957,13 +957,6 @@ static int agpioc_unbind_wrap(struct agp_file_private *priv, void __user *arg) return agp_unbind_memory(memory); } -int agpioc_chipset_flush_wrap(struct agp_file_private *priv) -{ - DBG(""); - agp_flush_chipset(agp_bridge); - return 0; -} - static long agp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1039,7 +1032,6 @@ static long agp_ioctl(struct file *file, break; case AGPIOC_CHIPSET_FLUSH: - ret_val = agpioc_chipset_flush_wrap(curr_priv); break; } diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 4956f1c8f9d5..78bc8de0f234 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -81,13 +81,6 @@ static int agp_get_key(void) return -1; } -void agp_flush_chipset(struct agp_bridge_data *bridge) -{ - if (bridge->driver->chipset_flush) - bridge->driver->chipset_flush(bridge); -} -EXPORT_SYMBOL(agp_flush_chipset); - /* * Use kmalloc if possible for the page list. Otherwise fall back to * vmalloc. 
This speeds things up and also saves memory for small AGP diff --git a/drivers/char/agp/intel-gtt.c b/drivers/char/agp/intel-gtt.c index 8e2e208c925b..1603e4f8ae73 100644 --- a/drivers/char/agp/intel-gtt.c +++ b/drivers/char/agp/intel-gtt.c @@ -984,11 +984,6 @@ static int intel_fake_agp_remove_entries(struct agp_memory *mem, return 0; } -static void intel_fake_agp_chipset_flush(struct agp_bridge_data *bridge) -{ - intel_private.driver->chipset_flush(); -} - static struct agp_memory *intel_fake_agp_alloc_by_type(size_t pg_count, int type) { @@ -1222,7 +1217,6 @@ static const struct agp_bridge_driver intel_fake_agp_driver = { .agp_alloc_pages = agp_generic_alloc_pages, .agp_destroy_page = agp_generic_destroy_page, .agp_destroy_pages = agp_generic_destroy_pages, - .chipset_flush = intel_fake_agp_chipset_flush, }; static const struct intel_gtt_driver i81x_gtt_driver = { diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index 09ea4a1e9505..a479b4885d25 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -106,6 +106,5 @@ extern int agp_rebind_memory(void); extern void agp_enable(struct agp_bridge_data *, u32); extern struct agp_bridge_data *agp_backend_acquire(struct pci_dev *); extern void agp_backend_release(struct agp_bridge_data *); -extern void agp_flush_chipset(struct agp_bridge_data *); #endif /* _AGP_BACKEND_H */ -- cgit v1.2.3-71-gd317 From cb16b67b5cb33b7d6732e0c416d29d933eea13ce Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 5 Nov 2010 22:27:10 +0100 Subject: agp: kill agp_rebind_memory Its only user, intel-gtt.c is now gone. Cc: Dave Airlie Signed-off-by: Daniel Vetter Signed-off-by: Chris Wilson --- drivers/char/agp/generic.c | 20 -------------------- include/linux/agp_backend.h | 1 - 2 files changed, 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 78bc8de0f234..012cba0d6d96 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -480,26 +480,6 @@ int agp_unbind_memory(struct agp_memory *curr) } EXPORT_SYMBOL(agp_unbind_memory); -/** - * agp_rebind_emmory - Rewrite the entire GATT, useful on resume - */ -int agp_rebind_memory(void) -{ - struct agp_memory *curr; - int ret_val = 0; - - spin_lock(&agp_bridge->mapped_lock); - list_for_each_entry(curr, &agp_bridge->mapped_list, mapped_list) { - ret_val = curr->bridge->driver->insert_memory(curr, - curr->pg_start, - curr->type); - if (ret_val != 0) - break; - } - spin_unlock(&agp_bridge->mapped_lock); - return ret_val; -} -EXPORT_SYMBOL(agp_rebind_memory); /* End - Routines for handling swapping of agp_memory into the GATT */ diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index a479b4885d25..eaf6cd75a1b1 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -102,7 +102,6 @@ extern struct agp_memory *agp_allocate_memory(struct agp_bridge_data *, size_t, extern int agp_copy_info(struct agp_bridge_data *, struct agp_kern_info *); extern int agp_bind_memory(struct agp_memory *, off_t); extern int agp_unbind_memory(struct agp_memory *); -extern int agp_rebind_memory(void); extern void agp_enable(struct agp_bridge_data *, u32); extern struct agp_bridge_data *agp_backend_acquire(struct pci_dev *); extern void agp_backend_release(struct agp_bridge_data *); -- cgit v1.2.3-71-gd317 From 559e0df6b3ffbc218a11bb9dada5320a217cb7a6 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 31 Aug 2010 19:25:12 +0100 Subject: mfd: Add initial WM8958 support The WM8958 
is a derivative of the WM8994 which is register compatible with the addition of some extra features, mostly in the CODEC side. The major change visible at the MFD level is that rather than a single DBVDD supply we now have three separate DBVDDs so we must request and enable a different set of supplies. Signed-off-by: Mark Brown Acked-by: Samuel Ortiz Acked-by: Liam Girdwood --- drivers/mfd/wm8994-core.c | 93 ++++++++++++++++++++++++++++++++--------- include/linux/mfd/wm8994/core.h | 8 ++++ 2 files changed, 82 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/wm8994-core.c b/drivers/mfd/wm8994-core.c index b3b2aaf89dbe..8d221ba5e38d 100644 --- a/drivers/mfd/wm8994-core.c +++ b/drivers/mfd/wm8994-core.c @@ -218,6 +218,18 @@ static const char *wm8994_main_supplies[] = { "SPKVDD2", }; +static const char *wm8958_main_supplies[] = { + "DBVDD1", + "DBVDD2", + "DBVDD3", + "DCVDD", + "AVDD1", + "AVDD2", + "CPVDD", + "SPKVDD1", + "SPKVDD2", +}; + #ifdef CONFIG_PM static int wm8994_device_suspend(struct device *dev) { @@ -239,7 +251,7 @@ static int wm8994_device_suspend(struct device *dev) if (ret < 0) dev_err(dev, "Failed to save LDO registers: %d\n", ret); - ret = regulator_bulk_disable(ARRAY_SIZE(wm8994_main_supplies), + ret = regulator_bulk_disable(wm8994->num_supplies, wm8994->supplies); if (ret != 0) { dev_err(dev, "Failed to disable supplies: %d\n", ret); @@ -254,7 +266,7 @@ static int wm8994_device_resume(struct device *dev) struct wm8994 *wm8994 = dev_get_drvdata(dev); int ret; - ret = regulator_bulk_enable(ARRAY_SIZE(wm8994_main_supplies), + ret = regulator_bulk_enable(wm8994->num_supplies, wm8994->supplies); if (ret != 0) { dev_err(dev, "Failed to enable supplies: %d\n", ret); @@ -305,9 +317,10 @@ static int wm8994_ldo_in_use(struct wm8994_pdata *pdata, int ldo) /* * Instantiate the generic non-control parts of the device. 
*/ -static int wm8994_device_init(struct wm8994 *wm8994, unsigned long id, int irq) +static int wm8994_device_init(struct wm8994 *wm8994, int irq) { struct wm8994_pdata *pdata = wm8994->dev->platform_data; + const char *devname; int ret, i; mutex_init(&wm8994->io_lock); @@ -323,25 +336,48 @@ static int wm8994_device_init(struct wm8994 *wm8994, unsigned long id, int irq) goto err; } + switch (wm8994->type) { + case WM8994: + wm8994->num_supplies = ARRAY_SIZE(wm8994_main_supplies); + break; + case WM8958: + wm8994->num_supplies = ARRAY_SIZE(wm8958_main_supplies); + break; + default: + BUG(); + return -EINVAL; + } + wm8994->supplies = kzalloc(sizeof(struct regulator_bulk_data) * - ARRAY_SIZE(wm8994_main_supplies), + wm8994->num_supplies, GFP_KERNEL); if (!wm8994->supplies) { ret = -ENOMEM; goto err; } - for (i = 0; i < ARRAY_SIZE(wm8994_main_supplies); i++) - wm8994->supplies[i].supply = wm8994_main_supplies[i]; - - ret = regulator_bulk_get(wm8994->dev, ARRAY_SIZE(wm8994_main_supplies), + switch (wm8994->type) { + case WM8994: + for (i = 0; i < ARRAY_SIZE(wm8994_main_supplies); i++) + wm8994->supplies[i].supply = wm8994_main_supplies[i]; + break; + case WM8958: + for (i = 0; i < ARRAY_SIZE(wm8958_main_supplies); i++) + wm8994->supplies[i].supply = wm8958_main_supplies[i]; + break; + default: + BUG(); + return -EINVAL; + } + + ret = regulator_bulk_get(wm8994->dev, wm8994->num_supplies, wm8994->supplies); if (ret != 0) { dev_err(wm8994->dev, "Failed to get supplies: %d\n", ret); goto err_supplies; } - ret = regulator_bulk_enable(ARRAY_SIZE(wm8994_main_supplies), + ret = regulator_bulk_enable(wm8994->num_supplies, wm8994->supplies); if (ret != 0) { dev_err(wm8994->dev, "Failed to enable supplies: %d\n", ret); @@ -353,7 +389,22 @@ static int wm8994_device_init(struct wm8994 *wm8994, unsigned long id, int irq) dev_err(wm8994->dev, "Failed to read ID register\n"); goto err_enable; } - if (ret != 0x8994) { + switch (ret) { + case 0x8994: + devname = "WM8994"; + if (wm8994->type != WM8994) + dev_warn(wm8994->dev, "Device registered as type %d\n", + wm8994->type); + wm8994->type = WM8994; + break; + case 0x8958: + devname = "WM8958"; + if (wm8994->type != WM8958) + dev_warn(wm8994->dev, "Device registered as type %d\n", + wm8994->type); + wm8994->type = WM8958; + break; + default: dev_err(wm8994->dev, "Device is not a WM8994, ID is %x\n", ret); ret = -EINVAL; @@ -370,14 +421,16 @@ static int wm8994_device_init(struct wm8994 *wm8994, unsigned long id, int irq) switch (ret) { case 0: case 1: - dev_warn(wm8994->dev, "revision %c not fully supported\n", - 'A' + ret); + if (wm8994->type == WM8994) + dev_warn(wm8994->dev, + "revision %c not fully supported\n", + 'A' + ret); break; default: - dev_info(wm8994->dev, "revision %c\n", 'A' + ret); break; } + dev_info(wm8994->dev, "%s revision %c\n", devname, 'A' + ret); if (pdata) { wm8994->irq_base = pdata->irq_base; @@ -423,10 +476,10 @@ static int wm8994_device_init(struct wm8994 *wm8994, unsigned long id, int irq) err_irq: wm8994_irq_exit(wm8994); err_enable: - regulator_bulk_disable(ARRAY_SIZE(wm8994_main_supplies), + regulator_bulk_disable(wm8994->num_supplies, wm8994->supplies); err_get: - regulator_bulk_free(ARRAY_SIZE(wm8994_main_supplies), wm8994->supplies); + regulator_bulk_free(wm8994->num_supplies, wm8994->supplies); err_supplies: kfree(wm8994->supplies); err: @@ -439,9 +492,9 @@ static void wm8994_device_exit(struct wm8994 *wm8994) { mfd_remove_devices(wm8994->dev); wm8994_irq_exit(wm8994); - regulator_bulk_disable(ARRAY_SIZE(wm8994_main_supplies), 
+ regulator_bulk_disable(wm8994->num_supplies, wm8994->supplies); - regulator_bulk_free(ARRAY_SIZE(wm8994_main_supplies), wm8994->supplies); + regulator_bulk_free(wm8994->num_supplies, wm8994->supplies); kfree(wm8994->supplies); kfree(wm8994); } @@ -506,8 +559,9 @@ static int wm8994_i2c_probe(struct i2c_client *i2c, wm8994->read_dev = wm8994_i2c_read_device; wm8994->write_dev = wm8994_i2c_write_device; wm8994->irq = i2c->irq; + wm8994->type = id->driver_data; - return wm8994_device_init(wm8994, id->driver_data, i2c->irq); + return wm8994_device_init(wm8994, i2c->irq); } static int wm8994_i2c_remove(struct i2c_client *i2c) @@ -535,7 +589,8 @@ static int wm8994_i2c_resume(struct i2c_client *i2c) #endif static const struct i2c_device_id wm8994_i2c_id[] = { - { "wm8994", 0 }, + { "wm8994", WM8994 }, + { "wm8958", WM8958 }, { } }; MODULE_DEVICE_TABLE(i2c, wm8994_i2c_id); diff --git a/include/linux/mfd/wm8994/core.h b/include/linux/mfd/wm8994/core.h index de79baee4925..3fd36845ca45 100644 --- a/include/linux/mfd/wm8994/core.h +++ b/include/linux/mfd/wm8994/core.h @@ -17,6 +17,11 @@ #include +enum wm8994_type { + WM8994 = 0, + WM8958 = 1, +}; + struct regulator_dev; struct regulator_bulk_data; @@ -48,6 +53,8 @@ struct wm8994 { struct mutex io_lock; struct mutex irq_lock; + enum wm8994_type type; + struct device *dev; int (*read_dev)(struct wm8994 *wm8994, unsigned short reg, int bytes, void *dest); @@ -68,6 +75,7 @@ struct wm8994 { u16 gpio_regs[WM8994_NUM_GPIO_REGS]; struct regulator_dev *dbvdd; + int num_supplies; struct regulator_bulk_data *supplies; }; -- cgit v1.2.3-71-gd317 From c4431df050ff124cae7716e301cead1e8f33c575 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 26 Nov 2010 15:21:07 +0000 Subject: ASoC: Implement support for enhanced AIF3 on WM8958 Additional audio routing options are available on the WM8958 audio interface 3. Add support for these. 
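Purely as a hedged user-space illustration (not part of this patch; the card identifier and the assumption that the DAPM muxes are exported verbatim as mixer controls named "Mono PCM Out Mux" and "AIF3ADC Mux" are mine), routing the new mono PCM path out of AIF3 could look like this with alsa-lib:

#include <alsa/asoundlib.h>

/* Set an enumerated mixer control to the item at 'index'. */
static int set_enum_ctl(snd_ctl_t *ctl, const char *name, unsigned int index)
{
	snd_ctl_elem_id_t *id;
	snd_ctl_elem_value_t *val;

	snd_ctl_elem_id_alloca(&id);
	snd_ctl_elem_value_alloca(&val);

	snd_ctl_elem_id_set_interface(id, SND_CTL_ELEM_IFACE_MIXER);
	snd_ctl_elem_id_set_name(id, name);
	snd_ctl_elem_value_set_id(val, id);
	snd_ctl_elem_value_set_enumerated(val, 0, index);

	return snd_ctl_elem_write(ctl, val);
}

int route_mono_pcm_to_aif3(void)
{
	snd_ctl_t *ctl;
	int err;

	err = snd_ctl_open(&ctl, "hw:0", 0);	/* assumed card number */
	if (err < 0)
		return err;

	/* "Mono PCM Out Mux": 0 = None, 1 = AIF2ADCL, 2 = AIF2ADCR */
	err = set_enum_ctl(ctl, "Mono PCM Out Mux", 1);
	if (err >= 0)
		/* "AIF3ADC Mux": 3 = "Mono PCM", after AIF1ADCDAT,
		 * AIF2ADCDAT and AIF2DACDAT */
		err = set_enum_ctl(ctl, "AIF3ADC Mux", 3);

	snd_ctl_close(ctl);
	return err < 0 ? err : 0;
}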
Signed-off-by: Mark Brown Acked-by: Liam Girdwood --- include/linux/mfd/wm8994/registers.h | 67 ++++++++++++++ sound/soc/codecs/wm8994.c | 166 +++++++++++++++++++++++++++++++++-- 2 files changed, 224 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h index 967f62f54159..3eb70a4e7681 100644 --- a/include/linux/mfd/wm8994/registers.h +++ b/include/linux/mfd/wm8994/registers.h @@ -109,6 +109,10 @@ #define WM8994_AIF2DAC_LRCLK 0x315 #define WM8994_AIF2DAC_DATA 0x316 #define WM8994_AIF2ADC_DATA 0x317 +#define WM8958_AIF3_CONTROL_1 0x320 +#define WM8958_AIF3_CONTROL_2 0x321 +#define WM8958_AIF3DAC_DATA 0x322 +#define WM8958_AIF3ADC_DATA 0x323 #define WM8994_AIF1_ADC1_LEFT_VOLUME 0x400 #define WM8994_AIF1_ADC1_RIGHT_VOLUME 0x401 #define WM8994_AIF1_DAC1_LEFT_VOLUME 0x402 @@ -992,6 +996,12 @@ /* * R6 (0x06) - Power Management (6) */ +#define WM8958_AIF3ADC_SRC_MASK 0x0600 /* AIF3ADC_SRC - [10:9] */ +#define WM8958_AIF3ADC_SRC_SHIFT 9 /* AIF3ADC_SRC - [10:9] */ +#define WM8958_AIF3ADC_SRC_WIDTH 2 /* AIF3ADC_SRC - [10:9] */ +#define WM8958_AIF2DAC_SRC_MASK 0x0180 /* AIF2DAC_SRC - [8:7] */ +#define WM8958_AIF2DAC_SRC_SHIFT 7 /* AIF2DAC_SRC - [8:7] */ +#define WM8958_AIF2DAC_SRC_WIDTH 2 /* AIF2DAC_SRC - [8:7] */ #define WM8994_AIF3_TRI 0x0020 /* AIF3_TRI */ #define WM8994_AIF3_TRI_MASK 0x0020 /* AIF3_TRI */ #define WM8994_AIF3_TRI_SHIFT 5 /* AIF3_TRI */ @@ -2552,6 +2562,63 @@ #define WM8994_AIF2ADCR_DAT_INV_SHIFT 0 /* AIF2ADCR_DAT_INV */ #define WM8994_AIF2ADCR_DAT_INV_WIDTH 1 /* AIF2ADCR_DAT_INV */ +/* + * R800 (0x320) - AIF3 Control (1) + */ +#define WM8958_AIF3_LRCLK_INV 0x0080 /* AIF3_LRCLK_INV */ +#define WM8958_AIF3_LRCLK_INV_MASK 0x0080 /* AIF3_LRCLK_INV */ +#define WM8958_AIF3_LRCLK_INV_SHIFT 7 /* AIF3_LRCLK_INV */ +#define WM8958_AIF3_LRCLK_INV_WIDTH 1 /* AIF3_LRCLK_INV */ +#define WM8958_AIF3_WL_MASK 0x0060 /* AIF3_WL - [6:5] */ +#define WM8958_AIF3_WL_SHIFT 5 /* AIF3_WL - [6:5] */ +#define WM8958_AIF3_WL_WIDTH 2 /* AIF3_WL - [6:5] */ +#define WM8958_AIF3_FMT_MASK 0x0018 /* AIF3_FMT - [4:3] */ +#define WM8958_AIF3_FMT_SHIFT 3 /* AIF3_FMT - [4:3] */ +#define WM8958_AIF3_FMT_WIDTH 2 /* AIF3_FMT - [4:3] */ + +/* + * R801 (0x321) - AIF3 Control (2) + */ +#define WM8958_AIF3DAC_BOOST_MASK 0x0C00 /* AIF3DAC_BOOST - [11:10] */ +#define WM8958_AIF3DAC_BOOST_SHIFT 10 /* AIF3DAC_BOOST - [11:10] */ +#define WM8958_AIF3DAC_BOOST_WIDTH 2 /* AIF3DAC_BOOST - [11:10] */ +#define WM8958_AIF3DAC_COMP 0x0010 /* AIF3DAC_COMP */ +#define WM8958_AIF3DAC_COMP_MASK 0x0010 /* AIF3DAC_COMP */ +#define WM8958_AIF3DAC_COMP_SHIFT 4 /* AIF3DAC_COMP */ +#define WM8958_AIF3DAC_COMP_WIDTH 1 /* AIF3DAC_COMP */ +#define WM8958_AIF3DAC_COMPMODE 0x0008 /* AIF3DAC_COMPMODE */ +#define WM8958_AIF3DAC_COMPMODE_MASK 0x0008 /* AIF3DAC_COMPMODE */ +#define WM8958_AIF3DAC_COMPMODE_SHIFT 3 /* AIF3DAC_COMPMODE */ +#define WM8958_AIF3DAC_COMPMODE_WIDTH 1 /* AIF3DAC_COMPMODE */ +#define WM8958_AIF3ADC_COMP 0x0004 /* AIF3ADC_COMP */ +#define WM8958_AIF3ADC_COMP_MASK 0x0004 /* AIF3ADC_COMP */ +#define WM8958_AIF3ADC_COMP_SHIFT 2 /* AIF3ADC_COMP */ +#define WM8958_AIF3ADC_COMP_WIDTH 1 /* AIF3ADC_COMP */ +#define WM8958_AIF3ADC_COMPMODE 0x0002 /* AIF3ADC_COMPMODE */ +#define WM8958_AIF3ADC_COMPMODE_MASK 0x0002 /* AIF3ADC_COMPMODE */ +#define WM8958_AIF3ADC_COMPMODE_SHIFT 1 /* AIF3ADC_COMPMODE */ +#define WM8958_AIF3ADC_COMPMODE_WIDTH 1 /* AIF3ADC_COMPMODE */ +#define WM8958_AIF3_LOOPBACK 0x0001 /* AIF3_LOOPBACK */ +#define 
WM8958_AIF3_LOOPBACK_MASK 0x0001 /* AIF3_LOOPBACK */ +#define WM8958_AIF3_LOOPBACK_SHIFT 0 /* AIF3_LOOPBACK */ +#define WM8958_AIF3_LOOPBACK_WIDTH 1 /* AIF3_LOOPBACK */ + +/* + * R802 (0x322) - AIF3DAC Data + */ +#define WM8958_AIF3DAC_DAT_INV 0x0001 /* AIF3DAC_DAT_INV */ +#define WM8958_AIF3DAC_DAT_INV_MASK 0x0001 /* AIF3DAC_DAT_INV */ +#define WM8958_AIF3DAC_DAT_INV_SHIFT 0 /* AIF3DAC_DAT_INV */ +#define WM8958_AIF3DAC_DAT_INV_WIDTH 1 /* AIF3DAC_DAT_INV */ + +/* + * R803 (0x323) - AIF3ADC Data + */ +#define WM8958_AIF3ADC_DAT_INV 0x0001 /* AIF3ADC_DAT_INV */ +#define WM8958_AIF3ADC_DAT_INV_MASK 0x0001 /* AIF3ADC_DAT_INV */ +#define WM8958_AIF3ADC_DAT_INV_SHIFT 0 /* AIF3ADC_DAT_INV */ +#define WM8958_AIF3ADC_DAT_INV_WIDTH 1 /* AIF3ADC_DAT_INV */ + /* * R1024 (0x400) - AIF1 ADC1 Left Volume */ diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 8232d5e73194..fb0609315cd6 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -647,6 +647,10 @@ SOC_SINGLE_TLV("AIF2 EQ5 Volume", WM8994_AIF2_EQ_GAINS_2, 6, 31, 0, eq_tlv), }; +static const struct snd_kcontrol_new wm8958_snd_controls[] = { +SOC_SINGLE_TLV("AIF3 Boost Volume", WM8958_AIF3_CONTROL_2, 10, 3, 0, aif_tlv), +}; + static int clk_sys_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) { @@ -953,14 +957,47 @@ static const struct snd_kcontrol_new aif2adc_mux = SOC_DAPM_ENUM("AIF2ADC Mux", aif2adc_enum); static const char *aif3adc_text[] = { - "AIF1ADCDAT", "AIF2ADCDAT", "AIF2DACDAT", + "AIF1ADCDAT", "AIF2ADCDAT", "AIF2DACDAT", "Mono PCM", }; -static const struct soc_enum aif3adc_enum = +static const struct soc_enum wm8994_aif3adc_enum = SOC_ENUM_SINGLE(WM8994_POWER_MANAGEMENT_6, 3, 3, aif3adc_text); -static const struct snd_kcontrol_new aif3adc_mux = - SOC_DAPM_ENUM("AIF3ADC Mux", aif3adc_enum); +static const struct snd_kcontrol_new wm8994_aif3adc_mux = + SOC_DAPM_ENUM("AIF3ADC Mux", wm8994_aif3adc_enum); + +static const struct soc_enum wm8958_aif3adc_enum = + SOC_ENUM_SINGLE(WM8994_POWER_MANAGEMENT_6, 3, 4, aif3adc_text); + +static const struct snd_kcontrol_new wm8958_aif3adc_mux = + SOC_DAPM_ENUM("AIF3ADC Mux", wm8958_aif3adc_enum); + +static const char *mono_pcm_out_text[] = { + "None", "AIF2ADCL", "AIF2ADCR", +}; + +static const struct soc_enum mono_pcm_out_enum = + SOC_ENUM_SINGLE(WM8994_POWER_MANAGEMENT_6, 9, 3, mono_pcm_out_text); + +static const struct snd_kcontrol_new mono_pcm_out_mux = + SOC_DAPM_ENUM("Mono PCM Out Mux", mono_pcm_out_enum); + +static const char *aif2dac_src_text[] = { + "AIF2", "AIF3", +}; + +/* Note that these two control shouldn't be simultaneously switched to AIF3 */ +static const struct soc_enum aif2dacl_src_enum = + SOC_ENUM_SINGLE(WM8994_POWER_MANAGEMENT_6, 7, 2, aif2dac_src_text); + +static const struct snd_kcontrol_new aif2dacl_src_mux = + SOC_DAPM_ENUM("AIF2DACL Mux", aif2dacl_src_enum); + +static const struct soc_enum aif2dacr_src_enum = + SOC_ENUM_SINGLE(WM8994_POWER_MANAGEMENT_6, 8, 2, aif2dac_src_text); + +static const struct snd_kcontrol_new aif2dacr_src_mux = + SOC_DAPM_ENUM("AIF2DACR Mux", aif2dacr_src_enum); static const struct snd_soc_dapm_widget wm8994_dapm_widgets[] = { SND_SOC_DAPM_INPUT("DMIC1DAT"), @@ -1034,7 +1071,6 @@ SND_SOC_DAPM_AIF_OUT("AIF2ADCDAT", "AIF2 Capture", 0, SND_SOC_NOPM, 0, 0), SND_SOC_DAPM_MUX("AIF1DAC Mux", SND_SOC_NOPM, 0, 0, &aif1dac_mux), SND_SOC_DAPM_MUX("AIF2DAC Mux", SND_SOC_NOPM, 0, 0, &aif2dac_mux), SND_SOC_DAPM_MUX("AIF2ADC Mux", SND_SOC_NOPM, 0, 0, &aif2adc_mux), -SND_SOC_DAPM_MUX("AIF3ADC Mux", 
SND_SOC_NOPM, 0, 0, &aif3adc_mux), SND_SOC_DAPM_AIF_IN("AIF3DACDAT", "AIF3 Playback", 0, SND_SOC_NOPM, 0, 0), SND_SOC_DAPM_AIF_IN("AIF3ADCDAT", "AIF3 Capture", 0, SND_SOC_NOPM, 0, 0), @@ -1072,8 +1108,18 @@ SND_SOC_DAPM_MIXER("SPKR", WM8994_POWER_MANAGEMENT_3, 9, 0, SND_SOC_DAPM_POST("Debug log", post_ev), }; -static const struct snd_soc_dapm_route intercon[] = { +static const struct snd_soc_dapm_widget wm8994_specific_dapm_widgets[] = { +SND_SOC_DAPM_MUX("AIF3ADC Mux", SND_SOC_NOPM, 0, 0, &wm8994_aif3adc_mux), +}; +static const struct snd_soc_dapm_widget wm8958_dapm_widgets[] = { +SND_SOC_DAPM_MUX("Mono PCM Out Mux", SND_SOC_NOPM, 0, 0, &mono_pcm_out_mux), +SND_SOC_DAPM_MUX("AIF2DACL Mux", SND_SOC_NOPM, 0, 0, &aif2dacl_src_mux), +SND_SOC_DAPM_MUX("AIF2DACR Mux", SND_SOC_NOPM, 0, 0, &aif2dacr_src_mux), +SND_SOC_DAPM_MUX("AIF3ADC Mux", SND_SOC_NOPM, 0, 0, &wm8958_aif3adc_mux), +}; + +static const struct snd_soc_dapm_route intercon[] = { { "CLK_SYS", NULL, "AIF1CLK", check_clk_sys }, { "CLK_SYS", NULL, "AIF2CLK", check_clk_sys }, @@ -1181,9 +1227,6 @@ static const struct snd_soc_dapm_route intercon[] = { { "AIF1DAC2L", NULL, "AIF1DAC Mux" }, { "AIF1DAC2R", NULL, "AIF1DAC Mux" }, - { "AIF2DACL", NULL, "AIF2DAC Mux" }, - { "AIF2DACR", NULL, "AIF2DAC Mux" }, - { "AIF1DAC Mux", "AIF1DACDAT", "AIF1DACDAT" }, { "AIF1DAC Mux", "AIF3DACDAT", "AIF3DACDAT" }, { "AIF2DAC Mux", "AIF2DACDAT", "AIF2DACDAT" }, @@ -1256,6 +1299,26 @@ static const struct snd_soc_dapm_route intercon[] = { { "Right Headphone Mux", "DAC", "DAC1R" }, }; +static const struct snd_soc_dapm_route wm8994_intercon[] = { + { "AIF2DACL", NULL, "AIF2DAC Mux" }, + { "AIF2DACR", NULL, "AIF2DAC Mux" }, +}; + +static const struct snd_soc_dapm_route wm8958_intercon[] = { + { "AIF2DACL", NULL, "AIF2DACL Mux" }, + { "AIF2DACR", NULL, "AIF2DACR Mux" }, + + { "AIF2DACL Mux", "AIF2", "AIF2DAC Mux" }, + { "AIF2DACL Mux", "AIF3", "AIF3DACDAT" }, + { "AIF2DACR Mux", "AIF2", "AIF2DAC Mux" }, + { "AIF2DACR Mux", "AIF3", "AIF3DACDAT" }, + + { "Mono PCM Out Mux", "AIF2ADCL", "AIF2ADCL" }, + { "Mono PCM Out Mux", "AIF2ADCR", "AIF2ADCR" }, + + { "AIF3ADC Mux", "Mono PCM", "Mono PCM Out Mux" }, +}; + /* The size in bits of the FLL divide multiplied by 10 * to allow rounding later */ #define FIXED_FLL_SIZE ((1 << 16) * 10) @@ -1635,6 +1698,7 @@ static int wm8994_set_bias_level(struct snd_soc_codec *codec, static int wm8994_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt) { struct snd_soc_codec *codec = dai->codec; + struct wm8994 *control = codec->control_data; int ms_reg; int aif1_reg; int ms = 0; @@ -1719,6 +1783,13 @@ static int wm8994_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt) return -EINVAL; } + /* The AIF2 format configuration needs to be mirrored to AIF3 + * on WM8958 if it's in use so just do it all the time. 
*/ + if (control->type == WM8958 && dai->id == 2) + snd_soc_update_bits(codec, WM8958_AIF3_CONTROL_1, + WM8994_AIF1_LRCLK_INV | + WM8958_AIF3_FMT_MASK, aif1); + snd_soc_update_bits(codec, aif1_reg, WM8994_AIF1_BCLK_INV | WM8994_AIF1_LRCLK_INV | WM8994_AIF1_FMT_MASK, @@ -1759,6 +1830,7 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct snd_soc_codec *codec = dai->codec; + struct wm8994 *control = codec->control_data; struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); int aif1_reg; int bclk_reg; @@ -1797,6 +1869,14 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, dev_dbg(codec->dev, "AIF2 using split LRCLK\n"); } break; + case 3: + switch (control->type) { + case WM8958: + aif1_reg = WM8958_AIF3_CONTROL_1; + break; + default: + return 0; + } default: return -EINVAL; } @@ -1900,6 +1980,47 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, return 0; } +static int wm8994_aif3_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct snd_soc_codec *codec = dai->codec; + struct wm8994 *control = codec->control_data; + int aif1_reg; + int aif1 = 0; + + switch (dai->id) { + case 3: + switch (control->type) { + case WM8958: + aif1_reg = WM8958_AIF3_CONTROL_1; + break; + default: + return 0; + } + default: + return 0; + } + + switch (params_format(params)) { + case SNDRV_PCM_FORMAT_S16_LE: + break; + case SNDRV_PCM_FORMAT_S20_3LE: + aif1 |= 0x20; + break; + case SNDRV_PCM_FORMAT_S24_LE: + aif1 |= 0x40; + break; + case SNDRV_PCM_FORMAT_S32_LE: + aif1 |= 0x60; + break; + default: + return -EINVAL; + } + + return snd_soc_update_bits(codec, aif1_reg, WM8994_AIF1_WL_MASK, aif1); +} + static int wm8994_aif_mute(struct snd_soc_dai *codec_dai, int mute) { struct snd_soc_codec *codec = codec_dai->codec; @@ -1981,6 +2102,7 @@ static struct snd_soc_dai_ops wm8994_aif2_dai_ops = { }; static struct snd_soc_dai_ops wm8994_aif3_dai_ops = { + .hw_params = wm8994_aif3_hw_params, .set_tristate = wm8994_set_tristate, }; @@ -2511,9 +2633,35 @@ static int wm8994_codec_probe(struct snd_soc_codec *codec) ARRAY_SIZE(wm8994_snd_controls)); snd_soc_dapm_new_controls(dapm, wm8994_dapm_widgets, ARRAY_SIZE(wm8994_dapm_widgets)); + + switch (control->type) { + case WM8994: + snd_soc_dapm_new_controls(dapm, wm8994_specific_dapm_widgets, + ARRAY_SIZE(wm8994_specific_dapm_widgets)); + break; + case WM8958: + snd_soc_add_controls(codec, wm8958_snd_controls, + ARRAY_SIZE(wm8958_snd_controls)); + snd_soc_dapm_new_controls(dapm, wm8958_dapm_widgets, + ARRAY_SIZE(wm8958_dapm_widgets)); + break; + } + + wm_hubs_add_analogue_routes(codec, 0, 0); snd_soc_dapm_add_routes(dapm, intercon, ARRAY_SIZE(intercon)); + switch (control->type) { + case WM8994: + snd_soc_dapm_add_routes(dapm, wm8994_intercon, + ARRAY_SIZE(wm8994_intercon)); + break; + case WM8958: + snd_soc_dapm_add_routes(dapm, wm8958_intercon, + ARRAY_SIZE(wm8958_intercon)); + break; + } + return 0; err_irq: -- cgit v1.2.3-71-gd317 From d6addcc9d88aeac4a0cc63a06d36baef04f5dc3b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 26 Nov 2010 15:21:08 +0000 Subject: ASoC: Add WM8958 Multi-band compressor support The WM8958 features a multi-band compressor which can be enabled on any of the AIF inputs. The MBC allows different gains to be applied to different audio bands, providing an improvement in perceived loudness of the signal by avoiding overdriving the output transducers. This patch enables support for the MBC. 
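For orientation, the enable path added here reduces to the sequence sketched below. This is only a simplified sketch of the wm8958_mbc_apply() helper in the diff that follows, with the power bookkeeping and the disable path omitted; the function name wm8958_mbc_start_sketch() and its arguments are illustrative, while the registers, bitfields and the snd_soc_update_bits()/snd_soc_write() calls are the ones used by the patch (assuming <sound/soc.h> and the WM8994 register header extended by this patch):

	/*
	 * Sketch of the MBC start-up sequence: clock DSP2 from the AIF
	 * carrying the audio to be processed, start the DSP2 firmware,
	 * then point the MBC at the selected DAC path.  'aif' is 0 for
	 * AIF1 or 1 for AIF2; 'mbc' selects AIF1DAC1, AIF1DAC2 or AIF2DAC.
	 */
	static void wm8958_mbc_start_sketch(struct snd_soc_codec *codec,
					    int aif, int mbc)
	{
		/* Clock DSP2 from the appropriate AIF */
		snd_soc_update_bits(codec, WM8994_CLOCKING_1,
				    WM8958_DSP2CLK_SRC | WM8958_DSP2CLK_ENA,
				    aif << WM8958_DSP2CLK_SRC_SHIFT |
				    WM8958_DSP2CLK_ENA);

		/* Enable and run the DSP2 firmware */
		snd_soc_update_bits(codec, WM8958_DSP2_PROGRAM,
				    WM8958_DSP2_ENA, WM8958_DSP2_ENA);
		snd_soc_write(codec, WM8958_DSP2_EXECCONTROL, WM8958_DSP2_RUNR);

		/* Select the path to compress and enable the MBC */
		snd_soc_update_bits(codec, WM8958_DSP2_CONFIG,
				    WM8958_MBC_ENA | WM8958_MBC_SEL_MASK,
				    mbc << WM8958_MBC_SEL_SHIFT | WM8958_MBC_ENA);
	}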
Signed-off-by: Mark Brown Acked-by: Liam Girdwood --- include/linux/mfd/wm8994/registers.h | 115 ++++++++++++++++++++ sound/soc/codecs/wm8994.c | 198 ++++++++++++++++++++++++++++++++--- 2 files changed, 301 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h index 3eb70a4e7681..423b2b5c94ea 100644 --- a/include/linux/mfd/wm8994/registers.h +++ b/include/linux/mfd/wm8994/registers.h @@ -246,6 +246,15 @@ #define WM8994_INTERRUPT_STATUS_2_MASK 0x739 #define WM8994_INTERRUPT_CONTROL 0x740 #define WM8994_IRQ_DEBOUNCE 0x748 +#define WM8958_DSP2_PROGRAM 0x900 +#define WM8958_DSP2_CONFIG 0x901 +#define WM8958_DSP2_MAGICNUM 0xA00 +#define WM8958_DSP2_RELEASEYEAR 0xA01 +#define WM8958_DSP2_RELEASEMONTHDAY 0xA02 +#define WM8958_DSP2_RELEASETIME 0xA03 +#define WM8958_DSP2_VERMAJMIN 0xA04 +#define WM8958_DSP2_VERBUILD 0xA05 +#define WM8958_DSP2_EXECCONTROL 0xA0D #define WM8994_WRITE_SEQUENCER_0 0x3000 #define WM8994_WRITE_SEQUENCER_1 0x3001 #define WM8994_WRITE_SEQUENCER_2 0x3002 @@ -2079,6 +2088,14 @@ /* * R520 (0x208) - Clocking (1) */ +#define WM8958_DSP2CLK_ENA 0x4000 /* DSP2CLK_ENA */ +#define WM8958_DSP2CLK_ENA_MASK 0x4000 /* DSP2CLK_ENA */ +#define WM8958_DSP2CLK_ENA_SHIFT 14 /* DSP2CLK_ENA */ +#define WM8958_DSP2CLK_ENA_WIDTH 1 /* DSP2CLK_ENA */ +#define WM8958_DSP2CLK_SRC 0x1000 /* DSP2CLK_SRC */ +#define WM8958_DSP2CLK_SRC_MASK 0x1000 /* DSP2CLK_SRC */ +#define WM8958_DSP2CLK_SRC_SHIFT 12 /* DSP2CLK_SRC */ +#define WM8958_DSP2CLK_SRC_WIDTH 1 /* DSP2CLK_SRC */ #define WM8994_TOCLK_ENA 0x0010 /* TOCLK_ENA */ #define WM8994_TOCLK_ENA_MASK 0x0010 /* TOCLK_ENA */ #define WM8994_TOCLK_ENA_SHIFT 4 /* TOCLK_ENA */ @@ -4356,4 +4373,102 @@ #define WM8994_TEMP_SHUT_DB_SHIFT 0 /* TEMP_SHUT_DB */ #define WM8994_TEMP_SHUT_DB_WIDTH 1 /* TEMP_SHUT_DB */ +/* + * R2304 (0x900) - DSP2_Program + */ +#define WM8958_DSP2_ENA 0x0001 /* DSP2_ENA */ +#define WM8958_DSP2_ENA_MASK 0x0001 /* DSP2_ENA */ +#define WM8958_DSP2_ENA_SHIFT 0 /* DSP2_ENA */ +#define WM8958_DSP2_ENA_WIDTH 1 /* DSP2_ENA */ + +/* + * R2305 (0x901) - DSP2_Config + */ +#define WM8958_MBC_SEL_MASK 0x0030 /* MBC_SEL - [5:4] */ +#define WM8958_MBC_SEL_SHIFT 4 /* MBC_SEL - [5:4] */ +#define WM8958_MBC_SEL_WIDTH 2 /* MBC_SEL - [5:4] */ +#define WM8958_MBC_ENA 0x0001 /* MBC_ENA */ +#define WM8958_MBC_ENA_MASK 0x0001 /* MBC_ENA */ +#define WM8958_MBC_ENA_SHIFT 0 /* MBC_ENA */ +#define WM8958_MBC_ENA_WIDTH 1 /* MBC_ENA */ + +/* + * R2560 (0xA00) - DSP2_MagicNum + */ +#define WM8958_DSP2_MAGIC_NUM_MASK 0xFFFF /* DSP2_MAGIC_NUM - [15:0] */ +#define WM8958_DSP2_MAGIC_NUM_SHIFT 0 /* DSP2_MAGIC_NUM - [15:0] */ +#define WM8958_DSP2_MAGIC_NUM_WIDTH 16 /* DSP2_MAGIC_NUM - [15:0] */ + +/* + * R2561 (0xA01) - DSP2_ReleaseYear + */ +#define WM8958_DSP2_RELEASE_YEAR_MASK 0xFFFF /* DSP2_RELEASE_YEAR - [15:0] */ +#define WM8958_DSP2_RELEASE_YEAR_SHIFT 0 /* DSP2_RELEASE_YEAR - [15:0] */ +#define WM8958_DSP2_RELEASE_YEAR_WIDTH 16 /* DSP2_RELEASE_YEAR - [15:0] */ + +/* + * R2562 (0xA02) - DSP2_ReleaseMonthDay + */ +#define WM8958_DSP2_RELEASE_MONTH_MASK 0xFF00 /* DSP2_RELEASE_MONTH - [15:8] */ +#define WM8958_DSP2_RELEASE_MONTH_SHIFT 8 /* DSP2_RELEASE_MONTH - [15:8] */ +#define WM8958_DSP2_RELEASE_MONTH_WIDTH 8 /* DSP2_RELEASE_MONTH - [15:8] */ +#define WM8958_DSP2_RELEASE_DAY_MASK 0x00FF /* DSP2_RELEASE_DAY - [7:0] */ +#define WM8958_DSP2_RELEASE_DAY_SHIFT 0 /* DSP2_RELEASE_DAY - [7:0] */ +#define WM8958_DSP2_RELEASE_DAY_WIDTH 8 /* DSP2_RELEASE_DAY - [7:0] */ + +/* + * R2563 
(0xA03) - DSP2_ReleaseTime + */ +#define WM8958_DSP2_RELEASE_HOURS_MASK 0xFF00 /* DSP2_RELEASE_HOURS - [15:8] */ +#define WM8958_DSP2_RELEASE_HOURS_SHIFT 8 /* DSP2_RELEASE_HOURS - [15:8] */ +#define WM8958_DSP2_RELEASE_HOURS_WIDTH 8 /* DSP2_RELEASE_HOURS - [15:8] */ +#define WM8958_DSP2_RELEASE_MINS_MASK 0x00FF /* DSP2_RELEASE_MINS - [7:0] */ +#define WM8958_DSP2_RELEASE_MINS_SHIFT 0 /* DSP2_RELEASE_MINS - [7:0] */ +#define WM8958_DSP2_RELEASE_MINS_WIDTH 8 /* DSP2_RELEASE_MINS - [7:0] */ + +/* + * R2564 (0xA04) - DSP2_VerMajMin + */ +#define WM8958_DSP2_MAJOR_VER_MASK 0xFF00 /* DSP2_MAJOR_VER - [15:8] */ +#define WM8958_DSP2_MAJOR_VER_SHIFT 8 /* DSP2_MAJOR_VER - [15:8] */ +#define WM8958_DSP2_MAJOR_VER_WIDTH 8 /* DSP2_MAJOR_VER - [15:8] */ +#define WM8958_DSP2_MINOR_VER_MASK 0x00FF /* DSP2_MINOR_VER - [7:0] */ +#define WM8958_DSP2_MINOR_VER_SHIFT 0 /* DSP2_MINOR_VER - [7:0] */ +#define WM8958_DSP2_MINOR_VER_WIDTH 8 /* DSP2_MINOR_VER - [7:0] */ + +/* + * R2565 (0xA05) - DSP2_VerBuild + */ +#define WM8958_DSP2_BUILD_VER_MASK 0xFFFF /* DSP2_BUILD_VER - [15:0] */ +#define WM8958_DSP2_BUILD_VER_SHIFT 0 /* DSP2_BUILD_VER - [15:0] */ +#define WM8958_DSP2_BUILD_VER_WIDTH 16 /* DSP2_BUILD_VER - [15:0] */ + +/* + * R2573 (0xA0D) - DSP2_ExecControl + */ +#define WM8958_DSP2_STOPC 0x0020 /* DSP2_STOPC */ +#define WM8958_DSP2_STOPC_MASK 0x0020 /* DSP2_STOPC */ +#define WM8958_DSP2_STOPC_SHIFT 5 /* DSP2_STOPC */ +#define WM8958_DSP2_STOPC_WIDTH 1 /* DSP2_STOPC */ +#define WM8958_DSP2_STOPS 0x0010 /* DSP2_STOPS */ +#define WM8958_DSP2_STOPS_MASK 0x0010 /* DSP2_STOPS */ +#define WM8958_DSP2_STOPS_SHIFT 4 /* DSP2_STOPS */ +#define WM8958_DSP2_STOPS_WIDTH 1 /* DSP2_STOPS */ +#define WM8958_DSP2_STOPI 0x0008 /* DSP2_STOPI */ +#define WM8958_DSP2_STOPI_MASK 0x0008 /* DSP2_STOPI */ +#define WM8958_DSP2_STOPI_SHIFT 3 /* DSP2_STOPI */ +#define WM8958_DSP2_STOPI_WIDTH 1 /* DSP2_STOPI */ +#define WM8958_DSP2_STOP 0x0004 /* DSP2_STOP */ +#define WM8958_DSP2_STOP_MASK 0x0004 /* DSP2_STOP */ +#define WM8958_DSP2_STOP_SHIFT 2 /* DSP2_STOP */ +#define WM8958_DSP2_STOP_WIDTH 1 /* DSP2_STOP */ +#define WM8958_DSP2_RUNR 0x0002 /* DSP2_RUNR */ +#define WM8958_DSP2_RUNR_MASK 0x0002 /* DSP2_RUNR */ +#define WM8958_DSP2_RUNR_SHIFT 1 /* DSP2_RUNR */ +#define WM8958_DSP2_RUNR_WIDTH 1 /* DSP2_RUNR */ +#define WM8958_DSP2_RUN 0x0001 /* DSP2_RUN */ +#define WM8958_DSP2_RUN_MASK 0x0001 /* DSP2_RUN */ +#define WM8958_DSP2_RUN_SHIFT 0 /* DSP2_RUN */ +#define WM8958_DSP2_RUN_WIDTH 1 /* DSP2_RUN */ + #endif diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index fb0609315cd6..b30b2dd3f1f4 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -80,6 +80,8 @@ struct wm8994_priv { int dac_rates[2]; int lrclk_shared[2]; + int mbc_ena[3]; + /* Platform dependant DRC configuration */ const char **drc_texts; int drc_cfg[WM8994_NUM_DRC]; @@ -137,6 +139,7 @@ static int wm8994_volatile(unsigned int reg) case WM8994_RATE_STATUS: case WM8994_LDO_1: case WM8994_LDO_2: + case WM8958_DSP2_EXECCONTROL: return 1; default: return 0; @@ -520,6 +523,168 @@ static const struct soc_enum aif2dacl_src = static const struct soc_enum aif2dacr_src = SOC_ENUM_SINGLE(WM8994_AIF2_CONTROL_2, 14, 2, aif_chan_src_text); +static void wm8958_mbc_apply(struct snd_soc_codec *codec, int mbc, int start) +{ + struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + int pwr_reg = snd_soc_read(codec, WM8994_POWER_MANAGEMENT_5); + int ena, reg, aif; + + switch (mbc) { + case 0: + pwr_reg &= (WM8994_AIF1DAC1L_ENA | 
WM8994_AIF1DAC1R_ENA); + aif = 0; + break; + case 1: + pwr_reg &= (WM8994_AIF1DAC2L_ENA | WM8994_AIF1DAC2R_ENA); + aif = 0; + break; + case 2: + pwr_reg &= (WM8994_AIF2DACL_ENA | WM8994_AIF2DACR_ENA); + aif = 1; + break; + default: + BUG(); + return; + } + + /* We can only enable the MBC if the AIF is enabled and we + * want it to be enabled. */ + ena = pwr_reg && wm8994->mbc_ena[mbc]; + + reg = snd_soc_read(codec, WM8958_DSP2_PROGRAM); + + dev_dbg(codec->dev, "MBC %d startup: %d, power: %x, DSP: %x\n", + mbc, start, pwr_reg, reg); + + if (start && ena) { + /* If the DSP is already running then noop */ + if (reg & WM8958_DSP2_ENA) + return; + + /* Switch the clock over to the appropriate AIF */ + snd_soc_update_bits(codec, WM8994_CLOCKING_1, + WM8958_DSP2CLK_SRC | WM8958_DSP2CLK_ENA, + aif << WM8958_DSP2CLK_SRC_SHIFT | + WM8958_DSP2CLK_ENA); + + snd_soc_update_bits(codec, WM8958_DSP2_PROGRAM, + WM8958_DSP2_ENA, WM8958_DSP2_ENA); + + /* TODO: Apply any user specified MBC settings */ + + /* Run the DSP */ + snd_soc_write(codec, WM8958_DSP2_EXECCONTROL, + WM8958_DSP2_RUNR); + + /* And we're off! */ + snd_soc_update_bits(codec, WM8958_DSP2_CONFIG, + WM8958_MBC_ENA | WM8958_MBC_SEL_MASK, + mbc << WM8958_MBC_SEL_SHIFT | + WM8958_MBC_ENA); + } else { + /* If the DSP is already stopped then noop */ + if (!(reg & WM8958_DSP2_ENA)) + return; + + snd_soc_update_bits(codec, WM8958_DSP2_CONFIG, + WM8958_MBC_ENA, 0); + snd_soc_update_bits(codec, WM8958_DSP2_PROGRAM, + WM8958_DSP2_ENA, 0); + snd_soc_update_bits(codec, WM8994_CLOCKING_1, + WM8958_DSP2CLK_ENA, 0); + } +} + +static int wm8958_aif_ev(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_codec *codec = w->codec; + int mbc; + + switch (w->shift) { + case 13: + case 12: + mbc = 2; + break; + case 11: + case 10: + mbc = 1; + break; + case 9: + case 8: + mbc = 0; + break; + default: + BUG(); + return -EINVAL; + } + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + wm8958_mbc_apply(codec, mbc, 1); + break; + case SND_SOC_DAPM_POST_PMD: + wm8958_mbc_apply(codec, mbc, 0); + break; + } + + return 0; +} + +static int wm8958_mbc_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; + uinfo->count = 1; + uinfo->value.integer.min = 0; + uinfo->value.integer.max = 1; + return 0; +} + +static int wm8958_mbc_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + int mbc = kcontrol->private_value; + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + + ucontrol->value.integer.value[0] = wm8994->mbc_ena[mbc]; + + return 0; +} + +static int wm8958_mbc_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + int mbc = kcontrol->private_value; + int i; + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + + if (ucontrol->value.integer.value[0] > 1) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(wm8994->mbc_ena); i++) { + if (mbc != i && wm8994->mbc_ena[i]) { + dev_dbg(codec->dev, "MBC %d active already\n", mbc); + return -EBUSY; + } + } + + wm8994->mbc_ena[mbc] = ucontrol->value.integer.value[0]; + + wm8958_mbc_apply(codec, mbc, wm8994->mbc_ena[mbc]); + + return 0; +} + +#define WM8958_MBC_SWITCH(xname, xval) {\ + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname), \ + .access = SNDRV_CTL_ELEM_ACCESS_READWRITE,\ + .info = wm8958_mbc_info, \ + .get = 
wm8958_mbc_get, .put = wm8958_mbc_put, \ + .private_value = xval } + static const struct snd_kcontrol_new wm8994_snd_controls[] = { SOC_DOUBLE_R_TLV("AIF1ADC1 Volume", WM8994_AIF1_ADC1_LEFT_VOLUME, WM8994_AIF1_ADC1_RIGHT_VOLUME, @@ -649,6 +814,9 @@ SOC_SINGLE_TLV("AIF2 EQ5 Volume", WM8994_AIF2_EQ_GAINS_2, 6, 31, 0, static const struct snd_kcontrol_new wm8958_snd_controls[] = { SOC_SINGLE_TLV("AIF3 Boost Volume", WM8958_AIF3_CONTROL_2, 10, 3, 0, aif_tlv), +WM8958_MBC_SWITCH("AIF1DAC1 MBC Switch", 0), +WM8958_MBC_SWITCH("AIF1DAC2 MBC Switch", 1), +WM8958_MBC_SWITCH("AIF2DAC MBC Switch", 2), }; static int clk_sys_event(struct snd_soc_dapm_widget *w, @@ -1018,19 +1186,23 @@ SND_SOC_DAPM_AIF_OUT("AIF1ADC1L", "AIF1 Capture", 0, WM8994_POWER_MANAGEMENT_4, 9, 0), SND_SOC_DAPM_AIF_OUT("AIF1ADC1R", "AIF1 Capture", 0, WM8994_POWER_MANAGEMENT_4, 8, 0), -SND_SOC_DAPM_AIF_IN("AIF1DAC1L", NULL, 0, - WM8994_POWER_MANAGEMENT_5, 9, 0), -SND_SOC_DAPM_AIF_IN("AIF1DAC1R", NULL, 0, - WM8994_POWER_MANAGEMENT_5, 8, 0), +SND_SOC_DAPM_AIF_IN_E("AIF1DAC1L", NULL, 0, + WM8994_POWER_MANAGEMENT_5, 9, 0, wm8958_aif_ev, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), +SND_SOC_DAPM_AIF_IN_E("AIF1DAC1R", NULL, 0, + WM8994_POWER_MANAGEMENT_5, 8, 0, wm8958_aif_ev, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), SND_SOC_DAPM_AIF_OUT("AIF1ADC2L", "AIF1 Capture", 0, WM8994_POWER_MANAGEMENT_4, 11, 0), SND_SOC_DAPM_AIF_OUT("AIF1ADC2R", "AIF1 Capture", 0, WM8994_POWER_MANAGEMENT_4, 10, 0), -SND_SOC_DAPM_AIF_IN("AIF1DAC2L", NULL, 0, - WM8994_POWER_MANAGEMENT_5, 11, 0), -SND_SOC_DAPM_AIF_IN("AIF1DAC2R", NULL, 0, - WM8994_POWER_MANAGEMENT_5, 10, 0), +SND_SOC_DAPM_AIF_IN_E("AIF1DAC2L", NULL, 0, + WM8994_POWER_MANAGEMENT_5, 11, 0, wm8958_aif_ev, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), +SND_SOC_DAPM_AIF_IN_E("AIF1DAC2R", NULL, 0, + WM8994_POWER_MANAGEMENT_5, 10, 0, wm8958_aif_ev, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), SND_SOC_DAPM_MIXER("AIF1ADC1L Mixer", SND_SOC_NOPM, 0, 0, aif1adc1l_mix, ARRAY_SIZE(aif1adc1l_mix)), @@ -1059,10 +1231,12 @@ SND_SOC_DAPM_AIF_OUT("AIF2ADCL", NULL, 0, WM8994_POWER_MANAGEMENT_4, 13, 0), SND_SOC_DAPM_AIF_OUT("AIF2ADCR", NULL, 0, WM8994_POWER_MANAGEMENT_4, 12, 0), -SND_SOC_DAPM_AIF_IN("AIF2DACL", NULL, 0, - WM8994_POWER_MANAGEMENT_5, 13, 0), -SND_SOC_DAPM_AIF_IN("AIF2DACR", NULL, 0, - WM8994_POWER_MANAGEMENT_5, 12, 0), +SND_SOC_DAPM_AIF_IN_E("AIF2DACL", NULL, 0, + WM8994_POWER_MANAGEMENT_5, 13, 0, wm8958_aif_ev, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), +SND_SOC_DAPM_AIF_IN_E("AIF2DACR", NULL, 0, + WM8994_POWER_MANAGEMENT_5, 12, 0, wm8958_aif_ev, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), SND_SOC_DAPM_AIF_IN("AIF1DACDAT", "AIF1 Playback", 0, SND_SOC_NOPM, 0, 0), SND_SOC_DAPM_AIF_IN("AIF2DACDAT", "AIF2 Playback", 0, SND_SOC_NOPM, 0, 0), -- cgit v1.2.3-71-gd317 From 821edd2fb5b289b84d715fb744106019fa2e1920 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 26 Nov 2010 15:21:09 +0000 Subject: ASoC: Add WM8958 microphone detection support The WM8958 contains an advanced accessory detection feature which allows detection of up to seven different impedance levels on the microphone bias output, including detection of video outputs. Since some of the more involved accessory interfaces may involve noticeable interactions with external components, a simple detection scheme is provided by default with the option to provide custom handling of accessory detect. 
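As a usage illustration, a machine driver would typically create a jack and hand it to the new wm8958_mic_detect() API added by this patch; passing a NULL callback selects the simple default handler. The sketch below is hedged: example_jack_init() and headset_jack are made up for illustration, and it assumes the codec-based snd_soc_jack_new() machine-driver API of this kernel generation together with the jack bits from <sound/jack.h>:

	static struct snd_soc_jack headset_jack;

	static int example_jack_init(struct snd_soc_codec *codec)
	{
		int ret;

		/* Report mic, video out and a couple of buttons on one jack */
		ret = snd_soc_jack_new(codec, "Headset",
				       SND_JACK_HEADSET | SND_JACK_VIDEOOUT |
				       SND_JACK_BTN_0 | SND_JACK_BTN_1,
				       &headset_jack);
		if (ret)
			return ret;

		/* NULL callback => use the default detection handler */
		return wm8958_mic_detect(codec, &headset_jack, NULL, NULL);
	}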
Signed-off-by: Mark Brown Acked-by: Liam Girdwood --- include/linux/mfd/wm8994/registers.h | 43 ++++++++++ sound/soc/codecs/wm8994.c | 150 +++++++++++++++++++++++++++++++++++ sound/soc/codecs/wm8994.h | 4 + 3 files changed, 197 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h index 423b2b5c94ea..a610c8746436 100644 --- a/include/linux/mfd/wm8994/registers.h +++ b/include/linux/mfd/wm8994/registers.h @@ -70,6 +70,9 @@ #define WM8994_DC_SERVO_4 0x57 #define WM8994_DC_SERVO_READBACK 0x58 #define WM8994_ANALOGUE_HP_1 0x60 +#define WM8958_MIC_DETECT_1 0xD0 +#define WM8958_MIC_DETECT_2 0xD1 +#define WM8958_MIC_DETECT_3 0xD2 #define WM8994_CHIP_REVISION 0x100 #define WM8994_CONTROL_INTERFACE 0x101 #define WM8994_WRITE_SEQUENCER_CTRL_1 0x110 @@ -1970,6 +1973,46 @@ #define WM8994_HPOUT1R_DLY_SHIFT 1 /* HPOUT1R_DLY */ #define WM8994_HPOUT1R_DLY_WIDTH 1 /* HPOUT1R_DLY */ +/* + * R208 (0xD0) - Mic Detect 1 + */ +#define WM8958_MICD_BIAS_STARTTIME_MASK 0xF000 /* MICD_BIAS_STARTTIME - [15:12] */ +#define WM8958_MICD_BIAS_STARTTIME_SHIFT 12 /* MICD_BIAS_STARTTIME - [15:12] */ +#define WM8958_MICD_BIAS_STARTTIME_WIDTH 4 /* MICD_BIAS_STARTTIME - [15:12] */ +#define WM8958_MICD_RATE_MASK 0x0F00 /* MICD_RATE - [11:8] */ +#define WM8958_MICD_RATE_SHIFT 8 /* MICD_RATE - [11:8] */ +#define WM8958_MICD_RATE_WIDTH 4 /* MICD_RATE - [11:8] */ +#define WM8958_MICD_DBTIME 0x0002 /* MICD_DBTIME */ +#define WM8958_MICD_DBTIME_MASK 0x0002 /* MICD_DBTIME */ +#define WM8958_MICD_DBTIME_SHIFT 1 /* MICD_DBTIME */ +#define WM8958_MICD_DBTIME_WIDTH 1 /* MICD_DBTIME */ +#define WM8958_MICD_ENA 0x0001 /* MICD_ENA */ +#define WM8958_MICD_ENA_MASK 0x0001 /* MICD_ENA */ +#define WM8958_MICD_ENA_SHIFT 0 /* MICD_ENA */ +#define WM8958_MICD_ENA_WIDTH 1 /* MICD_ENA */ + +/* + * R209 (0xD1) - Mic Detect 2 + */ +#define WM8958_MICD_LVL_SEL_MASK 0x00FF /* MICD_LVL_SEL - [7:0] */ +#define WM8958_MICD_LVL_SEL_SHIFT 0 /* MICD_LVL_SEL - [7:0] */ +#define WM8958_MICD_LVL_SEL_WIDTH 8 /* MICD_LVL_SEL - [7:0] */ + +/* + * R210 (0xD2) - Mic Detect 3 + */ +#define WM8958_MICD_LVL_MASK 0x07FC /* MICD_LVL - [10:2] */ +#define WM8958_MICD_LVL_SHIFT 2 /* MICD_LVL - [10:2] */ +#define WM8958_MICD_LVL_WIDTH 9 /* MICD_LVL - [10:2] */ +#define WM8958_MICD_VALID 0x0002 /* MICD_VALID */ +#define WM8958_MICD_VALID_MASK 0x0002 /* MICD_VALID */ +#define WM8958_MICD_VALID_SHIFT 1 /* MICD_VALID */ +#define WM8958_MICD_VALID_WIDTH 1 /* MICD_VALID */ +#define WM8958_MICD_STS 0x0001 /* MICD_STS */ +#define WM8958_MICD_STS_MASK 0x0001 /* MICD_STS */ +#define WM8958_MICD_STS_SHIFT 0 /* MICD_STS */ +#define WM8958_MICD_STS_WIDTH 1 /* MICD_STS */ + /* * R256 (0x100) - Chip Revision */ diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index b30b2dd3f1f4..948677b581b1 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +96,11 @@ struct wm8994_priv { struct wm8994_micdet micdet[2]; + wm8958_micdet_cb jack_cb; + void *jack_cb_data; + bool jack_is_mic; + bool jack_is_video; + int revision; struct wm8994_pdata *pdata; }; @@ -140,6 +146,7 @@ static int wm8994_volatile(unsigned int reg) case WM8994_LDO_1: case WM8994_LDO_2: case WM8958_DSP2_EXECCONTROL: + case WM8958_MIC_DETECT_3: return 1; default: return 0; @@ -2633,6 +2640,133 @@ static irqreturn_t wm8994_mic_irq(int irq, void *data) return IRQ_HANDLED; } +/* Default microphone detection handler for WM8958 - the user 
can + * override this if they wish. + */ +static void wm8958_default_micdet(u16 status, void *data) +{ + struct snd_soc_codec *codec = data; + struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + int report = 0; + + /* If nothing present then clear our statuses */ + if (!(status & WM8958_MICD_STS)) { + wm8994->jack_is_video = false; + wm8994->jack_is_mic = false; + goto done; + } + + /* Assume anything over 475 ohms is a microphone and remember + * that we've seen one (since buttons override it) */ + if (status & 0x600) + wm8994->jack_is_mic = true; + if (wm8994->jack_is_mic) + report |= SND_JACK_MICROPHONE; + + /* Video has an impedence of approximately 75 ohms; assume + * this isn't used as a button and remember it since buttons + * override it. */ + if (status & 0x40) + wm8994->jack_is_video = true; + if (wm8994->jack_is_video) + report |= SND_JACK_VIDEOOUT; + + /* Everything else is buttons; just assign slots */ + if (status & 0x4) + report |= SND_JACK_BTN_0; + if (status & 0x8) + report |= SND_JACK_BTN_1; + if (status & 0x10) + report |= SND_JACK_BTN_2; + if (status & 0x20) + report |= SND_JACK_BTN_3; + if (status & 0x80) + report |= SND_JACK_BTN_4; + if (status & 0x100) + report |= SND_JACK_BTN_5; + +done: + snd_soc_jack_report(wm8994->micdet[0].jack, + SND_JACK_BTN_0 | SND_JACK_BTN_1 | SND_JACK_BTN_2 | + SND_JACK_BTN_3 | SND_JACK_BTN_4 | SND_JACK_BTN_5 | + SND_JACK_MICROPHONE | SND_JACK_VIDEOOUT, + report); +} + +/** + * wm8958_mic_detect - Enable microphone detection via the WM8958 IRQ + * + * @codec: WM8958 codec + * @jack: jack to report detection events on + * + * Enable microphone detection functionality for the WM8958. By + * default simple detection which supports the detection of up to 6 + * buttons plus video and microphone functionality is supported. + * + * The WM8958 has an advanced jack detection facility which is able to + * support complex accessory detection, especially when used in + * conjunction with external circuitry. In order to provide maximum + * flexiblity a callback is provided which allows a completely custom + * detection algorithm. 
+ */ +int wm8958_mic_detect(struct snd_soc_codec *codec, struct snd_soc_jack *jack, + wm8958_micdet_cb cb, void *cb_data) +{ + struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + struct wm8994 *control = codec->control_data; + + if (control->type != WM8958) + return -EINVAL; + + if (jack) { + if (!cb) { + dev_dbg(codec->dev, "Using default micdet callback\n"); + cb = wm8958_default_micdet; + cb_data = codec; + } + + wm8994->micdet[0].jack = jack; + wm8994->jack_cb = cb; + wm8994->jack_cb_data = cb_data; + + snd_soc_update_bits(codec, WM8958_MIC_DETECT_1, + WM8958_MICD_ENA, WM8958_MICD_ENA); + } else { + snd_soc_update_bits(codec, WM8958_MIC_DETECT_1, + WM8958_MICD_ENA, 0); + } + + return 0; +} +EXPORT_SYMBOL_GPL(wm8958_mic_detect); + +static irqreturn_t wm8958_mic_irq(int irq, void *data) +{ + struct wm8994_priv *wm8994 = data; + struct snd_soc_codec *codec = wm8994->codec; + int reg; + + reg = snd_soc_read(codec, WM8958_MIC_DETECT_3); + if (reg < 0) { + dev_err(codec->dev, "Failed to read mic detect status: %d\n", + reg); + return IRQ_NONE; + } + + if (!(reg & WM8958_MICD_VALID)) { + dev_dbg(codec->dev, "Mic detect data not valid\n"); + goto out; + } + + if (wm8994->jack_cb) + wm8994->jack_cb(reg, wm8994->jack_cb_data); + else + dev_warn(codec->dev, "Accessory detection with no callback\n"); + +out: + return IRQ_HANDLED; +} + static int wm8994_codec_probe(struct snd_soc_codec *codec) { struct wm8994 *control; @@ -2732,6 +2866,17 @@ static int wm8994_codec_probe(struct snd_soc_codec *codec) "Failed to request Mic2 short IRQ: %d\n", ret); break; + + case WM8958: + ret = wm8994_request_irq(codec->control_data, + WM8994_IRQ_MIC1_DET, + wm8958_mic_irq, "Mic detect", + wm8994); + if (ret != 0) + dev_warn(codec->dev, + "Failed to request Mic detect IRQ: %d\n", + ret); + break; } /* Remember if AIFnLRCLK is configured as a GPIO. This should be @@ -2866,6 +3011,11 @@ static int wm8994_codec_remove(struct snd_soc_codec *codec) wm8994_free_irq(codec->control_data, WM8994_IRQ_MIC1_DET, wm8994); break; + + case WM8958: + wm8994_free_irq(codec->control_data, WM8994_IRQ_MIC1_DET, + wm8994); + break; } kfree(wm8994->retune_mobile_texts); kfree(wm8994->drc_texts); diff --git a/sound/soc/codecs/wm8994.h b/sound/soc/codecs/wm8994.h index b8b3166e1f03..455cae6b4356 100644 --- a/sound/soc/codecs/wm8994.h +++ b/sound/soc/codecs/wm8994.h @@ -28,8 +28,12 @@ #define WM8994_FLL_SRC_LRCLK 3 #define WM8994_FLL_SRC_BCLK 4 +typedef void (*wm8958_micdet_cb)(u16 status, void *data); + int wm8994_mic_detect(struct snd_soc_codec *codec, struct snd_soc_jack *jack, int micbias, int det, int shrt); +int wm8958_mic_detect(struct snd_soc_codec *codec, struct snd_soc_jack *jack, + wm8958_micdet_cb cb, void *cb_data); #define WM8994_CACHE_SIZE 1570 -- cgit v1.2.3-71-gd317 From 85beb5869a4f6abb52a7cf8e01de6fa57e9ee47d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 24 Nov 2010 16:23:34 -0500 Subject: tracing/slab: Move kmalloc tracepoint out of inline code The tracepoint for kmalloc is in the slab inlined code which causes every instance of kmalloc to have the tracepoint. This patch moves the tracepoint out of the inline code to the slab C file, which removes a large number of inlined trace points. 
objdump -dr vmlinux.slab| grep 'jmpq.* Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 33 +++++++++++++-------------------- mm/slab.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 791a502f6906..83203ae9390b 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -138,11 +138,12 @@ void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags); +extern void *kmem_cache_alloc_trace(size_t size, + struct kmem_cache *cachep, gfp_t flags); extern size_t slab_buffer_size(struct kmem_cache *cachep); #else static __always_inline void * -kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) { return kmem_cache_alloc(cachep, flags); } @@ -179,10 +180,7 @@ found: #endif cachep = malloc_sizes[i].cs_cachep; - ret = kmem_cache_alloc_notrace(cachep, flags); - - trace_kmalloc(_THIS_IP_, ret, - size, slab_buffer_size(cachep), flags); + ret = kmem_cache_alloc_trace(size, cachep, flags); return ret; } @@ -194,14 +192,16 @@ extern void *__kmalloc_node(size_t size, gfp_t flags, int node); extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid); +extern void *kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, + gfp_t flags, + int nodeid); #else static __always_inline void * -kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid) +kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, + gfp_t flags, + int nodeid) { return kmem_cache_alloc_node(cachep, flags, nodeid); } @@ -210,7 +210,6 @@ kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *cachep; - void *ret; if (__builtin_constant_p(size)) { int i = 0; @@ -234,13 +233,7 @@ found: #endif cachep = malloc_sizes[i].cs_cachep; - ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - - trace_kmalloc_node(_THIS_IP_, ret, - size, slab_buffer_size(cachep), - flags, node); - - return ret; + return kmem_cache_alloc_node_trace(size, cachep, flags, node); } return __kmalloc_node(size, flags, node); } diff --git a/mm/slab.c b/mm/slab.c index b1e40dafbab3..dfcc8885d7d5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3653,11 +3653,18 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +void * +kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret; + + ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + trace_kmalloc(_RET_IP_, ret, + size, slab_buffer_size(cachep), flags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_trace); #endif /** @@ -3705,31 +3712,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, - gfp_t flags, - 
int nodeid) +void *kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, + gfp_t flags, + int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, + void *ret; + + ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); + trace_kmalloc_node(_RET_IP_, ret, + size, slab_buffer_size(cachep), + flags, nodeid); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; - void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - - trace_kmalloc_node((unsigned long) caller, ret, - size, cachep->buffer_size, flags, node); - - return ret; + return kmem_cache_alloc_node_trace(size, cachep, flags, node); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) -- cgit v1.2.3-71-gd317 From ce6ada35bdf710d16582cc4869c26722547e6f11 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 25 Nov 2010 17:11:32 +0000 Subject: security: Define CAP_SYSLOG Privileged syslog operations currently require CAP_SYS_ADMIN. Split this off into a new CAP_SYSLOG privilege which we can sanely take away from a container through the capability bounding set. With this patch, an lxc container can be prevented from messing with the host's syslog (i.e. dmesg -c). Changelog: mar 12 2010: add selinux capability2:cap_syslog perm Changelog: nov 22 2010: . port to new kernel . add a WARN_ONCE if userspace isn't using CAP_SYSLOG Signed-off-by: Serge Hallyn Acked-by: Andrew G. Morgan Acked-By: Kees Cook Cc: James Morris Cc: Michael Kerrisk Cc: Stephen Smalley Cc: "Christopher J. 
PeBenito" Cc: Eric Paris Signed-off-by: James Morris --- include/linux/capability.h | 7 +++++-- kernel/printk.c | 8 +++++++- security/selinux/include/classmap.h | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 90012b9ddbf3..fb16a3699b99 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -246,7 +246,6 @@ struct cpu_vfs_cap_data { /* Allow configuration of the secure attention key */ /* Allow administration of the random device */ /* Allow examination and configuration of disk quotas */ -/* Allow configuring the kernel's syslog (printk behaviour) */ /* Allow setting the domainname */ /* Allow setting the hostname */ /* Allow calling bdflush() */ @@ -352,7 +351,11 @@ struct cpu_vfs_cap_data { #define CAP_MAC_ADMIN 33 -#define CAP_LAST_CAP CAP_MAC_ADMIN +/* Allow configuring the kernel's syslog (printk behaviour) */ + +#define CAP_SYSLOG 34 + +#define CAP_LAST_CAP CAP_SYSLOG #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/kernel/printk.c b/kernel/printk.c index 9a2264fc42ca..0712380737b3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -283,8 +283,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) return -EPERM; if ((type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYSLOG)) { + /* remove after 2.6.38 */ + if (capable(CAP_SYS_ADMIN)) + WARN_ONCE(1, "Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated and denied).\n"); return -EPERM; + } } error = security_syslog(type); diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 8858d2b2d4b6..7ed3663332ec 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -142,7 +142,7 @@ struct security_class_mapping secclass_map[] = { "node_bind", "name_connect", NULL } }, { "memprotect", { "mmap_zero", NULL } }, { "peer", { "recv", NULL } }, - { "capability2", { "mac_override", "mac_admin", NULL } }, + { "capability2", { "mac_override", "mac_admin", "syslog", NULL } }, { "kernel_service", { "use_as_override", "create_files_as", NULL } }, { "tun_socket", { COMMON_SOCK_PERMS, NULL } }, -- cgit v1.2.3-71-gd317 From dc88e46029486ed475c71fe1bb696d39511ac8fe Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 23 Nov 2010 17:50:31 -0500 Subject: lib: hex2bin converts ascii hexadecimal string to binary Similar to the kgdb_hex2mem() code, hex2bin converts a string to binary using the hex_to_bin() library call. Changelog: - Replace parameter names with src/dst (based on David Howell's comment) - Add 'const' where needed (based on David Howell's comment) - Replace int with size_t (based on David Howell's comment) Signed-off-by: Mimi Zohar Acked-by: Serge E. 
Hallyn Acked-by: David Howells Signed-off-by: James Morris --- include/linux/kernel.h | 1 + lib/hexdump.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a35b4f7332f0..d0fbc043de60 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -265,6 +265,7 @@ static inline char *pack_hex_byte(char *buf, u8 byte) } extern int hex_to_bin(char ch); +extern void hex2bin(u8 *dst, const char *src, size_t count); /* * General tracing related utility functions - trace_printk(), diff --git a/lib/hexdump.c b/lib/hexdump.c index 5d7a4802c562..b66b2bd67952 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -33,6 +33,22 @@ int hex_to_bin(char ch) } EXPORT_SYMBOL(hex_to_bin); +/** + * hex2bin - convert an ascii hexadecimal string to its binary representation + * @dst: binary result + * @src: ascii hexadecimal string + * @count: result length + */ +void hex2bin(u8 *dst, const char *src, size_t count) +{ + while (count--) { + *dst = hex_to_bin(*src++) << 4; + *dst += hex_to_bin(*src++); + dst++; + } +} +EXPORT_SYMBOL(hex2bin); + /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump -- cgit v1.2.3-71-gd317 From c749ba912e87ccebd674ae24b97462176c63732e Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 23 Nov 2010 18:54:16 -0500 Subject: key: add tpm_send command Add internal kernel tpm_send() command used to seal/unseal keys. Changelog: - replaced module_put in tpm_send() with new tpm_chip_put() wrapper (suggested by David Howells) - Make tpm_send() cmd argument a 'void *' (suggested by David Howells) Signed-off-by: David Safford Signed-off-by: Mimi Zohar Acked-by: David Howells Acked-by: Serge E. Hallyn Signed-off-by: James Morris --- drivers/char/tpm/tpm.c | 16 ++++++++++++++++ include/linux/tpm.h | 4 ++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index 26c09f3b4a74..068bac858b4a 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -780,6 +780,22 @@ int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash) } EXPORT_SYMBOL_GPL(tpm_pcr_extend); +int tpm_send(u32 chip_num, void *cmd, size_t buflen) +{ + struct tpm_chip *chip; + int rc; + + chip = tpm_chip_find_get(chip_num); + if (chip == NULL) + return -ENODEV; + + rc = transmit_cmd(chip, cmd, buflen, "attempting tpm_cmd"); + + tpm_chip_put(chip); + return rc; +} +EXPORT_SYMBOL_GPL(tpm_send); + ssize_t tpm_show_pcrs(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/include/linux/tpm.h b/include/linux/tpm.h index ac5d1c1285d9..fdc718abf83b 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -31,6 +31,7 @@ extern int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf); extern int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash); +extern int tpm_send(u32 chip_num, void *cmd, size_t buflen); #else static inline int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf) { return -ENODEV; @@ -38,5 +39,8 @@ static inline int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf) { static inline int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash) { return -ENODEV; } +static inline int tpm_send(u32 chip_num, void *cmd, size_t buflen) { + return -ENODEV; +} #endif #endif -- cgit v1.2.3-71-gd317 From d00a1c72f7f4661212299e6cb132dfa58030bcdb Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 23 Nov 2010 17:50:34 -0500 Subject: keys: add new trusted key-type 
Define a new kernel key-type called 'trusted'. Trusted keys are random number symmetric keys, generated and RSA-sealed by the TPM. The TPM only unseals the keys, if the boot PCRs and other criteria match. Userspace can only ever see encrypted blobs. Based on suggestions by Jason Gunthorpe, several new options have been added to support additional usages. The new options are: migratable= designates that the key may/may not ever be updated (resealed under a new key, new pcrinfo or new auth.) pcrlock=n extends the designated PCR 'n' with a random value, so that a key sealed to that PCR may not be unsealed again until after a reboot. keyhandle= specifies the sealing/unsealing key handle. keyauth= specifies the sealing/unsealing key auth. blobauth= specifies the sealed data auth. Implementation of a kernel reserved locality for trusted keys will be investigated for a possible future extension. Changelog: - Updated and added examples to Documentation/keys-trusted-encrypted.txt - Moved generic TPM constants to include/linux/tpm_command.h (David Howell's suggestion.) - trusted_defined.c: replaced kzalloc with kmalloc, added pcrlock failure error handling, added const qualifiers where appropriate. - moved to late_initcall - updated from hash to shash (suggestion by David Howells) - reduced worst stack usage (tpm_seal) from 530 to 312 bytes - moved documentation to Documentation directory (suggestion by David Howells) - all the other code cleanups suggested by David Howells - Add pcrlock CAP_SYS_ADMIN dependency (based on comment by Jason Gunthorpe) - New options: migratable, pcrlock, keyhandle, keyauth, blobauth (based on discussions with Jason Gunthorpe) - Free payload on failure to create key(reported/fixed by Roberto Sassu) - Updated Kconfig and other descriptions (based on Serge Hallyn's suggestion) - Replaced kzalloc() with kmalloc() (reported by Serge Hallyn) Signed-off-by: David Safford Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- Documentation/keys-trusted-encrypted.txt | 145 ++++ include/keys/trusted-type.h | 31 + include/linux/tpm_command.h | 28 + security/Kconfig | 15 + security/keys/Makefile | 1 + security/keys/trusted_defined.c | 1151 ++++++++++++++++++++++++++++++ security/keys/trusted_defined.h | 134 ++++ 7 files changed, 1505 insertions(+) create mode 100644 Documentation/keys-trusted-encrypted.txt create mode 100644 include/keys/trusted-type.h create mode 100644 include/linux/tpm_command.h create mode 100644 security/keys/trusted_defined.c create mode 100644 security/keys/trusted_defined.h (limited to 'include/linux') diff --git a/Documentation/keys-trusted-encrypted.txt b/Documentation/keys-trusted-encrypted.txt new file mode 100644 index 000000000000..8fb79bc1ac4b --- /dev/null +++ b/Documentation/keys-trusted-encrypted.txt @@ -0,0 +1,145 @@ + Trusted and Encrypted Keys + +Trusted and Encrypted Keys are two new key types added to the existing kernel +key ring service. Both of these new types are variable length symmetic keys, +and in both cases all keys are created in the kernel, and user space sees, +stores, and loads only encrypted blobs. Trusted Keys require the availability +of a Trusted Platform Module (TPM) chip for greater security, while Encrypted +Keys can be used on any system. All user level blobs, are displayed and loaded +in hex ascii for convenience, and are integrity verified. + +Trusted Keys use a TPM both to generate and to seal the keys. 
Keys are sealed +under a 2048 bit RSA key in the TPM, and optionally sealed to specified PCR +(integrity measurement) values, and only unsealed by the TPM, if PCRs and blob +integrity verifications match. A loaded Trusted Key can be updated with new +(future) PCR values, so keys are easily migrated to new pcr values, such as +when the kernel and initramfs are updated. The same key can have many saved +blobs under different PCR values, so multiple boots are easily supported. + +By default, trusted keys are sealed under the SRK, which has the default +authorization value (20 zeros). This can be set at takeownership time with the +trouser's utility: "tpm_takeownership -u -z". + +Usage: + keyctl add trusted name "new keylen [options]" ring + keyctl add trusted name "load hex_blob [pcrlock=pcrnum]" ring + keyctl update key "update [options]" + keyctl print keyid + + options: + keyhandle= ascii hex value of sealing key default 0x40000000 (SRK) + keyauth= ascii hex auth for sealing key default 0x00...i + (40 ascii zeros) + blobauth= ascii hex auth for sealed data default 0x00... + (40 ascii zeros) + blobauth= ascii hex auth for sealed data default 0x00... + (40 ascii zeros) + pcrinfo= ascii hex of PCR_INFO or PCR_INFO_LONG (no default) + pcrlock= pcr number to be extended to "lock" blob + migratable= 0|1 indicating permission to reseal to new PCR values, + default 1 (resealing allowed) + +"keyctl print" returns an ascii hex copy of the sealed key, which is in standard +TPM_STORED_DATA format. The key length for new keys are always in bytes. +Trusted Keys can be 32 - 128 bytes (256 - 1024 bits), the upper limit is to fit +within the 2048 bit SRK (RSA) keylength, with all necessary structure/padding. + +Encrypted keys do not depend on a TPM, and are faster, as they use AES for +encryption/decryption. New keys are created from kernel generated random +numbers, and are encrypted/decrypted using a specified 'master' key. The +'master' key can either be a trusted-key or user-key type. The main +disadvantage of encrypted keys is that if they are not rooted in a trusted key, +they are only as secure as the user key encrypting them. The master user key +should therefore be loaded in as secure a way as possible, preferably early in +boot. + +Usage: + keyctl add encrypted name "new key-type:master-key-name keylen" ring + keyctl add encrypted name "load hex_blob" ring + keyctl update keyid "update key-type:master-key-name" + +where 'key-type' is either 'trusted' or 'user'. 
+ +Examples of trusted and encrypted key usage: + +Create and save a trusted key named "kmk" of length 32 bytes: + + $ keyctl add trusted kmk "new 32" @u + 440502848 + + $ keyctl show + Session Keyring + -3 --alswrv 500 500 keyring: _ses + 97833714 --alswrv 500 -1 \_ keyring: _uid.500 + 440502848 --alswrv 500 500 \_ trusted: kmk + + $ keyctl print 440502848 + 0101000000000000000001005d01b7e3f4a6be5709930f3b70a743cbb42e0cc95e18e915 + 3f60da455bbf1144ad12e4f92b452f966929f6105fd29ca28e4d4d5a031d068478bacb0b + 27351119f822911b0a11ba3d3498ba6a32e50dac7f32894dd890eb9ad578e4e292c83722 + a52e56a097e6a68b3f56f7a52ece0cdccba1eb62cad7d817f6dc58898b3ac15f36026fec + d568bd4a706cb60bb37be6d8f1240661199d640b66fb0fe3b079f97f450b9ef9c22c6d5d + dd379f0facd1cd020281dfa3c70ba21a3fa6fc2471dc6d13ecf8298b946f65345faa5ef0 + f1f8fff03ad0acb083725535636addb08d73dedb9832da198081e5deae84bfaf0409c22b + e4a8aea2b607ec96931e6f4d4fe563ba + + $ keyctl pipe 440502848 > kmk.blob + +Load a trusted key from the saved blob: + + $ keyctl add trusted kmk "load `cat kmk.blob`" @u + 268728824 + + $ keyctl print 268728824 + 0101000000000000000001005d01b7e3f4a6be5709930f3b70a743cbb42e0cc95e18e915 + 3f60da455bbf1144ad12e4f92b452f966929f6105fd29ca28e4d4d5a031d068478bacb0b + 27351119f822911b0a11ba3d3498ba6a32e50dac7f32894dd890eb9ad578e4e292c83722 + a52e56a097e6a68b3f56f7a52ece0cdccba1eb62cad7d817f6dc58898b3ac15f36026fec + d568bd4a706cb60bb37be6d8f1240661199d640b66fb0fe3b079f97f450b9ef9c22c6d5d + dd379f0facd1cd020281dfa3c70ba21a3fa6fc2471dc6d13ecf8298b946f65345faa5ef0 + f1f8fff03ad0acb083725535636addb08d73dedb9832da198081e5deae84bfaf0409c22b + e4a8aea2b607ec96931e6f4d4fe563ba + +Reseal a trusted key under new pcr values: + + $ keyctl update 268728824 "update pcrinfo=`cat pcr.blob`" + $ keyctl print 268728824 + 010100000000002c0002800093c35a09b70fff26e7a98ae786c641e678ec6ffb6b46d805 + 77c8a6377aed9d3219c6dfec4b23ffe3000001005d37d472ac8a44023fbb3d18583a4f73 + d3a076c0858f6f1dcaa39ea0f119911ff03f5406df4f7f27f41da8d7194f45c9f4e00f2e + df449f266253aa3f52e55c53de147773e00f0f9aca86c64d94c95382265968c354c5eab4 + 9638c5ae99c89de1e0997242edfb0b501744e11ff9762dfd951cffd93227cc513384e7e6 + e782c29435c7ec2edafaa2f4c1fe6e7a781b59549ff5296371b42133777dcc5b8b971610 + 94bc67ede19e43ddb9dc2baacad374a36feaf0314d700af0a65c164b7082401740e489c9 + 7ef6a24defe4846104209bf0c3eced7fa1a672ed5b125fc9d8cd88b476a658a4434644ef + df8ae9a178e9f83ba9f08d10fa47e4226b98b0702f06b3b8 + +Create and save an encrypted key "evm" using the above trusted key "kmk": + + $ keyctl add encrypted evm "new trusted:kmk 32" @u + 159771175 + + $ keyctl print 159771175 + trusted:kmk 32 2375725ad57798846a9bbd240de8906f006e66c03af53b1b382dbbc55 + be2a44616e4959430436dc4f2a7a9659aa60bb4652aeb2120f149ed197c564e024717c64 + 5972dcb82ab2dde83376d82b2e3c09ffc + + $ keyctl pipe 159771175 > evm.blob + +Load an encrypted key "evm" from saved blob: + + $ keyctl add encrypted evm "load `cat evm.blob`" @u + 831684262 + + $ keyctl print 831684262 + trusted:kmk 32 2375725ad57798846a9bbd240de8906f006e66c03af53b1b382dbbc55 + be2a44616e4959430436dc4f2a7a9659aa60bb4652aeb2120f149ed197c564e024717c64 + 5972dcb82ab2dde83376d82b2e3c09ffc + + +The initial consumer of trusted keys is EVM, which at boot time needs a high +quality symmetric key for HMAC protection of file metadata. The use of a +trusted key provides strong guarantees that the EVM key has not been +compromised by a user level problem, and when sealed to specific boot PCR +values, protects against boot and offline attacks. 
Other uses for trusted and +encrypted keys, such as for disk and file encryption are anticipated. diff --git a/include/keys/trusted-type.h b/include/keys/trusted-type.h new file mode 100644 index 000000000000..56f82e5c9975 --- /dev/null +++ b/include/keys/trusted-type.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2010 IBM Corporation + * Author: David Safford + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. + */ + +#ifndef _KEYS_TRUSTED_TYPE_H +#define _KEYS_TRUSTED_TYPE_H + +#include +#include + +#define MIN_KEY_SIZE 32 +#define MAX_KEY_SIZE 128 +#define MAX_BLOB_SIZE 320 + +struct trusted_key_payload { + struct rcu_head rcu; + unsigned int key_len; + unsigned int blob_len; + unsigned char migratable; + unsigned char key[MAX_KEY_SIZE + 1]; + unsigned char blob[MAX_BLOB_SIZE]; +}; + +extern struct key_type key_type_trusted; + +#endif /* _KEYS_TRUSTED_TYPE_H */ diff --git a/include/linux/tpm_command.h b/include/linux/tpm_command.h new file mode 100644 index 000000000000..727512e249b5 --- /dev/null +++ b/include/linux/tpm_command.h @@ -0,0 +1,28 @@ +#ifndef __LINUX_TPM_COMMAND_H__ +#define __LINUX_TPM_COMMAND_H__ + +/* + * TPM Command constants from specifications at + * http://www.trustedcomputinggroup.org + */ + +/* Command TAGS */ +#define TPM_TAG_RQU_COMMAND 193 +#define TPM_TAG_RQU_AUTH1_COMMAND 194 +#define TPM_TAG_RQU_AUTH2_COMMAND 195 +#define TPM_TAG_RSP_COMMAND 196 +#define TPM_TAG_RSP_AUTH1_COMMAND 197 +#define TPM_TAG_RSP_AUTH2_COMMAND 198 + +/* Command Ordinals */ +#define TPM_ORD_GETRANDOM 70 +#define TPM_ORD_OSAP 11 +#define TPM_ORD_OIAP 10 +#define TPM_ORD_SEAL 23 +#define TPM_ORD_UNSEAL 24 + +/* Other constants */ +#define SRKHANDLE 0x40000000 +#define TPM_NONCE_SIZE 20 + +#endif diff --git a/security/Kconfig b/security/Kconfig index e80da955e687..24b8f9b491b8 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -21,6 +21,21 @@ config KEYS If you are unsure as to whether this is required, answer N. +config TRUSTED_KEYS + tristate "TRUSTED KEYS" + depends on KEYS && TCG_TPM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA1 + help + This option provides support for creating, sealing, and unsealing + keys in the kernel. Trusted keys are random number symmetric keys, + generated and RSA-sealed by the TPM. The TPM only unseals the keys, + if the boot PCRs and other criteria match. Userspace will only ever + see encrypted blobs. + + If you are unsure as to whether this is required, answer N. 
+ config KEYS_DEBUG_PROC_KEYS bool "Enable the /proc/keys file by which keys may be viewed" depends on KEYS diff --git a/security/keys/Makefile b/security/keys/Makefile index 74d5447d7df7..fcb107020b4a 100644 --- a/security/keys/Makefile +++ b/security/keys/Makefile @@ -13,6 +13,7 @@ obj-y := \ request_key_auth.o \ user_defined.o +obj-$(CONFIG_TRUSTED_KEYS) += trusted_defined.o obj-$(CONFIG_KEYS_COMPAT) += compat.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_SYSCTL) += sysctl.o diff --git a/security/keys/trusted_defined.c b/security/keys/trusted_defined.c new file mode 100644 index 000000000000..1bec72e7596d --- /dev/null +++ b/security/keys/trusted_defined.c @@ -0,0 +1,1151 @@ +/* + * Copyright (C) 2010 IBM Corporation + * + * Author: + * David Safford + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. + * + * See Documentation/keys-trusted-encrypted.txt + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trusted_defined.h" + +static const char hmac_alg[] = "hmac(sha1)"; +static const char hash_alg[] = "sha1"; + +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +static struct crypto_shash *hashalg; +static struct crypto_shash *hmacalg; + +static struct sdesc *init_sdesc(struct crypto_shash *alg) +{ + struct sdesc *sdesc; + int size; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(alg); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) + return ERR_PTR(-ENOMEM); + sdesc->shash.tfm = alg; + sdesc->shash.flags = 0x0; + return sdesc; +} + +static int TSS_sha1(const unsigned char *data, const unsigned int datalen, + unsigned char *digest) +{ + struct sdesc *sdesc; + int ret; + + sdesc = init_sdesc(hashalg); + if (IS_ERR(sdesc)) { + pr_info("trusted_key: can't alloc %s\n", hash_alg); + return PTR_ERR(sdesc); + } + + ret = crypto_shash_digest(&sdesc->shash, data, datalen, digest); + kfree(sdesc); + return ret; +} + +static int TSS_rawhmac(unsigned char *digest, const unsigned char *key, + const unsigned int keylen, ...) +{ + struct sdesc *sdesc; + va_list argp; + unsigned int dlen; + unsigned char *data; + int ret; + + sdesc = init_sdesc(hmacalg); + if (IS_ERR(sdesc)) { + pr_info("trusted_key: can't alloc %s\n", hmac_alg); + return PTR_ERR(sdesc); + } + + ret = crypto_shash_setkey(hmacalg, key, keylen); + if (ret < 0) + goto out; + ret = crypto_shash_init(&sdesc->shash); + if (ret < 0) + goto out; + + va_start(argp, keylen); + for (;;) { + dlen = va_arg(argp, unsigned int); + if (dlen == 0) + break; + data = va_arg(argp, unsigned char *); + if (data == NULL) + return -EINVAL; + ret = crypto_shash_update(&sdesc->shash, data, dlen); + if (ret < 0) + goto out; + } + va_end(argp); + ret = crypto_shash_final(&sdesc->shash, digest); +out: + kfree(sdesc); + return ret; +} + +/* + * calculate authorization info fields to send to TPM + */ +static uint32_t TSS_authhmac(unsigned char *digest, const unsigned char *key, + const unsigned int keylen, unsigned char *h1, + unsigned char *h2, unsigned char h3, ...) 
+{ + unsigned char paramdigest[SHA1_DIGEST_SIZE]; + struct sdesc *sdesc; + unsigned int dlen; + unsigned char *data; + unsigned char c; + int ret; + va_list argp; + + sdesc = init_sdesc(hashalg); + if (IS_ERR(sdesc)) { + pr_info("trusted_key: can't alloc %s\n", hash_alg); + return PTR_ERR(sdesc); + } + + c = h3; + ret = crypto_shash_init(&sdesc->shash); + if (ret < 0) + goto out; + va_start(argp, h3); + for (;;) { + dlen = va_arg(argp, unsigned int); + if (dlen == 0) + break; + data = va_arg(argp, unsigned char *); + ret = crypto_shash_update(&sdesc->shash, data, dlen); + if (ret < 0) + goto out; + } + va_end(argp); + ret = crypto_shash_final(&sdesc->shash, paramdigest); + if (!ret) + TSS_rawhmac(digest, key, keylen, SHA1_DIGEST_SIZE, + paramdigest, TPM_NONCE_SIZE, h1, + TPM_NONCE_SIZE, h2, 1, &c, 0, 0); +out: + kfree(sdesc); + return ret; +} + +/* + * verify the AUTH1_COMMAND (Seal) result from TPM + */ +static uint32_t TSS_checkhmac1(unsigned char *buffer, + const uint32_t command, + const unsigned char *ononce, + const unsigned char *key, + const unsigned int keylen, ...) +{ + uint32_t bufsize; + uint16_t tag; + uint32_t ordinal; + uint32_t result; + unsigned char *enonce; + unsigned char *continueflag; + unsigned char *authdata; + unsigned char testhmac[SHA1_DIGEST_SIZE]; + unsigned char paramdigest[SHA1_DIGEST_SIZE]; + struct sdesc *sdesc; + unsigned int dlen; + unsigned int dpos; + va_list argp; + int ret; + + bufsize = LOAD32(buffer, TPM_SIZE_OFFSET); + tag = LOAD16(buffer, 0); + ordinal = command; + result = LOAD32N(buffer, TPM_RETURN_OFFSET); + if (tag == TPM_TAG_RSP_COMMAND) + return 0; + if (tag != TPM_TAG_RSP_AUTH1_COMMAND) + return -EINVAL; + authdata = buffer + bufsize - SHA1_DIGEST_SIZE; + continueflag = authdata - 1; + enonce = continueflag - TPM_NONCE_SIZE; + + sdesc = init_sdesc(hashalg); + if (IS_ERR(sdesc)) { + pr_info("trusted_key: can't alloc %s\n", hash_alg); + return PTR_ERR(sdesc); + } + ret = crypto_shash_init(&sdesc->shash); + if (ret < 0) + goto out; + ret = crypto_shash_update(&sdesc->shash, (const u8 *)&result, + sizeof result); + if (ret < 0) + goto out; + ret = crypto_shash_update(&sdesc->shash, (const u8 *)&ordinal, + sizeof ordinal); + if (ret < 0) + goto out; + va_start(argp, keylen); + for (;;) { + dlen = va_arg(argp, unsigned int); + if (dlen == 0) + break; + dpos = va_arg(argp, unsigned int); + ret = crypto_shash_update(&sdesc->shash, buffer + dpos, dlen); + if (ret < 0) + goto out; + } + va_end(argp); + ret = crypto_shash_final(&sdesc->shash, paramdigest); + if (ret < 0) + goto out; + ret = TSS_rawhmac(testhmac, key, keylen, SHA1_DIGEST_SIZE, paramdigest, + TPM_NONCE_SIZE, enonce, TPM_NONCE_SIZE, ononce, + 1, continueflag, 0, 0); + if (ret < 0) + goto out; + if (memcmp(testhmac, authdata, SHA1_DIGEST_SIZE)) + ret = -EINVAL; +out: + kfree(sdesc); + return ret; +} + +/* + * verify the AUTH2_COMMAND (unseal) result from TPM + */ +static uint32_t TSS_checkhmac2(unsigned char *buffer, + const uint32_t command, + const unsigned char *ononce, + const unsigned char *key1, + const unsigned int keylen1, + const unsigned char *key2, + const unsigned int keylen2, ...) 
+{ + uint32_t bufsize; + uint16_t tag; + uint32_t ordinal; + uint32_t result; + unsigned char *enonce1; + unsigned char *continueflag1; + unsigned char *authdata1; + unsigned char *enonce2; + unsigned char *continueflag2; + unsigned char *authdata2; + unsigned char testhmac1[SHA1_DIGEST_SIZE]; + unsigned char testhmac2[SHA1_DIGEST_SIZE]; + unsigned char paramdigest[SHA1_DIGEST_SIZE]; + struct sdesc *sdesc; + unsigned int dlen; + unsigned int dpos; + va_list argp; + int ret; + + bufsize = LOAD32(buffer, TPM_SIZE_OFFSET); + tag = LOAD16(buffer, 0); + ordinal = command; + result = LOAD32N(buffer, TPM_RETURN_OFFSET); + + if (tag == TPM_TAG_RSP_COMMAND) + return 0; + if (tag != TPM_TAG_RSP_AUTH2_COMMAND) + return -EINVAL; + authdata1 = buffer + bufsize - (SHA1_DIGEST_SIZE + 1 + + SHA1_DIGEST_SIZE + SHA1_DIGEST_SIZE); + authdata2 = buffer + bufsize - (SHA1_DIGEST_SIZE); + continueflag1 = authdata1 - 1; + continueflag2 = authdata2 - 1; + enonce1 = continueflag1 - TPM_NONCE_SIZE; + enonce2 = continueflag2 - TPM_NONCE_SIZE; + + sdesc = init_sdesc(hashalg); + if (IS_ERR(sdesc)) { + pr_info("trusted_key: can't alloc %s\n", hash_alg); + return PTR_ERR(sdesc); + } + ret = crypto_shash_init(&sdesc->shash); + if (ret < 0) + goto out; + ret = crypto_shash_update(&sdesc->shash, (const u8 *)&result, + sizeof result); + if (ret < 0) + goto out; + ret = crypto_shash_update(&sdesc->shash, (const u8 *)&ordinal, + sizeof ordinal); + if (ret < 0) + goto out; + + va_start(argp, keylen2); + for (;;) { + dlen = va_arg(argp, unsigned int); + if (dlen == 0) + break; + dpos = va_arg(argp, unsigned int); + ret = crypto_shash_update(&sdesc->shash, buffer + dpos, dlen); + if (ret < 0) + goto out; + } + ret = crypto_shash_final(&sdesc->shash, paramdigest); + if (ret < 0) + goto out; + + ret = TSS_rawhmac(testhmac1, key1, keylen1, SHA1_DIGEST_SIZE, + paramdigest, TPM_NONCE_SIZE, enonce1, + TPM_NONCE_SIZE, ononce, 1, continueflag1, 0, 0); + if (memcmp(testhmac1, authdata1, SHA1_DIGEST_SIZE)) { + ret = -EINVAL; + goto out; + } + ret = TSS_rawhmac(testhmac2, key2, keylen2, SHA1_DIGEST_SIZE, + paramdigest, TPM_NONCE_SIZE, enonce2, + TPM_NONCE_SIZE, ononce, 1, continueflag2, 0, 0); + if (memcmp(testhmac2, authdata2, SHA1_DIGEST_SIZE)) + ret = -EINVAL; +out: + kfree(sdesc); + return ret; +} + +/* + * For key specific tpm requests, we will generate and send our + * own TPM command packets using the drivers send function. + */ +static int trusted_tpm_send(const u32 chip_num, unsigned char *cmd, + size_t buflen) +{ + int rc; + + dump_tpm_buf(cmd); + rc = tpm_send(chip_num, cmd, buflen); + dump_tpm_buf(cmd); + if (rc > 0) + /* Can't return positive return codes values to keyctl */ + rc = -EPERM; + return rc; +} + +/* + * get a random value from TPM + */ +static int tpm_get_random(struct tpm_buf *tb, unsigned char *buf, uint32_t len) +{ + int ret; + + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_COMMAND); + store32(tb, TPM_GETRANDOM_SIZE); + store32(tb, TPM_ORD_GETRANDOM); + store32(tb, len); + ret = trusted_tpm_send(TPM_ANY_NUM, tb->data, sizeof tb->data); + memcpy(buf, tb->data + TPM_GETRANDOM_SIZE, len); + + return ret; +} + +static int my_get_random(unsigned char *buf, int len) +{ + struct tpm_buf *tb; + int ret; + + tb = kzalloc(sizeof *tb, GFP_KERNEL); + if (!tb) + return -ENOMEM; + ret = tpm_get_random(tb, buf, len); + + kfree(tb); + return ret; +} + +/* + * Lock a trusted key, by extending a selected PCR. + * + * Prevents a trusted key that is sealed to PCRs from being accessed. + * This uses the tpm driver's extend function. 
+ */ +static int pcrlock(const int pcrnum) +{ + unsigned char hash[SHA1_DIGEST_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + my_get_random(hash, SHA1_DIGEST_SIZE); + return tpm_pcr_extend(TPM_ANY_NUM, pcrnum, hash) ? -EINVAL : 0; +} + +/* + * Create an object specific authorisation protocol (OSAP) session + */ +static int osap(struct tpm_buf *tb, struct osapsess *s, + const unsigned char *key, const uint16_t type, + const uint32_t handle) +{ + unsigned char enonce[TPM_NONCE_SIZE]; + unsigned char ononce[TPM_NONCE_SIZE]; + int ret; + + ret = tpm_get_random(tb, ononce, TPM_NONCE_SIZE); + if (ret < 0) + return ret; + + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_COMMAND); + store32(tb, TPM_OSAP_SIZE); + store32(tb, TPM_ORD_OSAP); + store16(tb, type); + store32(tb, handle); + storebytes(tb, ononce, TPM_NONCE_SIZE); + + ret = trusted_tpm_send(TPM_ANY_NUM, tb->data, MAX_BUF_SIZE); + if (ret < 0) + return ret; + + s->handle = LOAD32(tb->data, TPM_DATA_OFFSET); + memcpy(s->enonce, &(tb->data[TPM_DATA_OFFSET + sizeof(uint32_t)]), + TPM_NONCE_SIZE); + memcpy(enonce, &(tb->data[TPM_DATA_OFFSET + sizeof(uint32_t) + + TPM_NONCE_SIZE]), TPM_NONCE_SIZE); + ret = TSS_rawhmac(s->secret, key, SHA1_DIGEST_SIZE, TPM_NONCE_SIZE, + enonce, TPM_NONCE_SIZE, ononce, 0, 0); + return ret; +} + +/* + * Create an object independent authorisation protocol (oiap) session + */ +static int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) +{ + int ret; + + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_COMMAND); + store32(tb, TPM_OIAP_SIZE); + store32(tb, TPM_ORD_OIAP); + ret = trusted_tpm_send(TPM_ANY_NUM, tb->data, MAX_BUF_SIZE); + if (ret < 0) + return ret; + + *handle = LOAD32(tb->data, TPM_DATA_OFFSET); + memcpy(nonce, &tb->data[TPM_DATA_OFFSET + sizeof(uint32_t)], + TPM_NONCE_SIZE); + return ret; +} + +struct tpm_digests { + unsigned char encauth[SHA1_DIGEST_SIZE]; + unsigned char pubauth[SHA1_DIGEST_SIZE]; + unsigned char xorwork[SHA1_DIGEST_SIZE * 2]; + unsigned char xorhash[SHA1_DIGEST_SIZE]; + unsigned char nonceodd[TPM_NONCE_SIZE]; +}; + +/* + * Have the TPM seal(encrypt) the trusted key, possibly based on + * Platform Configuration Registers (PCRs). AUTH1 for sealing key. 
+ */ +static int tpm_seal(struct tpm_buf *tb, const uint16_t keytype, + const uint32_t keyhandle, const unsigned char *keyauth, + const unsigned char *data, const uint32_t datalen, + unsigned char *blob, uint32_t *bloblen, + const unsigned char *blobauth, + const unsigned char *pcrinfo, const uint32_t pcrinfosize) +{ + struct osapsess sess; + struct tpm_digests *td; + unsigned char cont; + uint32_t ordinal; + uint32_t pcrsize; + uint32_t datsize; + int sealinfosize; + int encdatasize; + int storedsize; + int ret; + int i; + + /* alloc some work space for all the hashes */ + td = kmalloc(sizeof *td, GFP_KERNEL); + if (!td) + return -ENOMEM; + + /* get session for sealing key */ + ret = osap(tb, &sess, keyauth, keytype, keyhandle); + if (ret < 0) + return ret; + dump_sess(&sess); + + /* calculate encrypted authorization value */ + memcpy(td->xorwork, sess.secret, SHA1_DIGEST_SIZE); + memcpy(td->xorwork + SHA1_DIGEST_SIZE, sess.enonce, SHA1_DIGEST_SIZE); + ret = TSS_sha1(td->xorwork, SHA1_DIGEST_SIZE * 2, td->xorhash); + if (ret < 0) + return ret; + + ret = tpm_get_random(tb, td->nonceodd, TPM_NONCE_SIZE); + if (ret < 0) + return ret; + ordinal = htonl(TPM_ORD_SEAL); + datsize = htonl(datalen); + pcrsize = htonl(pcrinfosize); + cont = 0; + + /* encrypt data authorization key */ + for (i = 0; i < SHA1_DIGEST_SIZE; ++i) + td->encauth[i] = td->xorhash[i] ^ blobauth[i]; + + /* calculate authorization HMAC value */ + if (pcrinfosize == 0) { + /* no pcr info specified */ + TSS_authhmac(td->pubauth, sess.secret, SHA1_DIGEST_SIZE, + sess.enonce, td->nonceodd, cont, sizeof(uint32_t), + &ordinal, SHA1_DIGEST_SIZE, td->encauth, + sizeof(uint32_t), &pcrsize, sizeof(uint32_t), + &datsize, datalen, data, 0, 0); + } else { + /* pcr info specified */ + TSS_authhmac(td->pubauth, sess.secret, SHA1_DIGEST_SIZE, + sess.enonce, td->nonceodd, cont, sizeof(uint32_t), + &ordinal, SHA1_DIGEST_SIZE, td->encauth, + sizeof(uint32_t), &pcrsize, pcrinfosize, + pcrinfo, sizeof(uint32_t), &datsize, datalen, + data, 0, 0); + } + + /* build and send the TPM request packet */ + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_AUTH1_COMMAND); + store32(tb, TPM_SEAL_SIZE + pcrinfosize + datalen); + store32(tb, TPM_ORD_SEAL); + store32(tb, keyhandle); + storebytes(tb, td->encauth, SHA1_DIGEST_SIZE); + store32(tb, pcrinfosize); + storebytes(tb, pcrinfo, pcrinfosize); + store32(tb, datalen); + storebytes(tb, data, datalen); + store32(tb, sess.handle); + storebytes(tb, td->nonceodd, TPM_NONCE_SIZE); + store8(tb, cont); + storebytes(tb, td->pubauth, SHA1_DIGEST_SIZE); + + ret = trusted_tpm_send(TPM_ANY_NUM, tb->data, MAX_BUF_SIZE); + if (ret < 0) + return ret; + + /* calculate the size of the returned Blob */ + sealinfosize = LOAD32(tb->data, TPM_DATA_OFFSET + sizeof(uint32_t)); + encdatasize = LOAD32(tb->data, TPM_DATA_OFFSET + sizeof(uint32_t) + + sizeof(uint32_t) + sealinfosize); + storedsize = sizeof(uint32_t) + sizeof(uint32_t) + sealinfosize + + sizeof(uint32_t) + encdatasize; + + /* check the HMAC in the response */ + ret = TSS_checkhmac1(tb->data, ordinal, td->nonceodd, sess.secret, + SHA1_DIGEST_SIZE, storedsize, TPM_DATA_OFFSET, 0, + 0); + + /* copy the returned blob to caller */ + memcpy(blob, tb->data + TPM_DATA_OFFSET, storedsize); + *bloblen = storedsize; + return ret; +} + +/* + * use the AUTH2_COMMAND form of unseal, to authorize both key and blob + */ +static int tpm_unseal(struct tpm_buf *tb, + const uint32_t keyhandle, const unsigned char *keyauth, + const unsigned char *blob, const int bloblen, + const unsigned char 
*blobauth, + unsigned char *data, unsigned int *datalen) +{ + unsigned char nonceodd[TPM_NONCE_SIZE]; + unsigned char enonce1[TPM_NONCE_SIZE]; + unsigned char enonce2[TPM_NONCE_SIZE]; + unsigned char authdata1[SHA1_DIGEST_SIZE]; + unsigned char authdata2[SHA1_DIGEST_SIZE]; + uint32_t authhandle1 = 0; + uint32_t authhandle2 = 0; + unsigned char cont = 0; + uint32_t ordinal; + uint32_t keyhndl; + int ret; + + /* sessions for unsealing key and data */ + ret = oiap(tb, &authhandle1, enonce1); + if (ret < 0) { + pr_info("trusted_key: oiap failed (%d)\n", ret); + return ret; + } + ret = oiap(tb, &authhandle2, enonce2); + if (ret < 0) { + pr_info("trusted_key: oiap failed (%d)\n", ret); + return ret; + } + + ordinal = htonl(TPM_ORD_UNSEAL); + keyhndl = htonl(SRKHANDLE); + ret = tpm_get_random(tb, nonceodd, TPM_NONCE_SIZE); + if (ret < 0) { + pr_info("trusted_key: tpm_get_random failed (%d)\n", ret); + return ret; + } + TSS_authhmac(authdata1, keyauth, TPM_NONCE_SIZE, + enonce1, nonceodd, cont, sizeof(uint32_t), + &ordinal, bloblen, blob, 0, 0); + TSS_authhmac(authdata2, blobauth, TPM_NONCE_SIZE, + enonce2, nonceodd, cont, sizeof(uint32_t), + &ordinal, bloblen, blob, 0, 0); + + /* build and send TPM request packet */ + INIT_BUF(tb); + store16(tb, TPM_TAG_RQU_AUTH2_COMMAND); + store32(tb, TPM_UNSEAL_SIZE + bloblen); + store32(tb, TPM_ORD_UNSEAL); + store32(tb, keyhandle); + storebytes(tb, blob, bloblen); + store32(tb, authhandle1); + storebytes(tb, nonceodd, TPM_NONCE_SIZE); + store8(tb, cont); + storebytes(tb, authdata1, SHA1_DIGEST_SIZE); + store32(tb, authhandle2); + storebytes(tb, nonceodd, TPM_NONCE_SIZE); + store8(tb, cont); + storebytes(tb, authdata2, SHA1_DIGEST_SIZE); + + ret = trusted_tpm_send(TPM_ANY_NUM, tb->data, MAX_BUF_SIZE); + if (ret < 0) { + pr_info("trusted_key: authhmac failed (%d)\n", ret); + return ret; + } + + *datalen = LOAD32(tb->data, TPM_DATA_OFFSET); + ret = TSS_checkhmac2(tb->data, ordinal, nonceodd, + keyauth, SHA1_DIGEST_SIZE, + blobauth, SHA1_DIGEST_SIZE, + sizeof(uint32_t), TPM_DATA_OFFSET, + *datalen, TPM_DATA_OFFSET + sizeof(uint32_t), 0, + 0); + if (ret < 0) + pr_info("trusted_key: TSS_checkhmac2 failed (%d)\n", ret); + memcpy(data, tb->data + TPM_DATA_OFFSET + sizeof(uint32_t), *datalen); + return ret; +} + +/* + * Have the TPM seal(encrypt) the symmetric key + */ +static int key_seal(struct trusted_key_payload *p, + struct trusted_key_options *o) +{ + struct tpm_buf *tb; + int ret; + + tb = kzalloc(sizeof *tb, GFP_KERNEL); + if (!tb) + return -ENOMEM; + + /* include migratable flag at end of sealed key */ + p->key[p->key_len] = p->migratable; + + ret = tpm_seal(tb, o->keytype, o->keyhandle, o->keyauth, + p->key, p->key_len + 1, p->blob, &p->blob_len, + o->blobauth, o->pcrinfo, o->pcrinfo_len); + if (ret < 0) + pr_info("trusted_key: srkseal failed (%d)\n", ret); + + kfree(tb); + return ret; +} + +/* + * Have the TPM unseal(decrypt) the symmetric key + */ +static int key_unseal(struct trusted_key_payload *p, + struct trusted_key_options *o) +{ + struct tpm_buf *tb; + int ret; + + tb = kzalloc(sizeof *tb, GFP_KERNEL); + if (!tb) + return -ENOMEM; + + ret = tpm_unseal(tb, o->keyhandle, o->keyauth, p->blob, p->blob_len, + o->blobauth, p->key, &p->key_len); + /* pull migratable flag out of sealed key */ + p->migratable = p->key[--p->key_len]; + + if (ret < 0) + pr_info("trusted_key: srkunseal failed (%d)\n", ret); + + kfree(tb); + return ret; +} + +enum { + Opt_err = -1, + Opt_new, Opt_load, Opt_update, + Opt_keyhandle, Opt_keyauth, Opt_blobauth, + Opt_pcrinfo, 
Opt_pcrlock, Opt_migratable +}; + +static const match_table_t key_tokens = { + {Opt_new, "new"}, + {Opt_load, "load"}, + {Opt_update, "update"}, + {Opt_keyhandle, "keyhandle=%s"}, + {Opt_keyauth, "keyauth=%s"}, + {Opt_blobauth, "blobauth=%s"}, + {Opt_pcrinfo, "pcrinfo=%s"}, + {Opt_pcrlock, "pcrlock=%s"}, + {Opt_migratable, "migratable=%s"}, + {Opt_err, NULL} +}; + +/* can have zero or more token= options */ +static int getoptions(char *c, struct trusted_key_payload *pay, + struct trusted_key_options *opt) +{ + substring_t args[MAX_OPT_ARGS]; + char *p = c; + int token; + int res; + unsigned long handle; + unsigned long lock; + + while ((p = strsep(&c, " \t"))) { + if (*p == '\0' || *p == ' ' || *p == '\t') + continue; + token = match_token(p, key_tokens, args); + + switch (token) { + case Opt_pcrinfo: + opt->pcrinfo_len = strlen(args[0].from) / 2; + if (opt->pcrinfo_len > MAX_PCRINFO_SIZE) + return -EINVAL; + hex2bin(opt->pcrinfo, args[0].from, opt->pcrinfo_len); + break; + case Opt_keyhandle: + res = strict_strtoul(args[0].from, 16, &handle); + if (res < 0) + return -EINVAL; + opt->keytype = SEAL_keytype; + opt->keyhandle = handle; + break; + case Opt_keyauth: + if (strlen(args[0].from) != 2 * SHA1_DIGEST_SIZE) + return -EINVAL; + hex2bin(opt->keyauth, args[0].from, SHA1_DIGEST_SIZE); + break; + case Opt_blobauth: + if (strlen(args[0].from) != 2 * SHA1_DIGEST_SIZE) + return -EINVAL; + hex2bin(opt->blobauth, args[0].from, SHA1_DIGEST_SIZE); + break; + case Opt_migratable: + if (*args[0].from == '0') + pay->migratable = 0; + else + return -EINVAL; + break; + case Opt_pcrlock: + res = strict_strtoul(args[0].from, 10, &lock); + if (res < 0) + return -EINVAL; + opt->pcrlock = lock; + break; + default: + return -EINVAL; + } + } + return 0; +} + +/* + * datablob_parse - parse the keyctl data and fill in the + * payload and options structures + * + * On success returns 0, otherwise -EINVAL. 
+ */ +static int datablob_parse(char *datablob, struct trusted_key_payload *p, + struct trusted_key_options *o) +{ + substring_t args[MAX_OPT_ARGS]; + long keylen; + int ret = -EINVAL; + int key_cmd; + char *c; + + /* main command */ + c = strsep(&datablob, " \t"); + if (!c) + return -EINVAL; + key_cmd = match_token(c, key_tokens, args); + switch (key_cmd) { + case Opt_new: + /* first argument is key size */ + c = strsep(&datablob, " \t"); + if (!c) + return -EINVAL; + ret = strict_strtol(c, 10, &keylen); + if (ret < 0 || keylen < MIN_KEY_SIZE || keylen > MAX_KEY_SIZE) + return -EINVAL; + p->key_len = keylen; + ret = getoptions(datablob, p, o); + if (ret < 0) + return ret; + ret = Opt_new; + break; + case Opt_load: + /* first argument is sealed blob */ + c = strsep(&datablob, " \t"); + if (!c) + return -EINVAL; + p->blob_len = strlen(c) / 2; + if (p->blob_len > MAX_BLOB_SIZE) + return -EINVAL; + hex2bin(p->blob, c, p->blob_len); + ret = getoptions(datablob, p, o); + if (ret < 0) + return ret; + ret = Opt_load; + break; + case Opt_update: + /* all arguments are options */ + ret = getoptions(datablob, p, o); + if (ret < 0) + return ret; + ret = Opt_update; + break; + case Opt_err: + return -EINVAL; + break; + } + return ret; +} + +static struct trusted_key_options *trusted_options_alloc(void) +{ + struct trusted_key_options *options; + + options = kzalloc(sizeof *options, GFP_KERNEL); + if (!options) + return options; + + /* set any non-zero defaults */ + options->keytype = SRK_keytype; + options->keyhandle = SRKHANDLE; + return options; +} + +static struct trusted_key_payload *trusted_payload_alloc(struct key *key) +{ + struct trusted_key_payload *p = NULL; + int ret; + + ret = key_payload_reserve(key, sizeof *p); + if (ret < 0) + return p; + p = kzalloc(sizeof *p, GFP_KERNEL); + + /* migratable by default */ + p->migratable = 1; + return p; +} + +/* + * trusted_instantiate - create a new trusted key + * + * Unseal an existing trusted blob or, for a new key, get a + * random key, then seal and create a trusted key-type key, + * adding it to the specified keyring. + * + * On success, return 0. Otherwise return errno. 
+ */ +static int trusted_instantiate(struct key *key, const void *data, + const size_t datalen) +{ + struct trusted_key_payload *payload = NULL; + struct trusted_key_options *options = NULL; + char *datablob; + int ret = 0; + int key_cmd; + + if (datalen <= 0 || datalen > 32767 || !data) + return -EINVAL; + + datablob = kmalloc(datalen + 1, GFP_KERNEL); + if (!datablob) + return -ENOMEM; + memcpy(datablob, data, datalen); + datablob[datalen] = '\0'; + + options = trusted_options_alloc(); + if (!options) { + ret = -ENOMEM; + goto out; + } + payload = trusted_payload_alloc(key); + if (!payload) { + ret = -ENOMEM; + goto out; + } + + key_cmd = datablob_parse(datablob, payload, options); + if (key_cmd < 0) { + ret = key_cmd; + goto out; + } + + dump_payload(payload); + dump_options(options); + + switch (key_cmd) { + case Opt_load: + ret = key_unseal(payload, options); + dump_payload(payload); + dump_options(options); + if (ret < 0) + pr_info("trusted_key: key_unseal failed (%d)\n", ret); + break; + case Opt_new: + ret = my_get_random(payload->key, payload->key_len); + if (ret < 0) { + pr_info("trusted_key: key_create failed (%d)\n", ret); + goto out; + } + ret = key_seal(payload, options); + if (ret < 0) + pr_info("trusted_key: key_seal failed (%d)\n", ret); + break; + default: + ret = -EINVAL; + goto out; + } + if (!ret && options->pcrlock) + ret = pcrlock(options->pcrlock); +out: + kfree(datablob); + kfree(options); + if (!ret) + rcu_assign_pointer(key->payload.data, payload); + else + kfree(payload); + return ret; +} + +static void trusted_rcu_free(struct rcu_head *rcu) +{ + struct trusted_key_payload *p; + + p = container_of(rcu, struct trusted_key_payload, rcu); + memset(p->key, 0, p->key_len); + kfree(p); +} + +/* + * trusted_update - reseal an existing key with new PCR values + */ +static int trusted_update(struct key *key, const void *data, + const size_t datalen) +{ + struct trusted_key_payload *p = key->payload.data; + struct trusted_key_payload *new_p; + struct trusted_key_options *new_o; + char *datablob; + int ret = 0; + + if (!p->migratable) + return -EPERM; + if (datalen <= 0 || datalen > 32767 || !data) + return -EINVAL; + + datablob = kmalloc(datalen + 1, GFP_KERNEL); + if (!datablob) + return -ENOMEM; + new_o = trusted_options_alloc(); + if (!new_o) { + ret = -ENOMEM; + goto out; + } + new_p = trusted_payload_alloc(key); + if (!new_p) { + ret = -ENOMEM; + goto out; + } + + memcpy(datablob, data, datalen); + datablob[datalen] = '\0'; + ret = datablob_parse(datablob, new_p, new_o); + if (ret != Opt_update) { + ret = -EINVAL; + goto out; + } + /* copy old key values, and reseal with new pcrs */ + new_p->migratable = p->migratable; + new_p->key_len = p->key_len; + memcpy(new_p->key, p->key, p->key_len); + dump_payload(p); + dump_payload(new_p); + + ret = key_seal(new_p, new_o); + if (ret < 0) { + pr_info("trusted_key: key_seal failed (%d)\n", ret); + kfree(new_p); + goto out; + } + if (new_o->pcrlock) { + ret = pcrlock(new_o->pcrlock); + if (ret < 0) { + pr_info("trusted_key: pcrlock failed (%d)\n", ret); + kfree(new_p); + goto out; + } + } + rcu_assign_pointer(key->payload.data, new_p); + call_rcu(&p->rcu, trusted_rcu_free); +out: + kfree(datablob); + kfree(new_o); + return ret; +} + +/* + * trusted_read - copy the sealed blob data to userspace in hex. + * On success, return to userspace the trusted key datablob size. 
+ */ +static long trusted_read(const struct key *key, char __user *buffer, + size_t buflen) +{ + struct trusted_key_payload *p; + char *ascii_buf; + char *bufp; + int i; + + p = rcu_dereference_protected(key->payload.data, + rwsem_is_locked(&((struct key *)key)->sem)); + if (!p) + return -EINVAL; + if (!buffer || buflen <= 0) + return 2 * p->blob_len; + ascii_buf = kmalloc(2 * p->blob_len, GFP_KERNEL); + if (!ascii_buf) + return -ENOMEM; + + bufp = ascii_buf; + for (i = 0; i < p->blob_len; i++) + bufp = pack_hex_byte(bufp, p->blob[i]); + if ((copy_to_user(buffer, ascii_buf, 2 * p->blob_len)) != 0) { + kfree(ascii_buf); + return -EFAULT; + } + kfree(ascii_buf); + return 2 * p->blob_len; +} + +/* + * trusted_destroy - before freeing the key, clear the decrypted data + */ +static void trusted_destroy(struct key *key) +{ + struct trusted_key_payload *p = key->payload.data; + + if (!p) + return; + memset(p->key, 0, p->key_len); + kfree(key->payload.data); +} + +struct key_type key_type_trusted = { + .name = "trusted", + .instantiate = trusted_instantiate, + .update = trusted_update, + .match = user_match, + .destroy = trusted_destroy, + .describe = user_describe, + .read = trusted_read, +}; + +EXPORT_SYMBOL_GPL(key_type_trusted); + +static void trusted_shash_release(void) +{ + if (hashalg) + crypto_free_shash(hashalg); + if (hmacalg) + crypto_free_shash(hmacalg); +} + +static int __init trusted_shash_alloc(void) +{ + int ret; + + hmacalg = crypto_alloc_shash(hmac_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hmacalg)) { + pr_info("trusted_key: could not allocate crypto %s\n", + hmac_alg); + return PTR_ERR(hmacalg); + } + + hashalg = crypto_alloc_shash(hash_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hashalg)) { + pr_info("trusted_key: could not allocate crypto %s\n", + hash_alg); + ret = PTR_ERR(hashalg); + goto hashalg_fail; + } + + return 0; + +hashalg_fail: + crypto_free_shash(hmacalg); + return ret; +} + +static int __init init_trusted(void) +{ + int ret; + + ret = trusted_shash_alloc(); + if (ret < 0) + return ret; + ret = register_key_type(&key_type_trusted); + if (ret < 0) + trusted_shash_release(); + return ret; +} + +static void __exit cleanup_trusted(void) +{ + trusted_shash_release(); + unregister_key_type(&key_type_trusted); +} + +late_initcall(init_trusted); +module_exit(cleanup_trusted); + +MODULE_LICENSE("GPL"); diff --git a/security/keys/trusted_defined.h b/security/keys/trusted_defined.h new file mode 100644 index 000000000000..3249fbd2b653 --- /dev/null +++ b/security/keys/trusted_defined.h @@ -0,0 +1,134 @@ +#ifndef __TRUSTED_KEY_H +#define __TRUSTED_KEY_H + +/* implementation specific TPM constants */ +#define MAX_PCRINFO_SIZE 64 +#define MAX_BUF_SIZE 512 +#define TPM_GETRANDOM_SIZE 14 +#define TPM_OSAP_SIZE 36 +#define TPM_OIAP_SIZE 10 +#define TPM_SEAL_SIZE 87 +#define TPM_UNSEAL_SIZE 104 +#define TPM_SIZE_OFFSET 2 +#define TPM_RETURN_OFFSET 6 +#define TPM_DATA_OFFSET 10 + +#define LOAD32(buffer, offset) (ntohl(*(uint32_t *)&buffer[offset])) +#define LOAD32N(buffer, offset) (*(uint32_t *)&buffer[offset]) +#define LOAD16(buffer, offset) (ntohs(*(uint16_t *)&buffer[offset])) + +struct tpm_buf { + int len; + unsigned char data[MAX_BUF_SIZE]; +}; + +#define INIT_BUF(tb) (tb->len = 0) + +struct osapsess { + uint32_t handle; + unsigned char secret[SHA1_DIGEST_SIZE]; + unsigned char enonce[TPM_NONCE_SIZE]; +}; + +/* discrete values, but have to store in uint16_t for TPM use */ +enum { + SEAL_keytype = 1, + SRK_keytype = 4 +}; + +struct trusted_key_options { + uint16_t keytype; + uint32_t 
keyhandle; + unsigned char keyauth[SHA1_DIGEST_SIZE]; + unsigned char blobauth[SHA1_DIGEST_SIZE]; + uint32_t pcrinfo_len; + unsigned char pcrinfo[MAX_PCRINFO_SIZE]; + int pcrlock; +}; + +#define TPM_DEBUG 0 + +#if TPM_DEBUG +static inline void dump_options(struct trusted_key_options *o) +{ + pr_info("trusted_key: sealing key type %d\n", o->keytype); + pr_info("trusted_key: sealing key handle %0X\n", o->keyhandle); + pr_info("trusted_key: pcrlock %d\n", o->pcrlock); + pr_info("trusted_key: pcrinfo %d\n", o->pcrinfo_len); + print_hex_dump(KERN_INFO, "pcrinfo ", DUMP_PREFIX_NONE, + 16, 1, o->pcrinfo, o->pcrinfo_len, 0); +} + +static inline void dump_payload(struct trusted_key_payload *p) +{ + pr_info("trusted_key: key_len %d\n", p->key_len); + print_hex_dump(KERN_INFO, "key ", DUMP_PREFIX_NONE, + 16, 1, p->key, p->key_len, 0); + pr_info("trusted_key: bloblen %d\n", p->blob_len); + print_hex_dump(KERN_INFO, "blob ", DUMP_PREFIX_NONE, + 16, 1, p->blob, p->blob_len, 0); + pr_info("trusted_key: migratable %d\n", p->migratable); +} + +static inline void dump_sess(struct osapsess *s) +{ + print_hex_dump(KERN_INFO, "trusted-key: handle ", DUMP_PREFIX_NONE, + 16, 1, &s->handle, 4, 0); + pr_info("trusted-key: secret:\n"); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, + 16, 1, &s->secret, SHA1_DIGEST_SIZE, 0); + pr_info("trusted-key: enonce:\n"); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, + 16, 1, &s->enonce, SHA1_DIGEST_SIZE, 0); +} + +static inline void dump_tpm_buf(unsigned char *buf) +{ + int len; + + pr_info("\ntrusted-key: tpm buffer\n"); + len = LOAD32(buf, TPM_SIZE_OFFSET); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, buf, len, 0); +} +#else +static inline void dump_options(struct trusted_key_options *o) +{ +} + +static inline void dump_payload(struct trusted_key_payload *p) +{ +} + +static inline void dump_sess(struct osapsess *s) +{ +} + +static inline void dump_tpm_buf(unsigned char *buf) +{ +} +#endif + +static inline void store8(struct tpm_buf *buf, const unsigned char value) +{ + buf->data[buf->len++] = value; +} + +static inline void store16(struct tpm_buf *buf, const uint16_t value) +{ + *(uint16_t *) & buf->data[buf->len] = htons(value); + buf->len += sizeof value; +} + +static inline void store32(struct tpm_buf *buf, const uint32_t value) +{ + *(uint32_t *) & buf->data[buf->len] = htonl(value); + buf->len += sizeof value; +} + +static inline void storebytes(struct tpm_buf *buf, const unsigned char *in, + const int len) +{ + memcpy(buf->data + buf->len, in, len); + buf->len += len; +} +#endif -- cgit v1.2.3-71-gd317 From fa9f90be745d3b600a9d97a063be404c5e5d9071 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 28 Nov 2010 21:39:34 +0100 Subject: Kill off a bunch of warning: ‘inline’ is not at beginning of declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These warnings are spewed during a build of a 'allnoconfig' kernel (especially the ones from u64_stats_sync.h show up a lot) when building with -Wextra (which I often do).. They are a) annoying b) easy to get rid of. This patch kills them off. 
include/linux/u64_stats_sync.h:70:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:77:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:84:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:96:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:115:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:127:1: warning: ‘inline’ is not at beginning of declaration kernel/time.c:241:1: warning: ‘inline’ is not at beginning of declaration kernel/time.c:257:1: warning: ‘inline’ is not at beginning of declaration kernel/perf_event.c:4513:1: warning: ‘inline’ is not at beginning of declaration mm/page_alloc.c:4012:1: warning: ‘inline’ is not at beginning of declaration Signed-off-by: Jesper Juhl Signed-off-by: Jiri Kosina --- include/linux/u64_stats_sync.h | 12 ++++++------ kernel/perf_event.c | 2 +- kernel/time.c | 4 ++-- mm/page_alloc.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index fa261a0da280..8da8c4e87da3 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -67,21 +67,21 @@ struct u64_stats_sync { #endif }; -static void inline u64_stats_update_begin(struct u64_stats_sync *syncp) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) write_seqcount_begin(&syncp->seq); #endif } -static void inline u64_stats_update_end(struct u64_stats_sync *syncp) +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) write_seqcount_end(&syncp->seq); #endif } -static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_begin(&syncp->seq); @@ -93,7 +93,7 @@ static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *sy #endif } -static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) @@ -112,7 +112,7 @@ static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp, * - UP 32bit must disable BH. * - 64bit have no problem atomically reading u64 values, irq safe. 
*/ -static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp) +static inline unsigned int u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_begin(&syncp->seq); @@ -124,7 +124,7 @@ static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync #endif } -static bool inline u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp, +static inline bool u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp, unsigned int start) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f4982..06682e7b12e2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4454,7 +4454,7 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void inline perf_swevent_put_recursion_context(int rctx) +inline void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); diff --git a/kernel/time.c b/kernel/time.c index ba9b338d1835..32174359576f 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ -unsigned int inline jiffies_to_msecs(const unsigned long j) +inline unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; @@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); -unsigned int inline jiffies_to_usecs(const unsigned long j) +inline unsigned int jiffies_to_usecs(const unsigned long j) { #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f75..f823ee192e94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4013,7 +4013,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); } #else -static void inline setup_usemap(struct pglist_data *pgdat, +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3-71-gd317 From 62a8c3a32e4143812ed8e0f3783ef1ea40dc87e4 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 24 Nov 2010 19:33:43 +0000 Subject: Staging: sep: handle the rar definition stuff in the header SEP isn't the only driver that may need to handle both cases easily Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/staging/sep/sep_driver.c | 1 + drivers/staging/sep/sep_driver_config.h | 61 --------------------------------- include/linux/rar_register.h | 16 +++++++++ 3 files changed, 17 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/sep/sep_driver.c b/drivers/staging/sep/sep_driver.c index ef36239c7b24..8a1ff861b135 100644 --- a/drivers/staging/sep/sep_driver.c +++ b/drivers/staging/sep/sep_driver.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include diff --git a/drivers/staging/sep/sep_driver_config.h b/drivers/staging/sep/sep_driver_config.h index 68688cbc2a92..cfda86f2aaff 100644 --- a/drivers/staging/sep/sep_driver_config.h +++ b/drivers/staging/sep/sep_driver_config.h @@ -235,15 +235,6 @@ held by the proccess (struct file) */ /* This stub header is for non Moorestown driver only */ -/* - * Constants that specify 
different kinds of RAR regions that could be - * set up. - */ -static __u32 const RAR_TYPE_VIDEO; /* 0 */ -static __u32 const RAR_TYPE_AUDIO = 1; -static __u32 const RAR_TYPE_IMAGE = 2; -static __u32 const RAR_TYPE_DATA = 3; - /* * @struct RAR_stat * @@ -373,56 +364,4 @@ struct RAR_buffer { #endif /* MEMRAR */ -/* rar_register */ -#ifndef CONFIG_RAR_REGISTER -/* This stub header is for non Moorestown driver only */ - -/* The register_rar function is to used by other device drivers - * to ensure that this driver is ready. As we cannot be sure of - * the compile/execute order of dirvers in ther kernel, it is - * best to give this driver a callback function to call when - * it is ready to give out addresses. The callback function - * would have those steps that continue the initialization of - * a driver that do require a valid RAR address. One of those - * steps would be to call get_rar_address() - * This function return 0 on success an -1 on failure. - */ -#define register_rar(a, b, c) (-ENODEV) - -/* The get_rar_address function is used by other device drivers - * to obtain RAR address information on a RAR. It takes two - * parameter: - * - * int rar_index - * The rar_index is an index to the rar for which you wish to retrieve - * the address information. - * Values can be 0,1, or 2. - * - * struct RAR_address_struct is a pointer to a place to which the function - * can return the address structure for the RAR. - * - * The function returns a 0 upon success or a -1 if there is no RAR - * facility on this system. - */ -#define rar_get_address(a, b, c) (-ENODEV) - -/* The lock_rar function is ued by other device drivers to lock an RAR. - * once an RAR is locked, it stays locked until the next system reboot. - * The function takes one parameter: - * - * int rar_index - * The rar_index is an index to the rar that you want to lock. - * Values can be 0,1, or 2. - * - * The function returns a 0 upon success or a -1 if there is no RAR - * facility on this system. - */ -#define rar_lock(a) (-1) - -#else /* using real RAR_REGISTER */ - -#include - -#endif /* CONFIG_RAR_REGISTER */ - #endif /* SEP DRIVER CONFIG */ diff --git a/include/linux/rar_register.h b/include/linux/rar_register.h index ffa805780f85..5c6118189363 100644 --- a/include/linux/rar_register.h +++ b/include/linux/rar_register.h @@ -34,11 +34,27 @@ struct rar_device; +#if defined(CONFIG_RAR_REGISTER) int register_rar(int num, int (*callback)(unsigned long data), unsigned long data); void unregister_rar(int num); int rar_get_address(int rar_index, dma_addr_t *start, dma_addr_t *end); int rar_lock(int rar_index); +#else +extern void unregister_rar(int num) { } +extern int rar_lock(int rar_index) { return -EIO; } + +extern inline int register_rar(int num, + int (*callback)(unsigned long data), unsigned long data) +{ + return -ENODEV; +} + +extern int rar_get_address(int rar_index, dma_addr_t *start, dma_addr_t *end) +{ + return -ENODEV; +} +#endif /* RAR_REGISTER */ #endif /* __KERNEL__ */ #endif /* _RAR_REGISTER_H */ -- cgit v1.2.3-71-gd317 From ad9c2b048b605fbc8d50526e330b88abdd631ab2 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 30 Nov 2010 11:06:47 +0900 Subject: security: Fix comment of security_key_permission Comment for return value of security_key_permission() has been wrong since it was added in 2.6.15. 
Signed-off-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/security.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index fd4d55fb8845..e7d89b0c1fd8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1058,8 +1058,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @cred points to the credentials to provide the context against which to * evaluate the security data on the key. * @perm describes the combination of permissions required of this key. - * Return 1 if permission granted, 0 if permission denied and -ve it the - * normal permissions model should be effected. + * Return 0 if permission is granted, -ve error otherwise. * @key_getsecurity: * Get a textual representation of the security context attached to a key * for the purposes of honouring KEYCTL_GETSECURITY. This function -- cgit v1.2.3-71-gd317 From c41ab6a1b9028de33e74101cb0aae13098a56fdb Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 29 Nov 2010 15:47:09 -0500 Subject: flex_array: fix flex_array_put_ptr macro to be valid C MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using flex_array_put_ptr() results in a compile error "error: lvalue required as unary ‘&’ operand" fix the casting order to fix this. Signed-off-by: Eric Paris --- include/linux/flex_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h index 631b77f2ac70..70e4efabe0fb 100644 --- a/include/linux/flex_array.h +++ b/include/linux/flex_array.h @@ -71,7 +71,7 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr); int flex_array_shrink(struct flex_array *fa); #define flex_array_put_ptr(fa, nr, src, gfp) \ - flex_array_put(fa, nr, &(void *)(src), gfp) + flex_array_put(fa, nr, (void *)&(src), gfp) void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr); -- cgit v1.2.3-71-gd317 From 5b275ce27077d6463ca28c9671dce7c2c1f622e2 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Nov 2010 15:27:29 +0000 Subject: thermal: make ops constant And while touching that function definition do something about the disaster of formatting there. Signed-off-by: Alan Cox Acked-by: Zhang Rui Signed-off-by: Len Brown --- drivers/thermal/thermal_sys.c | 17 +++++------------ include/linux/thermal.h | 15 +++++---------- 2 files changed, 10 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 13c72c629329..bde3477c1c6b 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -823,11 +823,8 @@ static struct class thermal_class = { * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. */ -struct thermal_cooling_device *thermal_cooling_device_register(char *type, - void *devdata, - struct - thermal_cooling_device_ops - *ops) +struct thermal_cooling_device *thermal_cooling_device_register( + char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; struct thermal_zone_device *pos; @@ -1048,13 +1045,9 @@ EXPORT_SYMBOL(thermal_zone_device_update); * section 11.1.5.1 of the ACPI specification 3.0. 
*/ struct thermal_zone_device *thermal_zone_device_register(char *type, - int trips, - void *devdata, struct - thermal_zone_device_ops - *ops, int tc1, int - tc2, - int passive_delay, - int polling_delay) + int trips, void *devdata, + const struct thermal_zone_device_ops *ops, + int tc1, int tc2, int passive_delay, int polling_delay) { struct thermal_zone_device *tz; struct thermal_cooling_device *pos; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1de8b9eb841b..06626904daa0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -77,7 +77,7 @@ struct thermal_cooling_device { char type[THERMAL_NAME_LENGTH]; struct device device; void *devdata; - struct thermal_cooling_device_ops *ops; + const struct thermal_cooling_device_ops *ops; struct list_head node; }; @@ -114,7 +114,7 @@ struct thermal_zone_device { int last_temperature; bool passive; unsigned int forced_passive; - struct thermal_zone_device_ops *ops; + const struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; struct mutex lock; /* protect cooling devices list */ @@ -129,11 +129,8 @@ struct thermal_zone_device { }; struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, - struct - thermal_zone_device_ops - *, int tc1, int tc2, - int passive_freq, - int polling_freq); + const struct thermal_zone_device_ops *, int tc1, int tc2, + int passive_freq, int polling_freq); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, @@ -142,9 +139,7 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, - struct - thermal_cooling_device_ops - *); + const struct thermal_cooling_device_ops *); void thermal_cooling_device_unregister(struct thermal_cooling_device *); #endif /* __THERMAL_H__ */ -- cgit v1.2.3-71-gd317 From 131d81061eba5ffd436b5d132530ac5205b16892 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 30 Nov 2010 17:03:39 +0000 Subject: ASoC: Allow user-specified WM8958 multiband compressor configurations The paramters of the WM8958 multiband compressor can be tuned by the user for their system using a graphical configuration tool on the host. Allow the user to specify a set of such paramters in platform data and select between them at runtime. Signed-off-by: Mark Brown Acked-by: Liam Girdwood --- include/linux/mfd/wm8994/pdata.h | 17 ++++++++ include/linux/mfd/wm8994/registers.h | 68 +++++++++++++++++++++++++++++ sound/soc/codecs/wm8994.c | 84 +++++++++++++++++++++++++++++++++++- 3 files changed, 167 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8994/pdata.h b/include/linux/mfd/wm8994/pdata.h index 5c51f367c061..882b51aefbc0 100644 --- a/include/linux/mfd/wm8994/pdata.h +++ b/include/linux/mfd/wm8994/pdata.h @@ -30,6 +30,8 @@ struct wm8994_ldo_pdata { #define WM8994_DRC_REGS 5 #define WM8994_EQ_REGS 19 +#define WM8958_MBC_CUTOFF_REGS 20 +#define WM8958_MBC_COEFF_REGS 48 /** * DRC configurations are specified with a label and a set of register @@ -59,6 +61,18 @@ struct wm8994_retune_mobile_cfg { u16 regs[WM8994_EQ_REGS]; }; +/** + * Multiband compressor configurations are specified with a label and + * two sets of values to write. 
Configurations are expected to be + * generated using the multiband compressor configuration panel in + * WISCE - see http://www.wolfsonmicro.com/wisce/ + */ +struct wm8958_mbc_cfg { + const char *name; + u16 cutoff_regs[WM8958_MBC_CUTOFF_REGS]; + u16 coeff_regs[WM8958_MBC_COEFF_REGS]; +}; + struct wm8994_pdata { int gpio_base; @@ -78,6 +92,9 @@ struct wm8994_pdata { int num_retune_mobile_cfgs; struct wm8994_retune_mobile_cfg *retune_mobile_cfgs; + int num_mbc_cfgs; + struct wm8958_mbc_cfg *mbc_cfgs; + /* LINEOUT can be differential or single ended */ unsigned int lineout1_diff:1; unsigned int lineout2_diff:1; diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h index a610c8746436..ccf3a774fe36 100644 --- a/include/linux/mfd/wm8994/registers.h +++ b/include/linux/mfd/wm8994/registers.h @@ -258,6 +258,74 @@ #define WM8958_DSP2_VERMAJMIN 0xA04 #define WM8958_DSP2_VERBUILD 0xA05 #define WM8958_DSP2_EXECCONTROL 0xA0D +#define WM8958_MBC_BAND_2_LOWER_CUTOFF_C1_1 0x2200 +#define WM8958_MBC_BAND_2_LOWER_CUTOFF_C1_2 0x2201 +#define WM8958_MBC_BAND_2_LOWER_CUTOFF_C2_1 0x2202 +#define WM8958_MBC_BAND_2_LOWER_CUTOFF_C2_2 0x2203 +#define WM8958_MBC_BAND_2_LOWER_CUTOFF_C3_1 0x2204 +#define WM8958_MBC_BAND_2_LOWER_CUTOFF_C3_2 0x2205 +#define WM8958_MBC_BAND_2_UPPER_CUTOFF_C2_1 0x2206 +#define WM8958_MBC_BAND_2_UPPER_CUTOFF_C2_2 0x2207 +#define WM8958_MBC_BAND_2_UPPER_CUTOFF_C3_1 0x2208 +#define WM8958_MBC_BAND_2_UPPER_CUTOFF_C3_2 0x2209 +#define WM8958_MBC_BAND_2_UPPER_CUTOFF_C1_1 0x220A +#define WM8958_MBC_BAND_2_UPPER_CUTOFF_C1_2 0x220B +#define WM8958_MBC_BAND_1_UPPER_CUTOFF_C1_1 0x220C +#define WM8958_MBC_BAND_1_UPPER_CUTOFF_C1_2 0x220D +#define WM8958_MBC_BAND_1_UPPER_CUTOFF_C2_1 0x220E +#define WM8958_MBC_BAND_1_UPPER_CUTOFF_C2_2 0x220F +#define WM8958_MBC_BAND_1_UPPER_CUTOFF_C3_1 0x2210 +#define WM8958_MBC_BAND_1_UPPER_CUTOFF_C3_2 0x2211 +#define WM8958_MBC_BAND_1_LOWER_CUTOFF_1 0x2212 +#define WM8958_MBC_BAND_1_LOWER_CUTOFF_2 0x2213 +#define WM8958_MBC_BAND_1_K_1 0x2400 +#define WM8958_MBC_BAND_1_K_2 0x2401 +#define WM8958_MBC_BAND_1_N1_1 0x2402 +#define WM8958_MBC_BAND_1_N1_2 0x2403 +#define WM8958_MBC_BAND_1_N2_1 0x2404 +#define WM8958_MBC_BAND_1_N2_2 0x2405 +#define WM8958_MBC_BAND_1_N3_1 0x2406 +#define WM8958_MBC_BAND_1_N3_2 0x2407 +#define WM8958_MBC_BAND_1_N4_1 0x2408 +#define WM8958_MBC_BAND_1_N4_2 0x2409 +#define WM8958_MBC_BAND_1_N5_1 0x240A +#define WM8958_MBC_BAND_1_N5_2 0x240B +#define WM8958_MBC_BAND_1_X1_1 0x240C +#define WM8958_MBC_BAND_1_X1_2 0x240D +#define WM8958_MBC_BAND_1_X2_1 0x240E +#define WM8958_MBC_BAND_1_X2_2 0x240F +#define WM8958_MBC_BAND_1_X3_1 0x2410 +#define WM8958_MBC_BAND_1_X3_2 0x2411 +#define WM8958_MBC_BAND_1_ATTACK_1 0x2412 +#define WM8958_MBC_BAND_1_ATTACK_2 0x2413 +#define WM8958_MBC_BAND_1_DECAY_1 0x2414 +#define WM8958_MBC_BAND_1_DECAY_2 0x2415 +#define WM8958_MBC_BAND_2_K_1 0x2416 +#define WM8958_MBC_BAND_2_K_2 0x2417 +#define WM8958_MBC_BAND_2_N1_1 0x2418 +#define WM8958_MBC_BAND_2_N1_2 0x2419 +#define WM8958_MBC_BAND_2_N2_1 0x241A +#define WM8958_MBC_BAND_2_N2_2 0x241B +#define WM8958_MBC_BAND_2_N3_1 0x241C +#define WM8958_MBC_BAND_2_N3_2 0x241D +#define WM8958_MBC_BAND_2_N4_1 0x241E +#define WM8958_MBC_BAND_2_N4_2 0x241F +#define WM8958_MBC_BAND_2_N5_1 0x2420 +#define WM8958_MBC_BAND_2_N5_2 0x2421 +#define WM8958_MBC_BAND_2_X1_1 0x2422 +#define WM8958_MBC_BAND_2_X1_2 0x2423 +#define WM8958_MBC_BAND_2_X2_1 0x2424 +#define WM8958_MBC_BAND_2_X2_2 0x2425 +#define WM8958_MBC_BAND_2_X3_1 0x2426 +#define 
WM8958_MBC_BAND_2_X3_2 0x2427 +#define WM8958_MBC_BAND_2_ATTACK_1 0x2428 +#define WM8958_MBC_BAND_2_ATTACK_2 0x2429 +#define WM8958_MBC_BAND_2_DECAY_1 0x242A +#define WM8958_MBC_BAND_2_DECAY_2 0x242B +#define WM8958_MBC_B2_PG2_1 0x242C +#define WM8958_MBC_B2_PG2_2 0x242D +#define WM8958_MBC_B1_PG2_1 0x242E +#define WM8958_MBC_B1_PG2_2 0x242F #define WM8994_WRITE_SEQUENCER_0 0x3000 #define WM8994_WRITE_SEQUENCER_1 0x3001 #define WM8994_WRITE_SEQUENCER_2 0x3002 diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 51f5cf16b425..59d361145b15 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -92,6 +92,11 @@ struct wm8994_priv { int retune_mobile_cfg[WM8994_NUM_EQ]; struct soc_enum retune_mobile_enum; + /* Platform dependant MBC configuration */ + int mbc_cfg; + const char **mbc_texts; + struct soc_enum mbc_enum; + struct wm8994_micdet micdet[2]; wm8958_micdet_cb jack_cb; @@ -543,8 +548,9 @@ static const struct soc_enum aif2dacr_src = static void wm8958_mbc_apply(struct snd_soc_codec *codec, int mbc, int start) { struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + struct wm8994_pdata *pdata = wm8994->pdata; int pwr_reg = snd_soc_read(codec, WM8994_POWER_MANAGEMENT_5); - int ena, reg, aif; + int ena, reg, aif, i; switch (mbc) { case 0: @@ -587,7 +593,20 @@ static void wm8958_mbc_apply(struct snd_soc_codec *codec, int mbc, int start) snd_soc_update_bits(codec, WM8958_DSP2_PROGRAM, WM8958_DSP2_ENA, WM8958_DSP2_ENA); - /* TODO: Apply any user specified MBC settings */ + /* If we've got user supplied MBC settings use them */ + if (pdata && pdata->num_mbc_cfgs) { + struct wm8958_mbc_cfg *cfg + = &pdata->mbc_cfgs[wm8994->mbc_cfg]; + + for (i = 0; i < ARRAY_SIZE(cfg->coeff_regs); i++) + snd_soc_write(codec, i + WM8958_MBC_BAND_1_K_1, + cfg->coeff_regs[i]); + + for (i = 0; i < ARRAY_SIZE(cfg->cutoff_regs); i++) + snd_soc_write(codec, + i + WM8958_MBC_BAND_2_LOWER_CUTOFF_C1_1, + cfg->cutoff_regs[i]); + } /* Run the DSP */ snd_soc_write(codec, WM8958_DSP2_EXECCONTROL, @@ -648,6 +667,39 @@ static int wm8958_aif_ev(struct snd_soc_dapm_widget *w, return 0; } +static int wm8958_put_mbc_enum(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + struct wm8994_pdata *pdata = wm8994->pdata; + int value = ucontrol->value.integer.value[0]; + int reg; + + /* Don't allow on the fly reconfiguration */ + reg = snd_soc_read(codec, WM8994_CLOCKING_1); + if (reg < 0 || reg & WM8958_DSP2CLK_ENA) + return -EBUSY; + + if (value >= pdata->num_mbc_cfgs) + return -EINVAL; + + wm8994->mbc_cfg = value; + + return 0; +} + +static int wm8958_get_mbc_enum(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + + ucontrol->value.enumerated.item[0] = wm8994->mbc_cfg; + + return 0; +} + static int wm8958_mbc_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { @@ -2539,6 +2591,34 @@ static void wm8994_handle_pdata(struct wm8994_priv *wm8994) dev_dbg(codec->dev, "%d ReTune Mobile configurations\n", pdata->num_retune_mobile_cfgs); + if (pdata->num_mbc_cfgs) { + struct snd_kcontrol_new control[] = { + SOC_ENUM_EXT("MBC Mode", wm8994->mbc_enum, + wm8958_get_mbc_enum, wm8958_put_mbc_enum), + }; + + /* We need an array of texts for the enum API */ + wm8994->mbc_texts = 
kmalloc(sizeof(char *) + * pdata->num_mbc_cfgs, GFP_KERNEL); + if (!wm8994->mbc_texts) { + dev_err(wm8994->codec->dev, + "Failed to allocate %d MBC config texts\n", + pdata->num_mbc_cfgs); + return; + } + + for (i = 0; i < pdata->num_mbc_cfgs; i++) + wm8994->mbc_texts[i] = pdata->mbc_cfgs[i].name; + + wm8994->mbc_enum.max = pdata->num_mbc_cfgs; + wm8994->mbc_enum.texts = wm8994->mbc_texts; + + ret = snd_soc_add_controls(wm8994->codec, control, 1); + if (ret != 0) + dev_err(wm8994->codec->dev, + "Failed to add MBC mode controls: %d\n", ret); + } + if (pdata->num_retune_mobile_cfgs) wm8994_handle_retune_mobile_pdata(wm8994); else -- cgit v1.2.3-71-gd317 From 8bc3c2c207dc82d47ffc6ef7b788e04ea637d3f1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 30 Nov 2010 14:56:18 +0000 Subject: ASoC: Tune performance of WM8958 revision A Update some of the default configuration for the device to improve the performance. Signed-off-by: Mark Brown Acked-by: Liam Girdwood --- include/linux/mfd/wm8994/registers.h | 9 +++++++++ sound/soc/codecs/wm8994.c | 36 +++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h index ccf3a774fe36..be072faec6f0 100644 --- a/include/linux/mfd/wm8994/registers.h +++ b/include/linux/mfd/wm8994/registers.h @@ -64,6 +64,7 @@ #define WM8994_LDO_1 0x3B #define WM8994_LDO_2 0x3C #define WM8994_CHARGE_PUMP_1 0x4C +#define WM8958_CHARGE_PUMP_2 0x4D #define WM8994_CLASS_W_1 0x51 #define WM8994_DC_SERVO_1 0x54 #define WM8994_DC_SERVO_2 0x55 @@ -1925,6 +1926,14 @@ #define WM8994_CP_ENA_SHIFT 15 /* CP_ENA */ #define WM8994_CP_ENA_WIDTH 1 /* CP_ENA */ +/* + * R77 (0x4D) - Charge Pump (2) + */ +#define WM8958_CP_DISCH 0x8000 /* CP_DISCH */ +#define WM8958_CP_DISCH_MASK 0x8000 /* CP_DISCH */ +#define WM8958_CP_DISCH_SHIFT 15 /* CP_DISCH */ +#define WM8958_CP_DISCH_WIDTH 1 /* CP_DISCH */ + /* * R81 (0x51) - Class W (1) */ diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 59d361145b15..af3a98ae0579 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -1858,15 +1858,33 @@ static int wm8994_set_bias_level(struct snd_soc_codec *codec, if (codec->dapm.bias_level == SND_SOC_BIAS_OFF) { pm_runtime_get_sync(codec->dev); - /* Tweak DC servo and DSP configuration for - * improved performance. */ - if (control->type == WM8994 && wm8994->revision < 4) { - /* Tweak DC servo and DSP configuration for - * improved performance. */ - snd_soc_write(codec, 0x102, 0x3); - snd_soc_write(codec, 0x56, 0x3); - snd_soc_write(codec, 0x817, 0); - snd_soc_write(codec, 0x102, 0); + switch (control->type) { + case WM8994: + if (wm8994->revision < 4) { + /* Tweak DC servo and DSP + * configuration for improved + * performance. 
*/ + snd_soc_write(codec, 0x102, 0x3); + snd_soc_write(codec, 0x56, 0x3); + snd_soc_write(codec, 0x817, 0); + snd_soc_write(codec, 0x102, 0); + } + break; + + case WM8958: + if (wm8994->revision == 0) { + /* Optimise performance for rev A */ + snd_soc_write(codec, 0x102, 0x3); + snd_soc_write(codec, 0xcb, 0x81); + snd_soc_write(codec, 0x817, 0); + snd_soc_write(codec, 0x102, 0); + + snd_soc_update_bits(codec, + WM8958_CHARGE_PUMP_2, + WM8958_CP_DISCH, + WM8958_CP_DISCH); + } + break; } /* Discharge LINEOUT1 & 2 */ -- cgit v1.2.3-71-gd317 From 676dac4b1bee0469d6932f698aeb77e8489f5861 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Thu, 2 Dec 2010 06:43:39 -0800 Subject: This patch adds a new security attribute to Smack called SMACK64EXEC. It defines label that is used while task is running. Exception: in smack_task_wait() child task is checked for write access to parent task using label inherited from the task that forked it. Fixed issues from previous submit: - SMACK64EXEC was not read when SMACK64 was not set. - inode security blob was not updated after setting SMACK64EXEC - inode security blob was not updated when removing SMACK64EXEC --- include/linux/xattr.h | 2 + security/smack/smack.h | 30 +++++++ security/smack/smack_access.c | 4 +- security/smack/smack_lsm.c | 192 +++++++++++++++++++++++++++++++----------- security/smack/smackfs.c | 4 +- 5 files changed, 178 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index f1e5bde4b35a..351c7901d74a 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -40,9 +40,11 @@ #define XATTR_SMACK_SUFFIX "SMACK64" #define XATTR_SMACK_IPIN "SMACK64IPIN" #define XATTR_SMACK_IPOUT "SMACK64IPOUT" +#define XATTR_SMACK_EXEC "SMACK64EXEC" #define XATTR_NAME_SMACK XATTR_SECURITY_PREFIX XATTR_SMACK_SUFFIX #define XATTR_NAME_SMACKIPIN XATTR_SECURITY_PREFIX XATTR_SMACK_IPIN #define XATTR_NAME_SMACKIPOUT XATTR_SECURITY_PREFIX XATTR_SMACK_IPOUT +#define XATTR_NAME_SMACKEXEC XATTR_SECURITY_PREFIX XATTR_SMACK_EXEC #define XATTR_CAPS_SUFFIX "capability" #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX diff --git a/security/smack/smack.h b/security/smack/smack.h index 43ae747a5aa4..a2e2cdfab4ef 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -51,10 +51,16 @@ struct socket_smack { */ struct inode_smack { char *smk_inode; /* label of the fso */ + char *smk_task; /* label of the task */ struct mutex smk_lock; /* initialization lock */ int smk_flags; /* smack inode flags */ }; +struct task_smack { + char *smk_task; /* label used for access control */ + char *smk_forked; /* label when forked */ +}; + #define SMK_INODE_INSTANT 0x01 /* inode is instantiated */ /* @@ -242,6 +248,30 @@ static inline char *smk_of_inode(const struct inode *isp) return sip->smk_inode; } +/* + * Present a pointer to the smack label in an task blob. + */ +static inline char *smk_of_task(const struct task_smack *tsp) +{ + return tsp->smk_task; +} + +/* + * Present a pointer to the forked smack label in an task blob. + */ +static inline char *smk_of_forked(const struct task_smack *tsp) +{ + return tsp->smk_forked; +} + +/* + * Present a pointer to the smack label in the curren task blob. 
+ */ +static inline char *smk_of_current(void) +{ + return smk_of_task(current_security()); +} + /* * logging functions */ diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index f4fac64c4da8..42becbc1ce33 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -185,7 +185,7 @@ out_audit: int smk_curacc(char *obj_label, u32 mode, struct smk_audit_info *a) { int rc; - char *sp = current_security(); + char *sp = smk_of_current(); rc = smk_access(sp, obj_label, mode, NULL); if (rc == 0) @@ -196,7 +196,7 @@ int smk_curacc(char *obj_label, u32 mode, struct smk_audit_info *a) * only one that gets privilege and current does not * have that label. */ - if (smack_onlycap != NULL && smack_onlycap != current->cred->security) + if (smack_onlycap != NULL && smack_onlycap != sp) goto out_audit; if (capable(CAP_MAC_OVERRIDE)) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 04a98c361a65..7e19afe0e738 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -43,7 +43,7 @@ * Returns a pointer to the master list entry for the Smack label * or NULL if there was no label to fetch. */ -static char *smk_fetch(struct inode *ip, struct dentry *dp) +static char *smk_fetch(const char *name, struct inode *ip, struct dentry *dp) { int rc; char in[SMK_LABELLEN]; @@ -51,7 +51,7 @@ static char *smk_fetch(struct inode *ip, struct dentry *dp) if (ip->i_op->getxattr == NULL) return NULL; - rc = ip->i_op->getxattr(dp, XATTR_NAME_SMACK, in, SMK_LABELLEN); + rc = ip->i_op->getxattr(dp, name, in, SMK_LABELLEN); if (rc < 0) return NULL; @@ -103,8 +103,8 @@ static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode) if (rc != 0) return rc; - sp = current_security(); - tsp = task_security(ctp); + sp = smk_of_current(); + tsp = smk_of_task(task_security(ctp)); smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); smk_ad_setfield_u_tsk(&ad, ctp); @@ -138,8 +138,8 @@ static int smack_ptrace_traceme(struct task_struct *ptp) smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); smk_ad_setfield_u_tsk(&ad, ptp); - sp = current_security(); - tsp = task_security(ptp); + sp = smk_of_current(); + tsp = smk_of_task(task_security(ptp)); /* we won't log here, because rc can be overriden */ rc = smk_access(tsp, sp, MAY_READWRITE, NULL); if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE)) @@ -160,7 +160,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) static int smack_syslog(int typefrom_file) { int rc = 0; - char *sp = current_security(); + char *sp = smk_of_current(); if (capable(CAP_MAC_OVERRIDE)) return 0; @@ -390,6 +390,40 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags) return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad); } +/* + * BPRM hooks + */ + +static int smack_bprm_set_creds(struct linux_binprm *bprm) +{ + struct task_smack *tsp = bprm->cred->security; + struct inode_smack *isp; + struct dentry *dp; + int rc; + + rc = cap_bprm_set_creds(bprm); + if (rc != 0) + return rc; + + if (bprm->cred_prepared) + return 0; + + if (bprm->file == NULL || bprm->file->f_dentry == NULL) + return 0; + + dp = bprm->file->f_dentry; + + if (dp->d_inode == NULL) + return 0; + + isp = dp->d_inode->i_security; + + if (isp->smk_task != NULL) + tsp->smk_task = isp->smk_task; + + return 0; +} + /* * Inode hooks */ @@ -402,7 +436,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags) */ static int smack_inode_alloc_security(struct inode *inode) { - inode->i_security = new_inode_smack(current_security()); + 
inode->i_security = new_inode_smack(smk_of_current()); if (inode->i_security == NULL) return -ENOMEM; return 0; @@ -664,7 +698,8 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name, if (strcmp(name, XATTR_NAME_SMACK) == 0 || strcmp(name, XATTR_NAME_SMACKIPIN) == 0 || - strcmp(name, XATTR_NAME_SMACKIPOUT) == 0) { + strcmp(name, XATTR_NAME_SMACKIPOUT) == 0 || + strcmp(name, XATTR_NAME_SMACKEXEC) == 0) { if (!capable(CAP_MAC_ADMIN)) rc = -EPERM; /* @@ -704,9 +739,10 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name, char *nsp; /* - * Not SMACK + * Not SMACK or SMACKEXEC */ - if (strcmp(name, XATTR_NAME_SMACK)) + if (strcmp(name, XATTR_NAME_SMACK) && + strcmp(name, XATTR_NAME_SMACKEXEC)) return; isp = dentry->d_inode->i_security; @@ -716,10 +752,18 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name, * assignment. */ nsp = smk_import(value, size); - if (nsp != NULL) - isp->smk_inode = nsp; - else - isp->smk_inode = smack_known_invalid.smk_known; + + if (strcmp(name, XATTR_NAME_SMACK) == 0) { + if (nsp != NULL) + isp->smk_inode = nsp; + else + isp->smk_inode = smack_known_invalid.smk_known; + } else { + if (nsp != NULL) + isp->smk_task = nsp; + else + isp->smk_task = smack_known_invalid.smk_known; + } return; } @@ -752,12 +796,14 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name) */ static int smack_inode_removexattr(struct dentry *dentry, const char *name) { + struct inode_smack *isp; struct smk_audit_info ad; int rc = 0; if (strcmp(name, XATTR_NAME_SMACK) == 0 || strcmp(name, XATTR_NAME_SMACKIPIN) == 0 || - strcmp(name, XATTR_NAME_SMACKIPOUT) == 0) { + strcmp(name, XATTR_NAME_SMACKIPOUT) == 0 || + strcmp(name, XATTR_NAME_SMACKEXEC) == 0) { if (!capable(CAP_MAC_ADMIN)) rc = -EPERM; } else @@ -768,6 +814,11 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name) if (rc == 0) rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad); + if (rc == 0) { + isp = dentry->d_inode->i_security; + isp->smk_task = NULL; + } + return rc; } @@ -895,7 +946,7 @@ static int smack_file_permission(struct file *file, int mask) */ static int smack_file_alloc_security(struct file *file) { - file->f_security = current_security(); + file->f_security = smk_of_current(); return 0; } @@ -1005,7 +1056,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, */ static int smack_file_set_fowner(struct file *file) { - file->f_security = current_security(); + file->f_security = smk_of_current(); return 0; } @@ -1025,7 +1076,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, { struct file *file; int rc; - char *tsp = tsk->cred->security; + char *tsp = smk_of_task(tsk->cred->security); struct smk_audit_info ad; /* @@ -1082,7 +1133,9 @@ static int smack_file_receive(struct file *file) */ static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp) { - cred->security = NULL; + cred->security = kzalloc(sizeof(struct task_smack), gfp); + if (cred->security == NULL) + return -ENOMEM; return 0; } @@ -1097,7 +1150,7 @@ static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp) */ static void smack_cred_free(struct cred *cred) { - cred->security = NULL; + kfree(cred->security); } /** @@ -1111,7 +1164,16 @@ static void smack_cred_free(struct cred *cred) static int smack_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { - new->security = old->security; + struct task_smack *old_tsp = old->security; + struct task_smack *new_tsp; + + new_tsp = 
kzalloc(sizeof(struct task_smack), gfp); + if (new_tsp == NULL) + return -ENOMEM; + + new_tsp->smk_task = old_tsp->smk_task; + new_tsp->smk_forked = old_tsp->smk_task; + new->security = new_tsp; return 0; } @@ -1124,7 +1186,11 @@ static int smack_cred_prepare(struct cred *new, const struct cred *old, */ static void smack_cred_transfer(struct cred *new, const struct cred *old) { - new->security = old->security; + struct task_smack *old_tsp = old->security; + struct task_smack *new_tsp = new->security; + + new_tsp->smk_task = old_tsp->smk_task; + new_tsp->smk_forked = old_tsp->smk_task; } /** @@ -1136,12 +1202,13 @@ static void smack_cred_transfer(struct cred *new, const struct cred *old) */ static int smack_kernel_act_as(struct cred *new, u32 secid) { + struct task_smack *new_tsp = new->security; char *smack = smack_from_secid(secid); if (smack == NULL) return -EINVAL; - new->security = smack; + new_tsp->smk_task = smack; return 0; } @@ -1157,8 +1224,10 @@ static int smack_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_smack *isp = inode->i_security; + struct task_smack *tsp = new->security; - new->security = isp->smk_inode; + tsp->smk_forked = isp->smk_inode; + tsp->smk_task = isp->smk_inode; return 0; } @@ -1175,7 +1244,7 @@ static int smk_curacc_on_task(struct task_struct *p, int access) smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); smk_ad_setfield_u_tsk(&ad, p); - return smk_curacc(task_security(p), access, &ad); + return smk_curacc(smk_of_task(task_security(p)), access, &ad); } /** @@ -1221,7 +1290,7 @@ static int smack_task_getsid(struct task_struct *p) */ static void smack_task_getsecid(struct task_struct *p, u32 *secid) { - *secid = smack_to_secid(task_security(p)); + *secid = smack_to_secid(smk_of_task(task_security(p))); } /** @@ -1333,14 +1402,15 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, * can write the receiver. */ if (secid == 0) - return smk_curacc(task_security(p), MAY_WRITE, &ad); + return smk_curacc(smk_of_task(task_security(p)), MAY_WRITE, + &ad); /* * If the secid isn't 0 we're dealing with some USB IO * specific behavior. This is not clean. For one thing * we can't take privilege into account. 
*/ - return smk_access(smack_from_secid(secid), task_security(p), - MAY_WRITE, &ad); + return smk_access(smack_from_secid(secid), + smk_of_task(task_security(p)), MAY_WRITE, &ad); } /** @@ -1352,12 +1422,12 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, static int smack_task_wait(struct task_struct *p) { struct smk_audit_info ad; - char *sp = current_security(); - char *tsp = task_security(p); + char *sp = smk_of_current(); + char *tsp = smk_of_forked(task_security(p)); int rc; /* we don't log here, we can be overriden */ - rc = smk_access(sp, tsp, MAY_WRITE, NULL); + rc = smk_access(tsp, sp, MAY_WRITE, NULL); if (rc == 0) goto out_log; @@ -1378,7 +1448,7 @@ static int smack_task_wait(struct task_struct *p) out_log: smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); smk_ad_setfield_u_tsk(&ad, p); - smack_log(sp, tsp, MAY_WRITE, rc, &ad); + smack_log(tsp, sp, MAY_WRITE, rc, &ad); return rc; } @@ -1392,7 +1462,7 @@ static int smack_task_wait(struct task_struct *p) static void smack_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_smack *isp = inode->i_security; - isp->smk_inode = task_security(p); + isp->smk_inode = smk_of_task(task_security(p)); } /* @@ -1411,7 +1481,7 @@ static void smack_task_to_inode(struct task_struct *p, struct inode *inode) */ static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags) { - char *csp = current_security(); + char *csp = smk_of_current(); struct socket_smack *ssp; ssp = kzalloc(sizeof(struct socket_smack), gfp_flags); @@ -1752,7 +1822,7 @@ static int smack_flags_to_may(int flags) */ static int smack_msg_msg_alloc_security(struct msg_msg *msg) { - msg->security = current_security(); + msg->security = smk_of_current(); return 0; } @@ -1788,7 +1858,7 @@ static int smack_shm_alloc_security(struct shmid_kernel *shp) { struct kern_ipc_perm *isp = &shp->shm_perm; - isp->security = current_security(); + isp->security = smk_of_current(); return 0; } @@ -1911,7 +1981,7 @@ static int smack_sem_alloc_security(struct sem_array *sma) { struct kern_ipc_perm *isp = &sma->sem_perm; - isp->security = current_security(); + isp->security = smk_of_current(); return 0; } @@ -2029,7 +2099,7 @@ static int smack_msg_queue_alloc_security(struct msg_queue *msq) { struct kern_ipc_perm *kisp = &msq->q_perm; - kisp->security = current_security(); + kisp->security = smk_of_current(); return 0; } @@ -2201,7 +2271,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) struct super_block *sbp; struct superblock_smack *sbsp; struct inode_smack *isp; - char *csp = current_security(); + char *csp = smk_of_current(); char *fetched; char *final; struct dentry *dp; @@ -2321,9 +2391,12 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) * Get the dentry for xattr. 
*/ dp = dget(opt_dentry); - fetched = smk_fetch(inode, dp); + fetched = smk_fetch(XATTR_NAME_SMACK, inode, dp); if (fetched != NULL) final = fetched; + isp->smk_task = smk_fetch(XATTR_NAME_SMACKEXEC, inode, + dp); + dput(dp); break; } @@ -2358,7 +2431,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) if (strcmp(name, "current") != 0) return -EINVAL; - cp = kstrdup(task_security(p), GFP_KERNEL); + cp = kstrdup(smk_of_task(task_security(p)), GFP_KERNEL); if (cp == NULL) return -ENOMEM; @@ -2382,6 +2455,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) static int smack_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { + struct task_smack *tsp; struct cred *new; char *newsmack; @@ -2414,7 +2488,13 @@ static int smack_setprocattr(struct task_struct *p, char *name, new = prepare_creds(); if (new == NULL) return -ENOMEM; - new->security = newsmack; + tsp = kzalloc(sizeof(struct task_smack), GFP_KERNEL); + if (tsp == NULL) { + kfree(new); + return -ENOMEM; + } + tsp->smk_task = newsmack; + new->security = tsp; commit_creds(new); return size; } @@ -2715,7 +2795,7 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent) return; ssp = sk->sk_security; - ssp->smk_in = ssp->smk_out = current_security(); + ssp->smk_in = ssp->smk_out = smk_of_current(); /* cssp->smk_packet is already set in smack_inet_csk_clone() */ } @@ -2836,7 +2916,7 @@ static void smack_inet_csk_clone(struct sock *sk, static int smack_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { - key->security = cred->security; + key->security = smk_of_task(cred->security); return 0; } @@ -2865,6 +2945,7 @@ static int smack_key_permission(key_ref_t key_ref, { struct key *keyp; struct smk_audit_info ad; + char *tsp = smk_of_task(cred->security); keyp = key_ref_to_ptr(key_ref); if (keyp == NULL) @@ -2878,14 +2959,14 @@ static int smack_key_permission(key_ref_t key_ref, /* * This should not occur */ - if (cred->security == NULL) + if (tsp == NULL) return -EACCES; #ifdef CONFIG_AUDIT smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY); ad.a.u.key_struct.key = keyp->serial; ad.a.u.key_struct.key_desc = keyp->description; #endif - return smk_access(cred->security, keyp->security, + return smk_access(tsp, keyp->security, MAY_READWRITE, &ad); } #endif /* CONFIG_KEYS */ @@ -3087,6 +3168,8 @@ struct security_operations smack_ops = { .sb_mount = smack_sb_mount, .sb_umount = smack_sb_umount, + .bprm_set_creds = smack_bprm_set_creds, + .inode_alloc_security = smack_inode_alloc_security, .inode_free_security = smack_inode_free_security, .inode_init_security = smack_inode_init_security, @@ -3223,9 +3306,16 @@ static __init void init_smack_know_list(void) static __init int smack_init(void) { struct cred *cred; + struct task_smack *tsp; - if (!security_module_enable(&smack_ops)) + tsp = kzalloc(sizeof(struct task_smack), GFP_KERNEL); + if (tsp == NULL) + return -ENOMEM; + + if (!security_module_enable(&smack_ops)) { + kfree(tsp); return 0; + } printk(KERN_INFO "Smack: Initializing.\n"); @@ -3233,7 +3323,9 @@ static __init int smack_init(void) * Set the security state for the initial task. 
*/ cred = (struct cred *) current->cred; - cred->security = &smack_known_floor.smk_known; + tsp->smk_forked = smack_known_floor.smk_known; + tsp->smk_task = smack_known_floor.smk_known; + cred->security = tsp; /* initialize the smack_know_list */ init_smack_know_list(); diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index dc1fd6239f24..01a0be93d8d0 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -121,7 +121,7 @@ static void smk_netlabel_audit_set(struct netlbl_audit *nap) { nap->loginuid = audit_get_loginuid(current); nap->sessionid = audit_get_sessionid(current); - nap->secid = smack_to_secid(current_security()); + nap->secid = smack_to_secid(smk_of_current()); } /* @@ -1160,7 +1160,7 @@ static ssize_t smk_write_onlycap(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char in[SMK_LABELLEN]; - char *sp = current->cred->security; + char *sp = smk_of_task(current->cred->security); if (!capable(CAP_MAC_ADMIN)) return -EPERM; -- cgit v1.2.3-71-gd317 From f689b34bfbd2154a8fa255060dd872a6db3b4742 Mon Sep 17 00:00:00 2001 From: Miloslav Trmač Date: Fri, 3 Dec 2010 13:51:52 +0800 Subject: include: Install linux/if_alg.h for user-space crypto API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Miloslav Trmač Signed-off-by: Herbert Xu --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 831c4634162c..b3cca8c88e86 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -154,6 +154,7 @@ header-y += icmpv6.h header-y += if.h header-y += if_addr.h header-y += if_addrlabel.h +header-y += if_alg.h header-y += if_arcnet.h header-y += if_arp.h header-y += if_bonding.h -- cgit v1.2.3-71-gd317 From 3e3198f1adda8e0fbd499bde806781237d6c841f Mon Sep 17 00:00:00 2001 From: Roman Tereshonkov Date: Wed, 3 Nov 2010 12:55:19 +0200 Subject: mtd: onenand: add option and variable for cache program feature A new option is added for cache program feature. A new variable ongoing is introduced for onenand_chip to handle the multi-command cache program operation. Signed-off-by: Roman Tereshonkov Acked-by: Kyungmin Park Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/onenand.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 0c8815bfae1c..6da3fe314828 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -137,6 +137,14 @@ struct onenand_chip { void *bbm; void *priv; + + /* + * Shows that the current operation is composed + * of sequence of commands. For example, cache program. + * Such command status OnGo bit is checked at the end of + * sequence. 
+ */ + unsigned int ongoing; }; /* @@ -171,6 +179,9 @@ struct onenand_chip { #define ONENAND_IS_2PLANE(this) (0) #endif +#define ONENAND_IS_CACHE_PROGRAM(this) \ + (this->options & ONENAND_HAS_CACHE_PROGRAM) + /* Check byte access in OneNAND */ #define ONENAND_CHECK_BYTE_ACCESS(addr) (addr & 0x1) @@ -181,6 +192,7 @@ struct onenand_chip { #define ONENAND_HAS_UNLOCK_ALL (0x0002) #define ONENAND_HAS_2PLANE (0x0004) #define ONENAND_HAS_4KB_PAGE (0x0008) +#define ONENAND_HAS_CACHE_PROGRAM (0x0010) #define ONENAND_SKIP_UNLOCK_CHECK (0x0100) #define ONENAND_PAGEBUF_ALLOC (0x1000) #define ONENAND_OOBBUF_ALLOC (0x2000) -- cgit v1.2.3-71-gd317 From 65ab220c30bc2120eaa39753cadec68616e3f906 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 8 Nov 2010 11:17:54 +0530 Subject: mtd: fsmc.h: including linux/io.h for definition of readl Signed-off-by: Viresh Kumar Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/fsmc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h index 5d2556700ec2..e210b87e6cd6 100644 --- a/include/linux/mtd/fsmc.h +++ b/include/linux/mtd/fsmc.h @@ -16,6 +16,7 @@ #ifndef __MTD_FSMC_H #define __MTD_FSMC_H +#include #include #include #include -- cgit v1.2.3-71-gd317 From cc31822250236ec173bb2aa149ebe2ba35405db2 Mon Sep 17 00:00:00 2001 From: Guillaume LECERF Date: Wed, 17 Nov 2010 12:35:50 +0100 Subject: mtd: cfi_fixup: remove unused 'param' parameter The 'param' parameter has never been used since its introduction, so simply remove it. Signed-off-by: Guillaume LECERF Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/chips/cfi_cmdset_0001.c | 54 ++++++++++----------- drivers/mtd/chips/cfi_cmdset_0002.c | 94 ++++++++++++++++++------------------- drivers/mtd/chips/cfi_util.c | 2 +- drivers/mtd/chips/fwh_lock.h | 2 +- include/linux/mtd/cfi.h | 3 +- 5 files changed, 77 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index ad9268b44416..44cbfc093ecc 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -162,7 +162,7 @@ static void cfi_tell_features(struct cfi_pri_intelext *extp) #endif /* Atmel chips don't use the same PRI format as Intel chips */ -static void fixup_convert_atmel_pri(struct mtd_info *mtd, void *param) +static void fixup_convert_atmel_pri(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -202,7 +202,7 @@ static void fixup_convert_atmel_pri(struct mtd_info *mtd, void *param) cfi->cfiq->BufWriteTimeoutMax = 0; } -static void fixup_at49bv640dx_lock(struct mtd_info *mtd, void *param) +static void fixup_at49bv640dx_lock(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -214,7 +214,7 @@ static void fixup_at49bv640dx_lock(struct mtd_info *mtd, void *param) #ifdef CMDSET0001_DISABLE_ERASE_SUSPEND_ON_WRITE /* Some Intel Strata Flash prior to FPO revision C has bugs in this area */ -static void fixup_intel_strataflash(struct mtd_info *mtd, void* param) +static void fixup_intel_strataflash(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -227,7 +227,7 @@ static void fixup_intel_strataflash(struct mtd_info *mtd, void* param) #endif #ifdef CMDSET0001_DISABLE_WRITE_SUSPEND -static void fixup_no_write_suspend(struct mtd_info *mtd, void* param) 
+static void fixup_no_write_suspend(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -240,7 +240,7 @@ static void fixup_no_write_suspend(struct mtd_info *mtd, void* param) } #endif -static void fixup_st_m28w320ct(struct mtd_info *mtd, void* param) +static void fixup_st_m28w320ct(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -249,7 +249,7 @@ static void fixup_st_m28w320ct(struct mtd_info *mtd, void* param) cfi->cfiq->BufWriteTimeoutMax = 0; /* Not supported */ } -static void fixup_st_m28w320cb(struct mtd_info *mtd, void* param) +static void fixup_st_m28w320cb(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -259,7 +259,7 @@ static void fixup_st_m28w320cb(struct mtd_info *mtd, void* param) (cfi->cfiq->EraseRegionInfo[1] & 0xffff0000) | 0x3e; }; -static void fixup_use_point(struct mtd_info *mtd, void *param) +static void fixup_use_point(struct mtd_info *mtd) { struct map_info *map = mtd->priv; if (!mtd->point && map_is_linear(map)) { @@ -268,7 +268,7 @@ static void fixup_use_point(struct mtd_info *mtd, void *param) } } -static void fixup_use_write_buffers(struct mtd_info *mtd, void *param) +static void fixup_use_write_buffers(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -282,7 +282,7 @@ static void fixup_use_write_buffers(struct mtd_info *mtd, void *param) /* * Some chips power-up with all sectors locked by default. */ -static void fixup_unlock_powerup_lock(struct mtd_info *mtd, void *param) +static void fixup_unlock_powerup_lock(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -295,31 +295,31 @@ static void fixup_unlock_powerup_lock(struct mtd_info *mtd, void *param) } static struct cfi_fixup cfi_fixup_table[] = { - { CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri, NULL }, - { CFI_MFR_ATMEL, AT49BV640D, fixup_at49bv640dx_lock, NULL }, - { CFI_MFR_ATMEL, AT49BV640DT, fixup_at49bv640dx_lock, NULL }, + { CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri }, + { CFI_MFR_ATMEL, AT49BV640D, fixup_at49bv640dx_lock }, + { CFI_MFR_ATMEL, AT49BV640DT, fixup_at49bv640dx_lock }, #ifdef CMDSET0001_DISABLE_ERASE_SUSPEND_ON_WRITE - { CFI_MFR_ANY, CFI_ID_ANY, fixup_intel_strataflash, NULL }, + { CFI_MFR_ANY, CFI_ID_ANY, fixup_intel_strataflash }, #endif #ifdef CMDSET0001_DISABLE_WRITE_SUSPEND - { CFI_MFR_ANY, CFI_ID_ANY, fixup_no_write_suspend, NULL }, + { CFI_MFR_ANY, CFI_ID_ANY, fixup_no_write_suspend }, #endif #if !FORCE_WORD_WRITE - { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_write_buffers, NULL }, + { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_write_buffers }, #endif - { CFI_MFR_ST, 0x00ba, /* M28W320CT */ fixup_st_m28w320ct, NULL }, - { CFI_MFR_ST, 0x00bb, /* M28W320CB */ fixup_st_m28w320cb, NULL }, - { CFI_MFR_INTEL, CFI_ID_ANY, fixup_unlock_powerup_lock, NULL, }, - { 0, 0, NULL, NULL } + { CFI_MFR_ST, 0x00ba, /* M28W320CT */ fixup_st_m28w320ct }, + { CFI_MFR_ST, 0x00bb, /* M28W320CB */ fixup_st_m28w320cb }, + { CFI_MFR_INTEL, CFI_ID_ANY, fixup_unlock_powerup_lock }, + { 0, 0, NULL } }; static struct cfi_fixup jedec_fixup_table[] = { - { CFI_MFR_INTEL, I82802AB, fixup_use_fwh_lock, NULL, }, - { CFI_MFR_INTEL, I82802AC, fixup_use_fwh_lock, NULL, }, - { CFI_MFR_ST, M50LPW080, fixup_use_fwh_lock, NULL, }, - { CFI_MFR_ST, M50FLW080A, fixup_use_fwh_lock, NULL, }, - { CFI_MFR_ST, M50FLW080B, fixup_use_fwh_lock, NULL, }, - { 0, 0, NULL, NULL 
} + { CFI_MFR_INTEL, I82802AB, fixup_use_fwh_lock }, + { CFI_MFR_INTEL, I82802AC, fixup_use_fwh_lock }, + { CFI_MFR_ST, M50LPW080, fixup_use_fwh_lock }, + { CFI_MFR_ST, M50FLW080A, fixup_use_fwh_lock }, + { CFI_MFR_ST, M50FLW080B, fixup_use_fwh_lock }, + { 0, 0, NULL } }; static struct cfi_fixup fixup_table[] = { /* The CFI vendor ids and the JEDEC vendor IDs appear @@ -327,8 +327,8 @@ static struct cfi_fixup fixup_table[] = { * well. This table is to pick all cases where * we know that is the case. */ - { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_point, NULL }, - { 0, 0, NULL, NULL } + { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_point }, + { 0, 0, NULL } }; static void cfi_fixup_major_minor(struct cfi_private *cfi, diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index 3b8e32d87977..9d68ab919f31 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -134,7 +134,7 @@ static void cfi_tell_features(struct cfi_pri_amdstd *extp) #ifdef AMD_BOOTLOC_BUG /* Wheee. Bring me the head of someone at AMD. */ -static void fixup_amd_bootblock(struct mtd_info *mtd, void* param) +static void fixup_amd_bootblock(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -186,7 +186,7 @@ static void fixup_amd_bootblock(struct mtd_info *mtd, void* param) } #endif -static void fixup_use_write_buffers(struct mtd_info *mtd, void *param) +static void fixup_use_write_buffers(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -197,7 +197,7 @@ static void fixup_use_write_buffers(struct mtd_info *mtd, void *param) } /* Atmel chips don't use the same PRI format as AMD chips */ -static void fixup_convert_atmel_pri(struct mtd_info *mtd, void *param) +static void fixup_convert_atmel_pri(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -228,14 +228,14 @@ static void fixup_convert_atmel_pri(struct mtd_info *mtd, void *param) cfi->cfiq->BufWriteTimeoutMax = 0; } -static void fixup_use_secsi(struct mtd_info *mtd, void *param) +static void fixup_use_secsi(struct mtd_info *mtd) { /* Setup for chips with a secsi area */ mtd->read_user_prot_reg = cfi_amdstd_secsi_read; mtd->read_fact_prot_reg = cfi_amdstd_secsi_read; } -static void fixup_use_erase_chip(struct mtd_info *mtd, void *param) +static void fixup_use_erase_chip(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -250,7 +250,7 @@ static void fixup_use_erase_chip(struct mtd_info *mtd, void *param) * Some Atmel chips (e.g. the AT49BV6416) power-up with all sectors * locked by default. 
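(Sketch of the new convention; the chip ID and the tweak itself are invented. A fixup now takes only the mtd pointer, and table entries drop the trailing param column:)

	static void fixup_example_chip(struct mtd_info *mtd)
	{
		struct map_info *map = mtd->priv;
		struct cfi_private *cfi = map->fldrv_priv;

		/* hypothetical quirk: buffered-write timeout not reported */
		cfi->cfiq->BufWriteTimeoutMax = 0;
	}

	static struct cfi_fixup example_fixup_table[] = {
		{ CFI_MFR_ANY, CFI_ID_ANY, fixup_example_chip },
		{ 0, 0, NULL }
	};

cfi_fixup() walks such a table and calls f->fixup(mtd) for every matching mfr/id pair, as the cfi_util.c hunk further down shows.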
*/ -static void fixup_use_atmel_lock(struct mtd_info *mtd, void *param) +static void fixup_use_atmel_lock(struct mtd_info *mtd) { mtd->lock = cfi_atmel_lock; mtd->unlock = cfi_atmel_unlock; @@ -271,7 +271,7 @@ static void fixup_old_sst_eraseregion(struct mtd_info *mtd) cfi->cfiq->NumEraseRegions = 1; } -static void fixup_sst39vf(struct mtd_info *mtd, void *param) +static void fixup_sst39vf(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -282,7 +282,7 @@ static void fixup_sst39vf(struct mtd_info *mtd, void *param) cfi->addr_unlock2 = 0x2AAA; } -static void fixup_sst39vf_rev_b(struct mtd_info *mtd, void *param) +static void fixup_sst39vf_rev_b(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -295,12 +295,12 @@ static void fixup_sst39vf_rev_b(struct mtd_info *mtd, void *param) cfi->sector_erase_cmd = CMD(0x50); } -static void fixup_sst38vf640x_sectorsize(struct mtd_info *mtd, void *param) +static void fixup_sst38vf640x_sectorsize(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; - fixup_sst39vf_rev_b(mtd, param); + fixup_sst39vf_rev_b(mtd); /* * CFI reports 1024 sectors (0x03ff+1) of 64KBytes (0x0100*256) where @@ -310,7 +310,7 @@ static void fixup_sst38vf640x_sectorsize(struct mtd_info *mtd, void *param) pr_warning("%s: Bad 38VF640x CFI data; adjusting sector size from 64 to 8KiB\n", mtd->name); } -static void fixup_s29gl064n_sectors(struct mtd_info *mtd, void *param) +static void fixup_s29gl064n_sectors(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -321,7 +321,7 @@ static void fixup_s29gl064n_sectors(struct mtd_info *mtd, void *param) } } -static void fixup_s29gl032n_sectors(struct mtd_info *mtd, void *param) +static void fixup_s29gl032n_sectors(struct mtd_info *mtd) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -334,47 +334,47 @@ static void fixup_s29gl032n_sectors(struct mtd_info *mtd, void *param) /* Used to fix CFI-Tables of chips without Extended Query Tables */ static struct cfi_fixup cfi_nopri_fixup_table[] = { - { CFI_MFR_SST, 0x234A, fixup_sst39vf, NULL, }, /* SST39VF1602 */ - { CFI_MFR_SST, 0x234B, fixup_sst39vf, NULL, }, /* SST39VF1601 */ - { CFI_MFR_SST, 0x235A, fixup_sst39vf, NULL, }, /* SST39VF3202 */ - { CFI_MFR_SST, 0x235B, fixup_sst39vf, NULL, }, /* SST39VF3201 */ - { CFI_MFR_SST, 0x235C, fixup_sst39vf_rev_b, NULL, }, /* SST39VF3202B */ - { CFI_MFR_SST, 0x235D, fixup_sst39vf_rev_b, NULL, }, /* SST39VF3201B */ - { CFI_MFR_SST, 0x236C, fixup_sst39vf_rev_b, NULL, }, /* SST39VF6402B */ - { CFI_MFR_SST, 0x236D, fixup_sst39vf_rev_b, NULL, }, /* SST39VF6401B */ - { 0, 0, NULL, NULL } + { CFI_MFR_SST, 0x234a, fixup_sst39vf }, /* SST39VF1602 */ + { CFI_MFR_SST, 0x234b, fixup_sst39vf }, /* SST39VF1601 */ + { CFI_MFR_SST, 0x235a, fixup_sst39vf }, /* SST39VF3202 */ + { CFI_MFR_SST, 0x235b, fixup_sst39vf }, /* SST39VF3201 */ + { CFI_MFR_SST, 0x235c, fixup_sst39vf_rev_b }, /* SST39VF3202B */ + { CFI_MFR_SST, 0x235d, fixup_sst39vf_rev_b }, /* SST39VF3201B */ + { CFI_MFR_SST, 0x236c, fixup_sst39vf_rev_b }, /* SST39VF6402B */ + { CFI_MFR_SST, 0x236d, fixup_sst39vf_rev_b }, /* SST39VF6401B */ + { 0, 0, NULL } }; static struct cfi_fixup cfi_fixup_table[] = { - { CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri, NULL }, + { CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri }, #ifdef AMD_BOOTLOC_BUG - { CFI_MFR_AMD, CFI_ID_ANY, 
fixup_amd_bootblock, NULL }, - { CFI_MFR_MACRONIX, CFI_ID_ANY, fixup_amd_bootblock, NULL }, + { CFI_MFR_AMD, CFI_ID_ANY, fixup_amd_bootblock }, + { CFI_MFR_MACRONIX, CFI_ID_ANY, fixup_amd_bootblock }, #endif - { CFI_MFR_AMD, 0x0050, fixup_use_secsi, NULL, }, - { CFI_MFR_AMD, 0x0053, fixup_use_secsi, NULL, }, - { CFI_MFR_AMD, 0x0055, fixup_use_secsi, NULL, }, - { CFI_MFR_AMD, 0x0056, fixup_use_secsi, NULL, }, - { CFI_MFR_AMD, 0x005C, fixup_use_secsi, NULL, }, - { CFI_MFR_AMD, 0x005F, fixup_use_secsi, NULL, }, - { CFI_MFR_AMD, 0x0c01, fixup_s29gl064n_sectors, NULL, }, - { CFI_MFR_AMD, 0x1301, fixup_s29gl064n_sectors, NULL, }, - { CFI_MFR_AMD, 0x1a00, fixup_s29gl032n_sectors, NULL, }, - { CFI_MFR_AMD, 0x1a01, fixup_s29gl032n_sectors, NULL, }, - { CFI_MFR_SST, 0x536A, fixup_sst38vf640x_sectorsize, NULL, }, /* SST38VF6402 */ - { CFI_MFR_SST, 0x536B, fixup_sst38vf640x_sectorsize, NULL, }, /* SST38VF6401 */ - { CFI_MFR_SST, 0x536C, fixup_sst38vf640x_sectorsize, NULL, }, /* SST38VF6404 */ - { CFI_MFR_SST, 0x536D, fixup_sst38vf640x_sectorsize, NULL, }, /* SST38VF6403 */ + { CFI_MFR_AMD, 0x0050, fixup_use_secsi }, + { CFI_MFR_AMD, 0x0053, fixup_use_secsi }, + { CFI_MFR_AMD, 0x0055, fixup_use_secsi }, + { CFI_MFR_AMD, 0x0056, fixup_use_secsi }, + { CFI_MFR_AMD, 0x005C, fixup_use_secsi }, + { CFI_MFR_AMD, 0x005F, fixup_use_secsi }, + { CFI_MFR_AMD, 0x0c01, fixup_s29gl064n_sectors }, + { CFI_MFR_AMD, 0x1301, fixup_s29gl064n_sectors }, + { CFI_MFR_AMD, 0x1a00, fixup_s29gl032n_sectors }, + { CFI_MFR_AMD, 0x1a01, fixup_s29gl032n_sectors }, + { CFI_MFR_SST, 0x536a, fixup_sst38vf640x_sectorsize }, /* SST38VF6402 */ + { CFI_MFR_SST, 0x536b, fixup_sst38vf640x_sectorsize }, /* SST38VF6401 */ + { CFI_MFR_SST, 0x536c, fixup_sst38vf640x_sectorsize }, /* SST38VF6404 */ + { CFI_MFR_SST, 0x536d, fixup_sst38vf640x_sectorsize }, /* SST38VF6403 */ #if !FORCE_WORD_WRITE - { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_write_buffers, NULL, }, + { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_write_buffers }, #endif - { 0, 0, NULL, NULL } + { 0, 0, NULL } }; static struct cfi_fixup jedec_fixup_table[] = { - { CFI_MFR_SST, SST49LF004B, fixup_use_fwh_lock, NULL, }, - { CFI_MFR_SST, SST49LF040B, fixup_use_fwh_lock, NULL, }, - { CFI_MFR_SST, SST49LF008A, fixup_use_fwh_lock, NULL, }, - { 0, 0, NULL, NULL } + { CFI_MFR_SST, SST49LF004B, fixup_use_fwh_lock }, + { CFI_MFR_SST, SST49LF040B, fixup_use_fwh_lock }, + { CFI_MFR_SST, SST49LF008A, fixup_use_fwh_lock }, + { 0, 0, NULL } }; static struct cfi_fixup fixup_table[] = { @@ -383,9 +383,9 @@ static struct cfi_fixup fixup_table[] = { * well. This table is to pick all cases where * we know that is the case. 
*/ - { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_erase_chip, NULL }, - { CFI_MFR_ATMEL, AT49BV6416, fixup_use_atmel_lock, NULL }, - { 0, 0, NULL, NULL } + { CFI_MFR_ANY, CFI_ID_ANY, fixup_use_erase_chip }, + { CFI_MFR_ATMEL, AT49BV6416, fixup_use_atmel_lock }, + { 0, 0, NULL } }; diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c index 360525c637d2..6ae3d111e1e7 100644 --- a/drivers/mtd/chips/cfi_util.c +++ b/drivers/mtd/chips/cfi_util.c @@ -156,7 +156,7 @@ void cfi_fixup(struct mtd_info *mtd, struct cfi_fixup *fixups) for (f=fixups; f->fixup; f++) { if (((f->mfr == CFI_MFR_ANY) || (f->mfr == cfi->mfr)) && ((f->id == CFI_ID_ANY) || (f->id == cfi->id))) { - f->fixup(mtd, f->param); + f->fixup(mtd); } } } diff --git a/drivers/mtd/chips/fwh_lock.h b/drivers/mtd/chips/fwh_lock.h index d18064977192..5e3cc80128aa 100644 --- a/drivers/mtd/chips/fwh_lock.h +++ b/drivers/mtd/chips/fwh_lock.h @@ -98,7 +98,7 @@ static int fwh_unlock_varsize(struct mtd_info *mtd, loff_t ofs, uint64_t len) return ret; } -static void fixup_use_fwh_lock(struct mtd_info *mtd, void *param) +static void fixup_use_fwh_lock(struct mtd_info *mtd) { printk(KERN_NOTICE "using fwh lock/unlock method\n"); /* Setup for the chips with the fwh lock method */ diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 4dd0c2cd7659..a9baee6864af 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -527,8 +527,7 @@ struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t s struct cfi_fixup { uint16_t mfr; uint16_t id; - void (*fixup)(struct mtd_info *mtd, void* param); - void* param; + void (*fixup)(struct mtd_info *mtd); }; #define CFI_MFR_ANY 0xFFFF -- cgit v1.2.3-71-gd317 From 1534b8b09757190ce6e97fa97f9ad77c49082cd8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Nov 2010 15:02:21 -0800 Subject: mtd: fix nand kernel-doc warnings Warning(include/linux/mtd/nand.h:543): No description found for parameter 'badblockbits' Warning(drivers/mtd/nand/nand_bbt.c:1101): No description found for parameter 'mtd' Signed-off-by: Randy Dunlap Cc: David Woodhouse Cc: linux-mtd@lists.infradead.org Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_bbt.c | 3 ++- include/linux/mtd/nand.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 586b981f0e61..6ebd869993aa 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -1092,7 +1092,8 @@ static void mark_bbt_region(struct mtd_info *mtd, struct nand_bbt_descr *td) /** * verify_bbt_descr - verify the bad block description - * @bd: the table to verify + * @mtd: MTD device structure + * @bd: the table to verify * * This functions performs a few sanity checks on the bad block description * table. diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 63e17d01fde9..1f489b247a29 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -448,6 +448,8 @@ struct nand_buffers { * See the defines for further explanation. * @badblockpos: [INTERN] position of the bad block marker in the oob * area. 
+ * @badblockbits: [INTERN] number of bits to left-shift the bad block + * number * @cellinfo: [INTERN] MLC/multichip data from chip ident * @numchips: [INTERN] number of physical chips * @chipsize: [INTERN] the size of one chip for multichip arrays -- cgit v1.2.3-71-gd317 From a7e93dcd9aacb3ef4acfcc4310577f3ae0741821 Mon Sep 17 00:00:00 2001 From: Roman Tereshonkov Date: Tue, 23 Nov 2010 14:17:17 +0200 Subject: mtd: fix master device identification for mtd repartition Function mtd_has_master renamed as mtd_is_partition to follow the function logic. The patch fixes the problem of checking the right mtd device for partition creation. To delete partition checking is not needed here so as it is done in mtd_del_partition. By master we consider the mtd device which does not belong to any partition. Signed-off-by: Roman Tereshonkov Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdchar.c | 8 ++++---- drivers/mtd/mtdpart.c | 10 +++++----- include/linux/mtd/partitions.h | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index cad8fcc7b239..16de17b5b829 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -522,10 +522,6 @@ static int mtd_blkpg_ioctl(struct mtd_info *mtd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* Only master mtd device must be used to control partitions */ - if (!mtd_is_master(mtd)) - return -EINVAL; - if (copy_from_user(&a, arg, sizeof(struct blkpg_ioctl_arg))) return -EFAULT; @@ -535,6 +531,10 @@ static int mtd_blkpg_ioctl(struct mtd_info *mtd, switch (a.op) { case BLKPG_ADD_PARTITION: + /* Only master mtd device must be used to add partitions */ + if (mtd_is_partition(mtd)) + return -EINVAL; + return mtd_add_partition(mtd, p.devname, p.start, p.length); case BLKPG_DEL_PARTITION: diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 79e3689f1e16..1047ff0a4f55 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -720,19 +720,19 @@ int parse_mtd_partitions(struct mtd_info *master, const char **types, } EXPORT_SYMBOL_GPL(parse_mtd_partitions); -int mtd_is_master(struct mtd_info *mtd) +int mtd_is_partition(struct mtd_info *mtd) { struct mtd_part *part; - int nopart = 0; + int ispart = 0; mutex_lock(&mtd_partitions_mutex); list_for_each_entry(part, &mtd_partitions, list) if (&part->mtd == mtd) { - nopart = 1; + ispart = 1; break; } mutex_unlock(&mtd_partitions_mutex); - return nopart; + return ispart; } -EXPORT_SYMBOL_GPL(mtd_is_master); +EXPORT_SYMBOL_GPL(mtd_is_partition); diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index 2b54316591d2..4a0a8ba90a72 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -89,7 +89,7 @@ static inline int mtd_has_cmdlinepart(void) { return 1; } static inline int mtd_has_cmdlinepart(void) { return 0; } #endif -int mtd_is_master(struct mtd_info *mtd); +int mtd_is_partition(struct mtd_info *mtd); int mtd_add_partition(struct mtd_info *master, char *name, long long offset, long long length); int mtd_del_partition(struct mtd_info *master, int partno); -- cgit v1.2.3-71-gd317 From 593cd8711221c9661dbf9beb2fb42fecca03e693 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 29 Nov 2010 13:52:19 +0100 Subject: mtd: FSMC NAND use the PrimeCell identifier macros The FSMC actually has a standard ARM PrimeCell ID register, and the "revision" part of that register contains the thing that the code is looking at. 
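(Aside, a sketch of what that means in practice; base and size are stand-ins for the driver's remapped register window and its resource length. The four byte-wide peripheral-ID registers sit in the last 0x20 bytes of the window, 0xfe0 onwards in the fsmc_regs layout, and fold into one 32-bit value that the generic AMBA_*_BITS macros from <linux/amba/bus.h> can decode.)

	u32 pid = 0;
	int i;

	for (i = 0; i < 4; i++)
		pid |= (readl(base + size - 0x20 + 4 * i) & 255) << (i * 8);

	/* AMBA_REV_BITS(pid) now yields the revision that the old
	 * get_fsmc_version() helper extracted by hand from peripid2. */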
Reuse the infrastructure from the AMBA bus abstraction and rid local defines. Signed-off-by: Linus Walleij Signed-off-by: David Woodhouse --- drivers/mtd/nand/fsmc_nand.c | 23 ++++++++++++++++++++--- include/linux/mtd/fsmc.h | 19 ------------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c index 5af4b3c122f5..205b10b9f9b9 100644 --- a/drivers/mtd/nand/fsmc_nand.c +++ b/drivers/mtd/nand/fsmc_nand.c @@ -31,6 +31,7 @@ #include #include #include +#include #include static struct nand_ecclayout fsmc_ecc1_layout = { @@ -184,8 +185,9 @@ const char *part_probes[] = { "cmdlinepart", NULL }; #endif /** - * struct fsmc_nand_data - atructure for FSMC NAND device state + * struct fsmc_nand_data - structure for FSMC NAND device state * + * @pid: Part ID on the AMBA PrimeCell format * @mtd: MTD info for a NAND flash. * @nand: Chip related info for a NAND flash. * @partitions: Partition info for a NAND Flash. @@ -201,6 +203,7 @@ const char *part_probes[] = { "cmdlinepart", NULL }; * @regs_va: FSMC regs base address. */ struct fsmc_nand_data { + u32 pid; struct mtd_info mtd; struct nand_chip nand; struct mtd_partition *partitions; @@ -541,6 +544,8 @@ static int __init fsmc_nand_probe(struct platform_device *pdev) struct fsmc_regs *regs; struct resource *res; int ret = 0; + u32 pid; + int i; if (!pdata) { dev_err(&pdev->dev, "platform data is NULL\n"); @@ -630,6 +635,18 @@ static int __init fsmc_nand_probe(struct platform_device *pdev) if (ret) goto err_probe1; + /* + * This device ID is actually a common AMBA ID as used on the + * AMBA PrimeCell bus. However it is not a PrimeCell. + */ + for (pid = 0, i = 0; i < 4; i++) + pid |= (readl(host->regs_va + resource_size(res) - 0x20 + 4 * i) & 255) << (i * 8); + host->pid = pid; + dev_info(&pdev->dev, "FSMC device partno %03x, manufacturer %02x, " + "revision %02x, config %02x\n", + AMBA_PART_BITS(pid), AMBA_MANF_BITS(pid), + AMBA_REV_BITS(pid), AMBA_CONFIG_BITS(pid)); + host->bank = pdata->bank; host->select_chip = pdata->select_bank; regs = host->regs_va; @@ -657,7 +674,7 @@ static int __init fsmc_nand_probe(struct platform_device *pdev) fsmc_nand_setup(regs, host->bank, nand->options & NAND_BUSWIDTH_16); - if (get_fsmc_version(host->regs_va) == FSMC_VER8) { + if (AMBA_REV_BITS(host->pid) >= 8) { nand->ecc.read_page = fsmc_read_page_hwecc; nand->ecc.calculate = fsmc_read_hwecc_ecc4; nand->ecc.correct = fsmc_correct_data; @@ -677,7 +694,7 @@ static int __init fsmc_nand_probe(struct platform_device *pdev) goto err_probe; } - if (get_fsmc_version(host->regs_va) == FSMC_VER8) { + if (AMBA_REV_BITS(host->pid) >= 8) { if (host->mtd.writesize == 512) { nand->ecc.layout = &fsmc_ecc4_sp_layout; host->ecc_place = &fsmc_ecc4_sp_place; diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h index e210b87e6cd6..96e8e67a053e 100644 --- a/include/linux/mtd/fsmc.h +++ b/include/linux/mtd/fsmc.h @@ -115,25 +115,6 @@ struct fsmc_regs { #define FSMC_THOLD_4 (4 << 16) #define FSMC_THIZ_1 (1 << 24) -/* peripid2 register definitions */ -#define FSMC_REVISION_MSK (0xf) -#define FSMC_REVISION_SHFT (0x4) - -#define FSMC_VER1 1 -#define FSMC_VER2 2 -#define FSMC_VER3 3 -#define FSMC_VER4 4 -#define FSMC_VER5 5 -#define FSMC_VER6 6 -#define FSMC_VER7 7 -#define FSMC_VER8 8 - -static inline uint32_t get_fsmc_version(struct fsmc_regs *regs) -{ - return (readl(®s->peripid2) >> FSMC_REVISION_SHFT) & - FSMC_REVISION_MSK; -} - /* * There are 13 bytes of ecc for every 512 
byte block in FSMC version 8 * and it has to be read consecutively and immediately after the 512 -- cgit v1.2.3-71-gd317 From b5602e86432aaf0cc90dd207bf74e3a2bfb5078b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 29 Nov 2010 13:52:27 +0100 Subject: mtd: FSMC NAND fix obvious speling errors Fix spelling in the interface file. Signed-off-by: Linus Walleij Signed-off-by: David Woodhouse --- include/linux/mtd/fsmc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h index 96e8e67a053e..6987995ad3cf 100644 --- a/include/linux/mtd/fsmc.h +++ b/include/linux/mtd/fsmc.h @@ -28,7 +28,7 @@ /* * The placement of the Command Latch Enable (CLE) and - * Address Latch Enable (ALE) is twised around in the + * Address Latch Enable (ALE) is twisted around in the * SPEAR310 implementation. */ #if defined(CONFIG_MACH_SPEAR310) @@ -63,7 +63,7 @@ struct fsmc_nor_bank_regs { /* ctrl_tim register definitions */ -struct fsms_nand_bank_regs { +struct fsmc_nand_bank_regs { uint32_t pc; uint32_t sts; uint32_t comm; @@ -79,7 +79,7 @@ struct fsms_nand_bank_regs { struct fsmc_regs { struct fsmc_nor_bank_regs nor_bank_regs[FSMC_MAX_NOR_BANKS]; uint8_t reserved_1[0x40 - 0x20]; - struct fsms_nand_bank_regs bank_regs[FSMC_MAX_NAND_BANKS]; + struct fsmc_nand_bank_regs bank_regs[FSMC_MAX_NAND_BANKS]; uint8_t reserved_2[0xfe0 - 0xc0]; uint32_t peripid0; /* 0xfe0 */ uint32_t peripid1; /* 0xfe4 */ -- cgit v1.2.3-71-gd317 From 02c048b919455aaa38628563cdcc2e691c8a9f53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 7 Dec 2010 20:16:56 +0100 Subject: fuse: allow batching of FORGET requests Terje Malmedal reports that a fuse filesystem with 32 million inodes on a machine with lots of memory can take up to 30 minutes to process FORGET requests when all those inodes are evicted from the icache. To solve this, create a BATCH_FORGET request that allows up to about 8000 FORGET requests to be sent in a single message. This request is only sent if userspace supports interface version 7.16 or later, otherwise fall back to sending individual FORGET messages. Reported-by: Terje Malmedal Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 92 +++++++++++++++++++++++++++++++++++++++++++++------- fs/fuse/fuse_i.h | 3 +- include/linux/fuse.h | 16 ++++++++- 3 files changed, 97 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fed65303eeeb..cf8d28d1fbad 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -254,8 +254,8 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, u64 nodeid, u64 nlookup) { - forget->nodeid = nodeid; - forget->nlookup = nlookup; + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup = nlookup; spin_lock(&fc->lock); fc->forget_list_tail->next = forget; @@ -974,15 +974,26 @@ __releases(fc->lock) return err ? 
err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc) +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, + unsigned max, + unsigned *countp) { - struct fuse_forget_link *forget = fc->forget_list_head.next; + struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; - fc->forget_list_head.next = forget->next; + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fc->forget_list_head.next = *newhead; + *newhead = NULL; if (fc->forget_list_head.next == NULL) fc->forget_list_tail = &fc->forget_list_head; - return forget; + if (countp != NULL) + *countp = count; + + return head; } static int fuse_read_single_forget(struct fuse_conn *fc, @@ -991,13 +1002,13 @@ static int fuse_read_single_forget(struct fuse_conn *fc, __releases(fc->lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fc); + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); struct fuse_forget_in arg = { - .nlookup = forget->nlookup, + .nlookup = forget->forget_one.nlookup, }; struct fuse_in_header ih = { .opcode = FUSE_FORGET, - .nodeid = forget->nodeid, + .nodeid = forget->forget_one.nodeid, .unique = fuse_get_unique(fc), .len = sizeof(ih) + sizeof(arg), }; @@ -1018,6 +1029,65 @@ __releases(fc->lock) return ih.len; } +static int fuse_read_batch_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fc->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = dequeue_forget(fc, max_forgets, &count); + spin_unlock(&fc->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fc, cs, nbytes); + else + return fuse_read_batch_forget(fc, cs, nbytes); +} + /* * Read a single request into the userspace filesystem's buffer. 
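(Daemon-side sketch; buf is the raw buffer returned by read(2) on /dev/fuse and drop_nlookup() is an invented helper. It shows the layout the code above emits: one fuse_in_header, one fuse_batch_forget_in, then count packed fuse_forget_one entries.)

	struct fuse_in_header *ih = (struct fuse_in_header *)buf;

	if (ih->opcode == FUSE_BATCH_FORGET) {
		struct fuse_batch_forget_in *arg = (void *)(ih + 1);
		struct fuse_forget_one *one = (void *)(arg + 1);
		uint32_t i;

		for (i = 0; i < arg->count; i++)
			drop_nlookup(one[i].nodeid, one[i].nlookup);
		/* like FUSE_FORGET, no reply is written back */
	}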
This * function waits until a request is available, then removes it from @@ -1058,7 +1128,7 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, if (forget_pending(fc)) { if (list_empty(&fc->pending) || fc->forget_batch-- > 0) - return fuse_read_single_forget(fc, cs, nbytes); + return fuse_read_forget(fc, cs, nbytes); if (fc->forget_batch <= -8) fc->forget_batch = 16; @@ -1837,7 +1907,7 @@ __acquires(fc->lock) end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); while (forget_pending(fc)) - kfree(dequeue_forget(fc)); + kfree(dequeue_forget(fc, 1, NULL)); } /* diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 33369c63a522..ae5744a2f9e9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -55,8 +55,7 @@ extern unsigned max_user_congthresh; /* One forget request */ struct fuse_forget_link { - u64 nodeid; - u64 nlookup; + struct fuse_forget_one forget_one; struct fuse_forget_link *next; }; diff --git a/include/linux/fuse.h b/include/linux/fuse.h index c3c578e09833..cf11881f4938 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -41,6 +41,9 @@ * 7.15 * - add store notify * - add retrieve notify + * + * 7.16 + * - add BATCH_FORGET request */ #ifndef _LINUX_FUSE_H @@ -72,7 +75,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 15 +#define FUSE_KERNEL_MINOR_VERSION 16 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -256,6 +259,7 @@ enum fuse_opcode { FUSE_IOCTL = 39, FUSE_POLL = 40, FUSE_NOTIFY_REPLY = 41, + FUSE_BATCH_FORGET = 42, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -290,6 +294,16 @@ struct fuse_forget_in { __u64 nlookup; }; +struct fuse_forget_one { + __u64 nodeid; + __u64 nlookup; +}; + +struct fuse_batch_forget_in { + __u32 count; + __u32 dummy; +}; + struct fuse_getattr_in { __u32 getattr_flags; __u32 dummy; -- cgit v1.2.3-71-gd317 From 1baa26b2be92fe9917e2f7ef46d423b5dfa4da71 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 7 Dec 2010 20:16:56 +0100 Subject: fuse: fix ioctl ABI In kernel ABI version 7.16 and later FUSE_IOCTL_RETRY reply from a unrestricted IOCTL request shall return with an array of 'struct fuse_ioctl_iovec' instead of 'struct iovec'. This fixes the ABI ambiguity of 32bit vs. 64bit. Reported-by: "ccmail111" Signed-off-by: Miklos Szeredi CC: Tejun Heo --- fs/fuse/file.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++----- include/linux/fuse.h | 10 ++++++++++ 2 files changed, 58 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ca3b6bbb3790..95da1bc1c826 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1634,9 +1634,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, * and 64bit. Fortunately we can determine which structure the server * used from the size of the reply. 
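(Server-side sketch; user_ptr and struct example_arg are invented. With 7.16 a retry reply carries fixed-width fuse_ioctl_iovec entries, so 32-bit and 64-bit sides agree on the layout instead of guessing it from the reply size as the old helper below had to.)

	struct fuse_ioctl_iovec out_iov[1];

	out_iov[0].base = (uintptr_t)user_ptr;		/* address in the caller */
	out_iov[0].len  = sizeof(struct example_arg);	/* bytes to copy out */
	/* returned together with fuse_ioctl_out.flags |= FUSE_IOCTL_RETRY */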
*/ -static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, - size_t transferred, unsigned count, - bool is_compat) +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) { #ifdef CONFIG_COMPAT if (count * sizeof(struct compat_iovec) == transferred) { @@ -1680,6 +1680,42 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) return 0; } +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + /* * For ioctls, there is no generic way to determine how much memory * needs to be read and/or written. Furthermore, ioctls are allowed @@ -1746,8 +1782,15 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, size_t in_size, out_size, transferred; int err; +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + /* assume all the iovs returned by client always fits in a page */ - BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); @@ -1862,7 +1905,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; vaddr = kmap_atomic(pages[0], KM_USER0); - err = fuse_copy_ioctl_iovec(iov_page, vaddr, + err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr, KM_USER0); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index cf11881f4938..d464de53db43 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -44,6 +44,9 @@ * * 7.16 * - add BATCH_FORGET request + * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct + * fuse_ioctl_iovec' instead of ambiguous 'struct iovec' + * - add FUSE_IOCTL_32BIT flag */ #ifndef _LINUX_FUSE_H @@ -203,12 +206,14 @@ struct fuse_file_lock { * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed * FUSE_IOCTL_RETRY: retry with new iovecs + * FUSE_IOCTL_32BIT: 32bit ioctl * * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs */ #define FUSE_IOCTL_COMPAT (1 << 0) #define FUSE_IOCTL_UNRESTRICTED (1 << 1) #define FUSE_IOCTL_RETRY (1 << 2) +#define FUSE_IOCTL_32BIT (1 << 3) #define FUSE_IOCTL_MAX_IOV 256 @@ -524,6 +529,11 @@ struct fuse_ioctl_in { __u32 out_size; }; +struct fuse_ioctl_iovec { + __u64 base; + __u64 len; +}; + struct fuse_ioctl_out { __s32 result; __u32 flags; -- cgit v1.2.3-71-gd317 From 5c6d1125f8dbd1bfef39e38fbc2837003be78a59 Mon Sep 17 00:00:00 
2001 From: Jarkko Sakkinen Date: Tue, 7 Dec 2010 13:34:01 +0200 Subject: Smack: Transmute labels on specified directories In a situation where Smack access rules allow processes with multiple labels to write to a directory it is easy to get into a situation where the directory gets cluttered with files that the owner can't deal with because while they could be written to the directory a process at the label of the directory can't write them. This is generally the desired behavior, but when it isn't it is a real issue. This patch introduces a new attribute SMACK64TRANSMUTE that instructs Smack to create the file with the label of the directory under certain circumstances. A new access mode, "t" for transmute, is made available to Smack access rules, which are expanded from "rwxa" to "rwxat". If a file is created in a directory marked as transmutable and if access was granted to perform the operation by a rule that included the transmute mode, then the file gets the Smack label of the directory instead of the Smack label of the creating process. Note that this is equivalent to creating an empty file at the label of the directory and then having the other process write to it. The transmute scheme requires that both the access rule allows transmutation and that the directory be explicitly marked. Signed-off-by: Jarkko Sakkinen Signed-off-by: Casey Schaufler --- include/linux/xattr.h | 2 ++ security/smack/smack.h | 17 +++++++++- security/smack/smack_access.c | 54 +++++++++++++++++++++++-------- security/smack/smack_lsm.c | 74 +++++++++++++++++++++++++++++-------------- security/smack/smackfs.c | 37 +++++++++++++++++++--- 5 files changed, 141 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 351c7901d74a..e6131ef98d8f 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -41,10 +41,12 @@ #define XATTR_SMACK_IPIN "SMACK64IPIN" #define XATTR_SMACK_IPOUT "SMACK64IPOUT" #define XATTR_SMACK_EXEC "SMACK64EXEC" +#define XATTR_SMACK_TRANSMUTE "SMACK64TRANSMUTE" #define XATTR_NAME_SMACK XATTR_SECURITY_PREFIX XATTR_SMACK_SUFFIX #define XATTR_NAME_SMACKIPIN XATTR_SECURITY_PREFIX XATTR_SMACK_IPIN #define XATTR_NAME_SMACKIPOUT XATTR_SECURITY_PREFIX XATTR_SMACK_IPOUT #define XATTR_NAME_SMACKEXEC XATTR_SECURITY_PREFIX XATTR_SMACK_EXEC +#define XATTR_NAME_SMACKTRANSMUTE XATTR_SECURITY_PREFIX XATTR_SMACK_TRANSMUTE #define XATTR_CAPS_SUFFIX "capability" #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX diff --git a/security/smack/smack.h b/security/smack/smack.h index a2e2cdfab4ef..129c4eb8ffb1 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -62,6 +62,7 @@ struct task_smack { }; #define SMK_INODE_INSTANT 0x01 /* inode is instantiated */ +#define SMK_INODE_TRANSMUTE 0x02 /* directory is transmuting */ /* * A label access rule. 
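(Usage sketch; the directory path and labels are invented. Transmuting needs both pieces: an access rule whose mode string includes the new "t", e.g. subject "WebApp", object "Shared", access "rwxat", and the directory itself marked via the new attribute, whose value must be exactly the 4-byte string "TRUE".)

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		/* requires CAP_MAC_ADMIN, mirroring the setxattr hook below */
		if (setxattr("/srv/shared", "security.SMACK64TRANSMUTE",
			     "TRUE", 4, 0) != 0)
			perror("setxattr");
		return 0;
	}

Files created in /srv/shared by a task whose rule to the directory's label grants "t" then receive the directory's label instead of the creator's.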
@@ -166,6 +167,10 @@ struct smack_known { #define SMACK_CIPSO_MAXLEVEL 255 /* CIPSO 2.2 standard */ #define SMACK_CIPSO_MAXCATNUM 239 /* CIPSO 2.2 standard */ +/* + * Flag for transmute access + */ +#define MAY_TRANSMUTE 64 /* * Just to make the common cases easier to deal with */ @@ -197,6 +202,7 @@ struct inode_smack *new_inode_smack(char *); /* * These functions are in smack_access.c */ +int smk_access_entry(char *, char *); int smk_access(char *, char *, int, struct smk_audit_info *); int smk_curacc(char *, u32, struct smk_audit_info *); int smack_to_cipso(const char *, struct smack_cipso *); @@ -239,6 +245,15 @@ static inline void smack_catset_bit(int cat, char *catsetp) catsetp[(cat - 1) / 8] |= 0x80 >> ((cat - 1) % 8); } +/* + * Is the directory transmuting? + */ +static inline int smk_inode_transmutable(const struct inode *isp) +{ + struct inode_smack *sip = isp->i_security; + return (sip->smk_flags & SMK_INODE_TRANSMUTE) != 0; +} + /* * Present a pointer to the smack label in an inode blob. */ @@ -265,7 +280,7 @@ static inline char *smk_of_forked(const struct task_smack *tsp) } /* - * Present a pointer to the smack label in the curren task blob. + * Present a pointer to the smack label in the current task blob. */ static inline char *smk_of_current(void) { diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 42becbc1ce33..7ba8478f599e 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -66,6 +66,46 @@ static u32 smack_next_secid = 10; */ int log_policy = SMACK_AUDIT_DENIED; +/** + * smk_access_entry - look up matching access rule + * @subject_label: a pointer to the subject's Smack label + * @object_label: a pointer to the object's Smack label + * + * This function looks up the subject/object pair in the + * access rule list and returns pointer to the matching rule if found, + * NULL otherwise. + * + * NOTE: + * Even though Smack labels are usually shared on smack_list + * labels that come in off the network can't be imported + * and added to the list for locking reasons. + * + * Therefore, it is necessary to check the contents of the labels, + * not just the pointer values. Of course, in most cases the labels + * will be on the list, so checking the pointers may be a worthwhile + * optimization. + */ +int smk_access_entry(char *subject_label, char *object_label) +{ + u32 may = MAY_NOT; + struct smack_rule *srp; + + rcu_read_lock(); + list_for_each_entry_rcu(srp, &smack_rule_list, list) { + if (srp->smk_subject == subject_label || + strcmp(srp->smk_subject, subject_label) == 0) { + if (srp->smk_object == object_label || + strcmp(srp->smk_object, object_label) == 0) { + may = srp->smk_access; + break; + } + } + } + rcu_read_unlock(); + + return may; +} + /** * smk_access - determine if a subject has a specific access to an object * @subject_label: a pointer to the subject's Smack label @@ -90,7 +130,6 @@ int smk_access(char *subject_label, char *object_label, int request, struct smk_audit_info *a) { u32 may = MAY_NOT; - struct smack_rule *srp; int rc = 0; /* @@ -144,18 +183,7 @@ int smk_access(char *subject_label, char *object_label, int request, * access (e.g. read is included in readwrite) it's * good. 
*/ - rcu_read_lock(); - list_for_each_entry_rcu(srp, &smack_rule_list, list) { - if (srp->smk_subject == subject_label || - strcmp(srp->smk_subject, subject_label) == 0) { - if (srp->smk_object == object_label || - strcmp(srp->smk_object, object_label) == 0) { - may = srp->smk_access; - break; - } - } - } - rcu_read_unlock(); + may = smk_access_entry(subject_label, object_label); /* * This is a bit map operation. */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 7e19afe0e738..05dc4da2e25f 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3,12 +3,14 @@ * * This file contains the smack hook function implementations. * - * Author: + * Authors: * Casey Schaufler + * Jarkko Sakkinen * * Copyright (C) 2007 Casey Schaufler * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. * Paul Moore + * Copyright (C) 2010 Nokia Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -35,6 +37,9 @@ #define task_security(task) (task_cred_xxx((task), security)) +#define TRANS_TRUE "TRUE" +#define TRANS_TRUE_SIZE 4 + /** * smk_fetch - Fetch the smack label from a file. * @ip: a pointer to the inode @@ -468,6 +473,8 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir, char **name, void **value, size_t *len) { char *isp = smk_of_inode(inode); + char *dsp = smk_of_inode(dir); + u32 may; if (name) { *name = kstrdup(XATTR_SMACK_SUFFIX, GFP_KERNEL); @@ -476,6 +483,16 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir, } if (value) { + may = smk_access_entry(smk_of_current(), dsp); + + /* + * If the access rule allows transmutation and + * the directory requests transmutation then + * by all means transmute. + */ + if (((may & MAY_TRANSMUTE) != 0) && smk_inode_transmutable(dir)) + isp = dsp; + *value = kstrdup(isp, GFP_KERNEL); if (*value == NULL) return -ENOMEM; @@ -709,6 +726,12 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name, if (size == 0 || size >= SMK_LABELLEN || smk_import(value, size) == NULL) rc = -EINVAL; + } else if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) { + if (!capable(CAP_MAC_ADMIN)) + rc = -EPERM; + if (size != TRANS_TRUE_SIZE || + strncmp(value, TRANS_TRUE, TRANS_TRUE_SIZE) != 0) + rc = -EINVAL; } else rc = cap_inode_setxattr(dentry, name, value, size, flags); @@ -735,35 +758,23 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name, static void smack_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode_smack *isp; char *nsp; - - /* - * Not SMACK or SMACKEXEC - */ - if (strcmp(name, XATTR_NAME_SMACK) && - strcmp(name, XATTR_NAME_SMACKEXEC)) - return; - - isp = dentry->d_inode->i_security; - - /* - * No locking is done here. This is a pointer - * assignment. 
- */ - nsp = smk_import(value, size); + struct inode_smack *isp = dentry->d_inode->i_security; if (strcmp(name, XATTR_NAME_SMACK) == 0) { + nsp = smk_import(value, size); if (nsp != NULL) isp->smk_inode = nsp; else isp->smk_inode = smack_known_invalid.smk_known; - } else { + } else if (strcmp(name, XATTR_NAME_SMACKEXEC) == 0) { + nsp = smk_import(value, size); if (nsp != NULL) isp->smk_task = nsp; else isp->smk_task = smack_known_invalid.smk_known; - } + } else if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) + isp->smk_flags |= SMK_INODE_TRANSMUTE; return; } @@ -803,7 +814,8 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name) if (strcmp(name, XATTR_NAME_SMACK) == 0 || strcmp(name, XATTR_NAME_SMACKIPIN) == 0 || strcmp(name, XATTR_NAME_SMACKIPOUT) == 0 || - strcmp(name, XATTR_NAME_SMACKEXEC) == 0) { + strcmp(name, XATTR_NAME_SMACKEXEC) == 0 || + strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) { if (!capable(CAP_MAC_ADMIN)) rc = -EPERM; } else @@ -2274,6 +2286,8 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) char *csp = smk_of_current(); char *fetched; char *final; + char trattr[TRANS_TRUE_SIZE]; + int transflag = 0; struct dentry *dp; if (inode == NULL) @@ -2392,10 +2406,19 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) */ dp = dget(opt_dentry); fetched = smk_fetch(XATTR_NAME_SMACK, inode, dp); - if (fetched != NULL) + if (fetched != NULL) { final = fetched; - isp->smk_task = smk_fetch(XATTR_NAME_SMACKEXEC, inode, - dp); + if (S_ISDIR(inode->i_mode)) { + trattr[0] = '\0'; + inode->i_op->getxattr(dp, + XATTR_NAME_SMACKTRANSMUTE, + trattr, TRANS_TRUE_SIZE); + if (strncmp(trattr, TRANS_TRUE, + TRANS_TRUE_SIZE) == 0) + transflag = SMK_INODE_TRANSMUTE; + } + } + isp->smk_task = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp); dput(dp); break; @@ -2406,7 +2429,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) else isp->smk_inode = final; - isp->smk_flags |= SMK_INODE_INSTANT; + isp->smk_flags |= (SMK_INODE_INSTANT | transflag); unlockandout: mutex_unlock(&isp->smk_lock); @@ -2456,6 +2479,7 @@ static int smack_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { struct task_smack *tsp; + struct task_smack *oldtsp; struct cred *new; char *newsmack; @@ -2485,6 +2509,7 @@ static int smack_setprocattr(struct task_struct *p, char *name, if (newsmack == smack_known_web.smk_known) return -EPERM; + oldtsp = p->cred->security; new = prepare_creds(); if (new == NULL) return -ENOMEM; @@ -2494,6 +2519,7 @@ static int smack_setprocattr(struct task_struct *p, char *name, return -ENOMEM; } tsp->smk_task = newsmack; + tsp->smk_forked = oldtsp->smk_forked; new->security = tsp; commit_creds(new); return size; diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 01a0be93d8d0..362d5eda948b 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -109,9 +109,12 @@ const char *smack_cipso_option = SMACK_CIPSO_OPTION; * SMK_ACCESSLEN: Maximum length for a rule access field * SMK_LOADLEN: Smack rule length */ -#define SMK_ACCESS "rwxa" -#define SMK_ACCESSLEN (sizeof(SMK_ACCESS) - 1) -#define SMK_LOADLEN (SMK_LABELLEN + SMK_LABELLEN + SMK_ACCESSLEN) +#define SMK_OACCESS "rwxa" +#define SMK_ACCESS "rwxat" +#define SMK_OACCESSLEN (sizeof(SMK_OACCESS) - 1) +#define SMK_ACCESSLEN (sizeof(SMK_ACCESS) - 1) +#define SMK_OLOADLEN (SMK_LABELLEN + SMK_LABELLEN + SMK_OACCESSLEN) +#define SMK_LOADLEN (SMK_LABELLEN + SMK_LABELLEN + SMK_ACCESSLEN) /** * 
smk_netlabel_audit_set - fill a netlbl_audit struct @@ -175,6 +178,8 @@ static int load_seq_show(struct seq_file *s, void *v) seq_putc(s, 'x'); if (srp->smk_access & MAY_APPEND) seq_putc(s, 'a'); + if (srp->smk_access & MAY_TRANSMUTE) + seq_putc(s, 't'); if (srp->smk_access == 0) seq_putc(s, '-'); @@ -273,10 +278,15 @@ static ssize_t smk_write_load(struct file *file, const char __user *buf, if (!capable(CAP_MAC_ADMIN)) return -EPERM; - if (*ppos != 0 || count != SMK_LOADLEN) + if (*ppos != 0) + return -EINVAL; + /* + * Minor hack for backward compatability + */ + if (count < (SMK_OLOADLEN) || count > SMK_LOADLEN) return -EINVAL; - data = kzalloc(count, GFP_KERNEL); + data = kzalloc(SMK_LOADLEN, GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -285,6 +295,12 @@ static ssize_t smk_write_load(struct file *file, const char __user *buf, goto out; } + /* + * More on the minor hack for backward compatability + */ + if (count == (SMK_OLOADLEN)) + data[SMK_OLOADLEN] = '-'; + rule = kzalloc(sizeof(*rule), GFP_KERNEL); if (rule == NULL) { rc = -ENOMEM; @@ -345,6 +361,17 @@ static ssize_t smk_write_load(struct file *file, const char __user *buf, goto out_free_rule; } + switch (data[SMK_LABELLEN + SMK_LABELLEN + 4]) { + case '-': + break; + case 't': + case 'T': + rule->smk_access |= MAY_TRANSMUTE; + break; + default: + goto out_free_rule; + } + rc = smk_set_access(rule); if (!rc) -- cgit v1.2.3-71-gd317 From 9f843706bb87837b823228467f4f83973fd110e9 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 6 Dec 2010 00:12:44 +0000 Subject: mmc, sh: Move MMCIF_PROGRESS_* into sh_mmcif.h Allow MMCIF_PROGRESS_* to be shared. Cc: Magnus Damm Signed-off-by: Simon Horman Signed-off-by: Paul Mundt --- arch/sh/boot/romimage/mmcif-sh7724.c | 3 --- include/linux/mmc/sh_mmcif.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/boot/romimage/mmcif-sh7724.c b/arch/sh/boot/romimage/mmcif-sh7724.c index 14863d7292cb..16b9c6fa7ded 100644 --- a/arch/sh/boot/romimage/mmcif-sh7724.c +++ b/arch/sh/boot/romimage/mmcif-sh7724.c @@ -21,9 +21,6 @@ #define HIZCRC 0xa405015c #define DRVCRA 0xa405018a -enum { MMCIF_PROGRESS_ENTER, MMCIF_PROGRESS_INIT, - MMCIF_PROGRESS_LOAD, MMCIF_PROGRESS_DONE }; - /* SH7724 specific MMCIF loader * * loads the romImage from an MMC card starting from block 512 diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h index ffabf8c0a531..6a98e97c49f8 100644 --- a/include/linux/mmc/sh_mmcif.h +++ b/include/linux/mmc/sh_mmcif.h @@ -97,6 +97,9 @@ static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) #define SH_MMCIF_BBS 512 /* boot block size */ +enum { MMCIF_PROGRESS_ENTER, MMCIF_PROGRESS_INIT, + MMCIF_PROGRESS_LOAD, MMCIF_PROGRESS_DONE }; + static inline void sh_mmcif_boot_cmd_send(void __iomem *base, unsigned long cmd, unsigned long arg) { -- cgit v1.2.3-71-gd317 From 54b384634f7083bcacf9a9ed2e6f4c3d0a246e49 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 6 Dec 2010 00:12:45 +0000 Subject: mmc, sh: Remove sh_mmcif_boot_slurp() As the only caller of sh_mmcif_boot_do_read() is sh_mmcif_boot_slurp() the configuration portion of sh_mmcif_boot_slurp() can be merged into sh_mmcif_boot_do_read(). Once this is done sh_mmcif_boot_slurp() is only a call to sh_mmcif_boot_do_read() with platform specific information - the offset that images are stored on MMC. So make the sh_mmcif_boot_do_read() call directly from platform code and remove sh_mmcif_boot_slurp() altogether. 
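For illustration, the resulting call pattern from platform boot code looks roughly like the sketch below. It mirrors the mmcif-sh7724.c hunk in the diff that follows; the wrapper function name is hypothetical, and the starting block of 512 is the SH7724 loader's own convention, not something imposed by the helper.

#include <linux/mmc/sh_mmcif.h>

/*
 * Read an image of no_bytes bytes from the card: round the byte count
 * up to whole SH_MMCIF_BBS-sized boot blocks and call
 * sh_mmcif_boot_do_read() directly.  The clock setup and card-select
 * commands that sh_mmcif_boot_slurp() used to issue are now performed
 * inside sh_mmcif_boot_do_read() itself.
 */
static int mmcif_load_image(void __iomem *base, unsigned char *buf,
			    unsigned long no_bytes)
{
	unsigned long nr_blocks = (no_bytes + SH_MMCIF_BBS - 1) / SH_MMCIF_BBS;

	return sh_mmcif_boot_do_read(base, 512, nr_blocks, buf);
}
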
Cc: Magnus Damm Signed-off-by: Simon Horman Signed-off-by: Paul Mundt --- arch/sh/boot/romimage/mmcif-sh7724.c | 4 +++- include/linux/mmc/sh_mmcif.h | 34 +++++++++++----------------------- 2 files changed, 14 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/boot/romimage/mmcif-sh7724.c b/arch/sh/boot/romimage/mmcif-sh7724.c index 16b9c6fa7ded..c84e7831018d 100644 --- a/arch/sh/boot/romimage/mmcif-sh7724.c +++ b/arch/sh/boot/romimage/mmcif-sh7724.c @@ -60,7 +60,9 @@ asmlinkage void mmcif_loader(unsigned char *buf, unsigned long no_bytes) mmcif_update_progress(MMCIF_PROGRESS_LOAD); /* load kernel via MMCIF interface */ - sh_mmcif_boot_slurp(MMCIF_BASE, buf, no_bytes); + sh_mmcif_boot_do_read(MMCIF_BASE, 512, + (no_bytes + SH_MMCIF_BBS - 1) / SH_MMCIF_BBS, + buf); /* disable clock to the MMCIF hardware block */ __raw_writel(__raw_readl(MSTPCR2) | 0x20000000, MSTPCR2); diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h index 6a98e97c49f8..cfcc6e320e2f 100644 --- a/include/linux/mmc/sh_mmcif.h +++ b/include/linux/mmc/sh_mmcif.h @@ -162,6 +162,17 @@ static inline int sh_mmcif_boot_do_read(void __iomem *base, unsigned long k; int ret = 0; + /* In data transfer mode: Set clock to Bus clock/4 (about 20Mhz) */ + sh_mmcif_writel(base, MMCIF_CE_CLK_CTRL, + CLK_ENABLE | CLKDIV_4 | SRSPTO_256 | + SRBSYTO_29 | SRWDTO_29 | SCCSTO_29); + + /* CMD9 - Get CSD */ + sh_mmcif_boot_cmd(base, 0x09806000, 0x00010000); + + /* CMD7 - Select the card */ + sh_mmcif_boot_cmd(base, 0x07400000, 0x00010000); + /* CMD16 - Set the block size */ sh_mmcif_boot_cmd(base, 0x10400000, SH_MMCIF_BBS); @@ -205,27 +216,4 @@ static inline void sh_mmcif_boot_init(void __iomem *base) sh_mmcif_boot_cmd(base, 0x03400040, 0x00010000); } -static inline void sh_mmcif_boot_slurp(void __iomem *base, - unsigned char *buf, - unsigned long no_bytes) -{ - unsigned long tmp; - - /* In data transfer mode: Set clock to Bus clock/4 (about 20Mhz) */ - sh_mmcif_writel(base, MMCIF_CE_CLK_CTRL, - CLK_ENABLE | CLKDIV_4 | SRSPTO_256 | - SRBSYTO_29 | SRWDTO_29 | SCCSTO_29); - - /* CMD9 - Get CSD */ - sh_mmcif_boot_cmd(base, 0x09806000, 0x00010000); - - /* CMD7 - Select the card */ - sh_mmcif_boot_cmd(base, 0x07400000, 0x00010000); - - tmp = no_bytes / SH_MMCIF_BBS; - tmp += (no_bytes % SH_MMCIF_BBS) ? 1 : 0; - - sh_mmcif_boot_do_read(base, 512, tmp, buf); -} - #endif /* __SH_MMCIF_H__ */ -- cgit v1.2.3-71-gd317 From 5bea7660bba973dc5e8e9d92b11fb1dd5b524ebf Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 7 Dec 2010 23:02:48 -0800 Subject: HID: add hid_hw_open/close/power() handlers Instead of exposing the guts of hid->ll_driver relationship to HID sub-drivers provide these helpers to encapsulate the details. 
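As a rough sketch of the intended usage (the driver functions here are hypothetical; the pattern mirrors the hidraw and roccat conversions in the diff below), a sub-driver that previously dereferenced hid->ll_driver now does:

#include <linux/hid.h>

/* Bring the device to full power and ask the transport to start
 * delivering events. */
static int example_start_io(struct hid_device *hdev)
{
	int ret;

	ret = hid_hw_power(hdev, PM_HINT_FULLON);
	if (ret < 0)
		return ret;

	ret = hid_hw_open(hdev);
	if (ret < 0)
		hid_hw_power(hdev, PM_HINT_NORMAL);

	return ret;
}

/* Drop back to normal power and stop event delivery. */
static void example_stop_io(struct hid_device *hdev)
{
	hid_hw_power(hdev, PM_HINT_NORMAL);
	hid_hw_close(hdev);
}

Note that hid_hw_power() returns 0 when the underlying transport has no power handler, so callers no longer need to test ll_driver->power themselves.
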
Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 4 ++-- drivers/hid/hid-picolcd.c | 6 +++--- drivers/hid/hid-roccat.c | 26 ++++++++++---------------- drivers/hid/hidraw.c | 21 +++++++++------------ include/linux/hid.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 834ef47b76d6..b718f7144ce1 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -772,14 +772,14 @@ static int hidinput_open(struct input_dev *dev) { struct hid_device *hid = input_get_drvdata(dev); - return hid->ll_driver->open(hid); + return hid_hw_open(hid); } static void hidinput_close(struct input_dev *dev) { struct hid_device *hid = input_get_drvdata(dev); - hid->ll_driver->close(hid); + hid_hw_close(hid); } /* diff --git a/drivers/hid/hid-picolcd.c b/drivers/hid/hid-picolcd.c index bc2e07740628..38565fee7bf1 100644 --- a/drivers/hid/hid-picolcd.c +++ b/drivers/hid/hid-picolcd.c @@ -2635,7 +2635,7 @@ static int picolcd_probe(struct hid_device *hdev, goto err_cleanup_data; } - error = hdev->ll_driver->open(hdev); + error = hid_hw_open(hdev); if (error) { dev_err(&hdev->dev, "failed to open input interrupt pipe for key and IR events\n"); goto err_cleanup_hid_hw; @@ -2668,7 +2668,7 @@ err_cleanup_sysfs2: err_cleanup_sysfs1: device_remove_file(&hdev->dev, &dev_attr_operation_mode_delay); err_cleanup_hid_ll: - hdev->ll_driver->close(hdev); + hid_hw_close(hdev); err_cleanup_hid_hw: hid_hw_stop(hdev); err_cleanup_data: @@ -2699,7 +2699,7 @@ static void picolcd_remove(struct hid_device *hdev) picolcd_exit_devfs(data); device_remove_file(&hdev->dev, &dev_attr_operation_mode); device_remove_file(&hdev->dev, &dev_attr_operation_mode_delay); - hdev->ll_driver->close(hdev); + hid_hw_close(hdev); hid_hw_stop(hdev); hid_set_drvdata(hdev, NULL); diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c index 5a6879e235ac..a9d9b29ff47f 100644 --- a/drivers/hid/hid-roccat.c +++ b/drivers/hid/hid-roccat.c @@ -173,19 +173,15 @@ static int roccat_open(struct inode *inode, struct file *file) if (!device->open++) { /* power on device on adding first reader */ - if (device->hid->ll_driver->power) { - error = device->hid->ll_driver->power(device->hid, - PM_HINT_FULLON); - if (error < 0) { - --device->open; - goto exit_err; - } + error = hid_hw_power(device->hid, PM_HINT_FULLON); + if (error < 0) { + --device->open; + goto exit_err; } - error = device->hid->ll_driver->open(device->hid); + + error = hid_hw_open(device->hid); if (error < 0) { - if (device->hid->ll_driver->power) - device->hid->ll_driver->power(device->hid, - PM_HINT_NORMAL); + hid_hw_power(device->hid, PM_HINT_NORMAL); --device->open; goto exit_err; } @@ -231,10 +227,8 @@ static int roccat_release(struct inode *inode, struct file *file) if (!--device->open) { /* removing last reader */ if (device->exist) { - if (device->hid->ll_driver->power) - device->hid->ll_driver->power(device->hid, - PM_HINT_NORMAL); - device->hid->ll_driver->close(device->hid); + hid_hw_power(device->hid, PM_HINT_NORMAL); + hid_hw_close(device->hid); } else { kfree(device); } @@ -370,7 +364,7 @@ void roccat_disconnect(int minor) device_destroy(roccat_class, MKDEV(roccat_major, minor)); if (device->open) { - device->hid->ll_driver->close(device->hid); + hid_hw_close(device->hid); wake_up_interruptible(&device->wait); } else { kfree(device); diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 
8a4b32dca9f7..5aefe73cf795 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -193,15 +193,13 @@ static int hidraw_open(struct inode *inode, struct file *file) dev = hidraw_table[minor]; if (!dev->open++) { - if (dev->hid->ll_driver->power) { - err = dev->hid->ll_driver->power(dev->hid, PM_HINT_FULLON); - if (err < 0) - goto out_unlock; - } - err = dev->hid->ll_driver->open(dev->hid); + err = hid_hw_power(dev->hid, PM_HINT_FULLON); + if (err < 0) + goto out_unlock; + + err = hid_hw_open(dev->hid); if (err < 0) { - if (dev->hid->ll_driver->power) - dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); + hid_hw_power(dev->hid, PM_HINT_NORMAL); dev->open--; } } @@ -230,9 +228,8 @@ static int hidraw_release(struct inode * inode, struct file * file) dev = hidraw_table[minor]; if (!--dev->open) { if (list->hidraw->exist) { - if (dev->hid->ll_driver->power) - dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); - dev->hid->ll_driver->close(dev->hid); + hid_hw_power(dev->hid, PM_HINT_NORMAL); + hid_hw_close(dev->hid); } else { kfree(list->hidraw); } @@ -434,7 +431,7 @@ void hidraw_disconnect(struct hid_device *hid) device_destroy(hidraw_class, MKDEV(hidraw_major, hidraw->minor)); if (hidraw->open) { - hid->ll_driver->close(hid); + hid_hw_close(hid); wake_up_interruptible(&hidraw->wait); } else { kfree(hidraw); diff --git a/include/linux/hid.h b/include/linux/hid.h index bb0f56f5c01e..e2af195d8b46 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -820,6 +820,49 @@ static inline void hid_hw_stop(struct hid_device *hdev) hdev->ll_driver->stop(hdev); } +/** + * hid_hw_open - signal underlaying HW to start delivering events + * + * @hdev: hid device + * + * Tell underlying HW to start delivering events from the device. + * This function should be called sometime after successful call + * to hid_hiw_start(). + */ +static inline int __must_check hid_hw_open(struct hid_device *hdev) +{ + return hdev->ll_driver->open(hdev); +} + +/** + * hid_hw_close - signal underlaying HW to stop delivering events + * + * @hdev: hid device + * + * This function indicates that we are not interested in the events + * from this device anymore. Delivery of events may or may not stop, + * depending on the number of users still outstanding. + */ +static inline void hid_hw_close(struct hid_device *hdev) +{ + hdev->ll_driver->close(hdev); +} + +/** + * hid_hw_power - requests underlying HW to go into given power mode + * + * @hdev: hid device + * @level: requested power level (one of %PM_HINT_* defines) + * + * This function requests underlying hardware to enter requested power + * mode. + */ + +static inline int hid_hw_power(struct hid_device *hdev, int level) +{ + return hdev->ll_driver->power ? hdev->ll_driver->power(hdev, level) : 0; +} + void hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, int interrupt); -- cgit v1.2.3-71-gd317 From dac36dd87de10d1fd81dc7b7a40256cb2e965abc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 10 Dec 2010 01:57:07 +0900 Subject: poll: fix a typo in comment Convert duplicated sys_poll to select. As Kosaki suggests, sys_poll() and sys_select() are now hidden by SYSCALL_DEFINEx() macros so it would be better to use plain select/poll syscall name. 
Signed-off-by: Namhyung Kim Reviewed-by: KOSAKI Motohiro Signed-off-by: Jiri Kosina --- include/linux/poll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index 56e76af78102..1a2ccd6f3823 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -57,7 +57,7 @@ struct poll_table_entry { }; /* - * Structures and helpers for sys_poll/sys_poll + * Structures and helpers for select/poll syscall */ struct poll_wqueues { poll_table pt; -- cgit v1.2.3-71-gd317 From 4291ee305e9bb0699504a66f0e2b7aefcf0512a5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 9 Dec 2010 19:29:03 -0800 Subject: HID: Add and use hid_: dev_ equivalents Neaten current uses of dev_ by adding and using hid specific hid_ macros. Convert existing uses of dev_ uses to hid_. Convert hid-pidff printk uses to hid_. Remove err_hid and use hid_err instead. Add missing newlines to logging messages where necessary. Coalesce format strings. Add and use pr_fmt(fmt) KBUILD_MODNAME ": " fmt Other miscellaneous changes: Add const struct hid_device * argument to hid-core functions extract() and implement() so hid_ can be used by them. Fix bad indentation in hid-core hid_input_field function that calls extract() function above. Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- drivers/hid/hid-3m-pct.c | 2 +- drivers/hid/hid-a4tech.c | 6 +- drivers/hid/hid-apple.c | 14 ++-- drivers/hid/hid-axff.c | 14 ++-- drivers/hid/hid-belkin.c | 4 +- drivers/hid/hid-cando.c | 2 +- drivers/hid/hid-cherry.c | 3 +- drivers/hid/hid-core.c | 64 +++++++++------- drivers/hid/hid-cypress.c | 4 +- drivers/hid/hid-debug.c | 4 +- drivers/hid/hid-drff.c | 14 ++-- drivers/hid/hid-egalax.c | 2 +- drivers/hid/hid-elecom.c | 3 +- drivers/hid/hid-emsff.c | 13 ++-- drivers/hid/hid-gaff.c | 13 ++-- drivers/hid/hid-input.c | 2 +- drivers/hid/hid-kye.c | 4 +- drivers/hid/hid-lg.c | 15 ++-- drivers/hid/hid-lg2ff.c | 9 +-- drivers/hid/hid-lg3ff.c | 9 +-- drivers/hid/hid-lg4ff.c | 9 +-- drivers/hid/hid-lgff.c | 8 +- drivers/hid/hid-magicmouse.c | 15 ++-- drivers/hid/hid-microsoft.c | 7 +- drivers/hid/hid-monterey.c | 3 +- drivers/hid/hid-mosart.c | 2 +- drivers/hid/hid-ntrig.c | 9 +-- drivers/hid/hid-ortek.c | 3 +- drivers/hid/hid-petalynx.c | 7 +- drivers/hid/hid-picolcd.c | 45 ++++++----- drivers/hid/hid-pl.c | 16 ++-- drivers/hid/hid-prodikeys.c | 21 +++--- drivers/hid/hid-quanta.c | 2 +- drivers/hid/hid-roccat-kone.c | 22 +++--- drivers/hid/hid-roccat-pyra.c | 20 +++-- drivers/hid/hid-roccat.c | 10 +-- drivers/hid/hid-samsung.c | 8 +- drivers/hid/hid-sjoy.c | 16 ++-- drivers/hid/hid-sony.c | 11 ++- drivers/hid/hid-stantum.c | 2 +- drivers/hid/hid-sunplus.c | 3 +- drivers/hid/hid-tmff.c | 27 +++---- drivers/hid/hid-wacom.c | 26 ++++--- drivers/hid/hid-zpff.c | 11 ++- drivers/hid/hid-zydacron.c | 11 ++- drivers/hid/hidraw.c | 12 +-- drivers/hid/usbhid/hid-core.c | 48 ++++++------ drivers/hid/usbhid/hid-pidff.c | 164 +++++++++++++++++++---------------------- drivers/hid/usbhid/hiddev.c | 2 +- drivers/hid/usbhid/usbkbd.c | 24 +++--- include/linux/hid.h | 32 ++++++-- 51 files changed, 398 insertions(+), 399 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-3m-pct.c b/drivers/hid/hid-3m-pct.c index 02d8cd3b1b1b..4546c123eb77 100644 --- a/drivers/hid/hid-3m-pct.c +++ b/drivers/hid/hid-3m-pct.c @@ -274,7 +274,7 @@ static int mmm_probe(struct hid_device *hdev, const struct hid_device_id *id) md = kzalloc(sizeof(struct mmm_data), GFP_KERNEL); if (!md) { - 
dev_err(&hdev->dev, "cannot allocate 3M data\n"); + hid_err(hdev, "cannot allocate 3M data\n"); return -ENOMEM; } hid_set_drvdata(hdev, md); diff --git a/drivers/hid/hid-a4tech.c b/drivers/hid/hid-a4tech.c index 1666c1684e79..902d1dfeb1b5 100644 --- a/drivers/hid/hid-a4tech.c +++ b/drivers/hid/hid-a4tech.c @@ -93,7 +93,7 @@ static int a4_probe(struct hid_device *hdev, const struct hid_device_id *id) a4 = kzalloc(sizeof(*a4), GFP_KERNEL); if (a4 == NULL) { - dev_err(&hdev->dev, "can't alloc device descriptor\n"); + hid_err(hdev, "can't alloc device descriptor\n"); ret = -ENOMEM; goto err_free; } @@ -104,13 +104,13 @@ static int a4_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 8aa71750a23c..61aa71233392 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -16,6 +16,8 @@ * any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -280,8 +282,8 @@ static __u8 *apple_report_fixup(struct hid_device *hdev, __u8 *rdesc, if ((asc->quirks & APPLE_RDESC_JIS) && *rsize >= 60 && rdesc[53] == 0x65 && rdesc[59] == 0x65) { - dev_info(&hdev->dev, "fixing up MacBook JIS keyboard report " - "descriptor\n"); + hid_info(hdev, + "fixing up MacBook JIS keyboard report descriptor\n"); rdesc[53] = rdesc[59] = 0xe7; } return rdesc; @@ -351,7 +353,7 @@ static int apple_probe(struct hid_device *hdev, asc = kzalloc(sizeof(*asc), GFP_KERNEL); if (asc == NULL) { - dev_err(&hdev->dev, "can't alloc apple descriptor\n"); + hid_err(hdev, "can't alloc apple descriptor\n"); return -ENOMEM; } @@ -361,7 +363,7 @@ static int apple_probe(struct hid_device *hdev, ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } @@ -372,7 +374,7 @@ static int apple_probe(struct hid_device *hdev, ret = hid_hw_start(hdev, connect_mask); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } @@ -512,7 +514,7 @@ static int __init apple_init(void) ret = hid_register_driver(&apple_driver); if (ret) - printk(KERN_ERR "can't register apple driver\n"); + pr_err("can't register apple driver\n"); return ret; } diff --git a/drivers/hid/hid-axff.c b/drivers/hid/hid-axff.c index f42ee140738a..e5b961d6ff22 100644 --- a/drivers/hid/hid-axff.c +++ b/drivers/hid/hid-axff.c @@ -73,14 +73,14 @@ static int axff_init(struct hid_device *hid) int error; if (list_empty(report_list)) { - dev_err(&hid->dev, "no output reports found\n"); + hid_err(hid, "no output reports found\n"); return -ENODEV; } report = list_first_entry(report_list, struct hid_report, list); if (report->maxfield < 4) { - dev_err(&hid->dev, "no fields in the report: %d\n", report->maxfield); + hid_err(hid, "no fields in the report: %d\n", report->maxfield); return -ENODEV; } @@ -101,7 +101,7 @@ static int axff_init(struct hid_device *hid) axff->report->field[3]->value[0] = 0x00; usbhid_submit_report(hid, axff->report, USB_DIR_OUT); - dev_info(&hid->dev, "Force Feedback for ACRUX game controllers by Sergei Kolzun\n"); + hid_info(hid, "Force Feedback for ACRUX game controllers by Sergei Kolzun\n"); return 0; @@ -114,17 +114,17 @@ static int ax_probe(struct 
hid_device *hdev, const struct hid_device_id *id) { int error; - dev_dbg(&hdev->dev, "ACRUX HID hardware probe..."); + dev_dbg(&hdev->dev, "ACRUX HID hardware probe...\n"); error = hid_parse(hdev); if (error) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); return error; } error = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (error) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); return error; } @@ -134,7 +134,7 @@ static int ax_probe(struct hid_device *hdev, const struct hid_device_id *id) * Do not fail device initialization completely as device * may still be partially operable, just warn. */ - dev_warn(&hdev->dev, + hid_warn(hdev, "Failed to enable force feedback support, error: %d\n", error); } diff --git a/drivers/hid/hid-belkin.c b/drivers/hid/hid-belkin.c index 4ce7aa3a519f..a1a765a5b08a 100644 --- a/drivers/hid/hid-belkin.c +++ b/drivers/hid/hid-belkin.c @@ -56,14 +56,14 @@ static int belkin_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT | ((quirks & BELKIN_HIDDEV) ? HID_CONNECT_HIDDEV_FORCE : 0)); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-cando.c b/drivers/hid/hid-cando.c index 5925bdcd417d..375b50929a50 100644 --- a/drivers/hid/hid-cando.c +++ b/drivers/hid/hid-cando.c @@ -207,7 +207,7 @@ static int cando_probe(struct hid_device *hdev, const struct hid_device_id *id) td = kmalloc(sizeof(struct cando_data), GFP_KERNEL); if (!td) { - dev_err(&hdev->dev, "cannot allocate Cando Touch data\n"); + hid_err(hdev, "cannot allocate Cando Touch data\n"); return -ENOMEM; } hid_set_drvdata(hdev, td); diff --git a/drivers/hid/hid-cherry.c b/drivers/hid/hid-cherry.c index e880086c2311..888ece68a47c 100644 --- a/drivers/hid/hid-cherry.c +++ b/drivers/hid/hid-cherry.c @@ -30,8 +30,7 @@ static __u8 *ch_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize >= 17 && rdesc[11] == 0x3c && rdesc[12] == 0x02) { - dev_info(&hdev->dev, "fixing up Cherry Cymotion report " - "descriptor\n"); + hid_info(hdev, "fixing up Cherry Cymotion report descriptor\n"); rdesc[11] = rdesc[16] = 0xff; rdesc[12] = rdesc[17] = 0x03; } diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 396f8c7e0076..1fd5e331fa8d 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -14,6 +14,8 @@ * any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -672,7 +674,8 @@ int hid_parse_report(struct hid_device *device, __u8 *start, if (dispatch_type[item.type](parser, &item)) { dbg_hid("item %u %u %u %u parsing failed\n", - item.format, (unsigned)item.size, (unsigned)item.type, (unsigned)item.tag); + item.format, (unsigned)item.size, + (unsigned)item.type, (unsigned)item.tag); goto err; } @@ -737,13 +740,14 @@ static u32 s32ton(__s32 value, unsigned n) * Search linux-kernel and linux-usb-devel archives for "hid-core extract". */ -static __inline__ __u32 extract(__u8 *report, unsigned offset, unsigned n) +static __inline__ __u32 extract(const struct hid_device *hid, __u8 *report, + unsigned offset, unsigned n) { u64 x; if (n > 32) - printk(KERN_WARNING "HID: extract() called with n (%d) > 32! (%s)\n", - n, current->comm); + hid_warn(hid, "extract() called with n (%d) > 32! 
(%s)\n", + n, current->comm); report += offset >> 3; /* adjust byte index */ offset &= 7; /* now only need bit offset into one byte */ @@ -760,18 +764,19 @@ static __inline__ __u32 extract(__u8 *report, unsigned offset, unsigned n) * endianness of register values by considering a register * a "cached" copy of the little endiad bit stream. */ -static __inline__ void implement(__u8 *report, unsigned offset, unsigned n, __u32 value) +static __inline__ void implement(const struct hid_device *hid, __u8 *report, + unsigned offset, unsigned n, __u32 value) { u64 x; u64 m = (1ULL << n) - 1; if (n > 32) - printk(KERN_WARNING "HID: implement() called with n (%d) > 32! (%s)\n", - n, current->comm); + hid_warn(hid, "%s() called with n (%d) > 32! (%s)\n", + __func__, n, current->comm); if (value > m) - printk(KERN_WARNING "HID: implement() called with too large value %d! (%s)\n", - value, current->comm); + hid_warn(hid, "%s() called with too large value %d! (%s)\n", + __func__, value, current->comm); WARN_ON(value > m); value &= m; @@ -892,13 +897,16 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, for (n = 0; n < count; n++) { - value[n] = min < 0 ? snto32(extract(data, offset + n * size, size), size) : - extract(data, offset + n * size, size); + value[n] = min < 0 ? + snto32(extract(hid, data, offset + n * size, size), + size) : + extract(hid, data, offset + n * size, size); - if (!(field->flags & HID_MAIN_ITEM_VARIABLE) /* Ignore report if ErrorRollOver */ - && value[n] >= min && value[n] <= max - && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) - goto exit; + /* Ignore report if ErrorRollOver */ + if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && + value[n] >= min && value[n] <= max && + field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) + goto exit; } for (n = 0; n < count; n++) { @@ -928,7 +936,8 @@ exit: * Output the field into the report. 
*/ -static void hid_output_field(struct hid_field *field, __u8 *data) +static void hid_output_field(const struct hid_device *hid, + struct hid_field *field, __u8 *data) { unsigned count = field->report_count; unsigned offset = field->report_offset; @@ -937,9 +946,11 @@ static void hid_output_field(struct hid_field *field, __u8 *data) for (n = 0; n < count; n++) { if (field->logical_minimum < 0) /* signed values */ - implement(data, offset + n * size, size, s32ton(field->value[n], size)); + implement(hid, data, offset + n * size, size, + s32ton(field->value[n], size)); else /* unsigned values */ - implement(data, offset + n * size, size, field->value[n]); + implement(hid, data, offset + n * size, size, + field->value[n]); } } @@ -956,7 +967,7 @@ void hid_output_report(struct hid_report *report, __u8 *data) memset(data, 0, ((report->size - 1) >> 3) + 1); for (n = 0; n < report->maxfield; n++) - hid_output_field(report->field[n], data); + hid_output_field(report->device, report->field[n], data); } EXPORT_SYMBOL_GPL(hid_output_report); @@ -1169,8 +1180,7 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask) hdev->claimed |= HID_CLAIMED_HIDRAW; if (!hdev->claimed) { - dev_err(&hdev->dev, "claimed by neither input, hiddev nor " - "hidraw\n"); + hid_err(hdev, "claimed by neither input, hiddev nor hidraw\n"); return -ENODEV; } @@ -1210,9 +1220,9 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask) bus = ""; } - dev_info(&hdev->dev, "%s: %s HID v%x.%02x %s [%s] on %s\n", - buf, bus, hdev->version >> 8, hdev->version & 0xff, - type, hdev->name, hdev->phys); + hid_info(hdev, "%s: %s HID v%x.%02x %s [%s] on %s\n", + buf, bus, hdev->version >> 8, hdev->version & 0xff, + type, hdev->name, hdev->phys); return 0; } @@ -1956,12 +1966,12 @@ static int __init hid_init(void) int ret; if (hid_debug) - printk(KERN_WARNING "HID: hid_debug is now used solely for parser and driver debugging.\n" - "HID: debugfs is now used for inspecting the device (report descriptor, reports)\n"); + pr_warn("hid_debug is now used solely for parser and driver debugging.\n" + "debugfs is now used for inspecting the device (report descriptor, reports)\n"); ret = bus_register(&hid_bus_type); if (ret) { - printk(KERN_ERR "HID: can't register hid bus\n"); + pr_err("can't register hid bus\n"); goto err; } diff --git a/drivers/hid/hid-cypress.c b/drivers/hid/hid-cypress.c index 4cd0e2345991..2f0be4c66af7 100644 --- a/drivers/hid/hid-cypress.c +++ b/drivers/hid/hid-cypress.c @@ -107,13 +107,13 @@ static int cp_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 75c5e23d09d2..555382fc7417 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -26,6 +26,8 @@ * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -393,7 +395,7 @@ char *hid_resolv_usage(unsigned usage, struct seq_file *f) { buf = resolv_usage_page(usage >> 16, f); if (IS_ERR(buf)) { - printk(KERN_ERR "error allocating HID debug buffer\n"); + pr_err("error allocating HID debug buffer\n"); return NULL; } diff --git a/drivers/hid/hid-drff.c b/drivers/hid/hid-drff.c index 
968b04f9b796..afcf3d67eb02 100644 --- a/drivers/hid/hid-drff.c +++ b/drivers/hid/hid-drff.c @@ -96,18 +96,18 @@ static int drff_init(struct hid_device *hid) int error; if (list_empty(report_list)) { - dev_err(&hid->dev, "no output reports found\n"); + hid_err(hid, "no output reports found\n"); return -ENODEV; } report = list_first_entry(report_list, struct hid_report, list); if (report->maxfield < 1) { - dev_err(&hid->dev, "no fields in the report\n"); + hid_err(hid, "no fields in the report\n"); return -ENODEV; } if (report->field[0]->report_count < 7) { - dev_err(&hid->dev, "not enough values in the field\n"); + hid_err(hid, "not enough values in the field\n"); return -ENODEV; } @@ -133,8 +133,8 @@ static int drff_init(struct hid_device *hid) drff->report->field[0]->value[6] = 0x00; usbhid_submit_report(hid, drff->report, USB_DIR_OUT); - dev_info(&hid->dev, "Force Feedback for DragonRise Inc. game " - "controllers by Richard Walmsley \n"); + hid_info(hid, "Force Feedback for DragonRise Inc. " + "game controllers by Richard Walmsley \n"); return 0; } @@ -153,13 +153,13 @@ static int dr_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err; } diff --git a/drivers/hid/hid-egalax.c b/drivers/hid/hid-egalax.c index 54b017ad258d..bbab6cff3518 100644 --- a/drivers/hid/hid-egalax.c +++ b/drivers/hid/hid-egalax.c @@ -223,7 +223,7 @@ static int egalax_probe(struct hid_device *hdev, const struct hid_device_id *id) td = kmalloc(sizeof(struct egalax_data), GFP_KERNEL); if (!td) { - dev_err(&hdev->dev, "cannot allocate eGalax data\n"); + hid_err(hdev, "cannot allocate eGalax data\n"); return -ENOMEM; } hid_set_drvdata(hdev, td); diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c index 6e31f305397d..79d0c61e7214 100644 --- a/drivers/hid/hid-elecom.c +++ b/drivers/hid/hid-elecom.c @@ -24,8 +24,7 @@ static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize >= 48 && rdesc[46] == 0x05 && rdesc[47] == 0x0c) { - dev_info(&hdev->dev, "Fixing up Elecom BM084 " - "report descriptor.\n"); + hid_info(hdev, "Fixing up Elecom BM084 report descriptor\n"); rdesc[47] = 0x00; } return rdesc; diff --git a/drivers/hid/hid-emsff.c b/drivers/hid/hid-emsff.c index c61b192d7cff..81877c67caea 100644 --- a/drivers/hid/hid-emsff.c +++ b/drivers/hid/hid-emsff.c @@ -68,18 +68,18 @@ static int emsff_init(struct hid_device *hid) int error; if (list_empty(report_list)) { - dev_err(&hid->dev, "no output reports found\n"); + hid_err(hid, "no output reports found\n"); return -ENODEV; } report = list_first_entry(report_list, struct hid_report, list); if (report->maxfield < 1) { - dev_err(&hid->dev, "no fields in the report\n"); + hid_err(hid, "no fields in the report\n"); return -ENODEV; } if (report->field[0]->report_count < 7) { - dev_err(&hid->dev, "not enough values in the field\n"); + hid_err(hid, "not enough values in the field\n"); return -ENODEV; } @@ -105,8 +105,7 @@ static int emsff_init(struct hid_device *hid) emsff->report->field[0]->value[6] = 0x00; usbhid_submit_report(hid, emsff->report, USB_DIR_OUT); - dev_info(&hid->dev, "force feedback for EMS based devices by " - "Ignaz Forster \n"); + hid_info(hid, "force feedback for EMS based devices by Ignaz Forster \n"); return 
0; } @@ -117,13 +116,13 @@ static int ems_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err; } diff --git a/drivers/hid/hid-gaff.c b/drivers/hid/hid-gaff.c index 88dfcf49a5d7..279ba530003c 100644 --- a/drivers/hid/hid-gaff.c +++ b/drivers/hid/hid-gaff.c @@ -87,7 +87,7 @@ static int gaff_init(struct hid_device *hid) int error; if (list_empty(report_list)) { - dev_err(&hid->dev, "no output reports found\n"); + hid_err(hid, "no output reports found\n"); return -ENODEV; } @@ -95,12 +95,12 @@ static int gaff_init(struct hid_device *hid) report = list_entry(report_ptr, struct hid_report, list); if (report->maxfield < 1) { - dev_err(&hid->dev, "no fields in the report\n"); + hid_err(hid, "no fields in the report\n"); return -ENODEV; } if (report->field[0]->report_count < 6) { - dev_err(&hid->dev, "not enough values in the field\n"); + hid_err(hid, "not enough values in the field\n"); return -ENODEV; } @@ -128,8 +128,7 @@ static int gaff_init(struct hid_device *hid) usbhid_submit_report(hid, gaff->report, USB_DIR_OUT); - dev_info(&hid->dev, "Force Feedback for GreenAsia 0x12" - " devices by Lukasz Lubojanski \n"); + hid_info(hid, "Force Feedback for GreenAsia 0x12 devices by Lukasz Lubojanski \n"); return 0; } @@ -148,13 +147,13 @@ static int ga_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err; } diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index b718f7144ce1..a1a2206714fc 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -826,7 +826,7 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) if (!hidinput || !input_dev) { kfree(hidinput); input_free_device(input_dev); - err_hid("Out of memory during hid input probe"); + hid_err(hid, "Out of memory during hid input probe\n"); goto out_unwind; } diff --git a/drivers/hid/hid-kye.c b/drivers/hid/hid-kye.c index 817247ee006c..f2ba9efc3a53 100644 --- a/drivers/hid/hid-kye.c +++ b/drivers/hid/hid-kye.c @@ -32,8 +32,8 @@ static __u8 *kye_report_fixup(struct hid_device *hdev, __u8 *rdesc, rdesc[65] == 0x29 && rdesc[66] == 0x0f && rdesc[71] == 0x75 && rdesc[72] == 0x08 && rdesc[73] == 0x95 && rdesc[74] == 0x01) { - dev_info(&hdev->dev, "fixing up Kye/Genius Ergo Mouse report " - "descriptor\n"); + hid_info(hdev, + "fixing up Kye/Genius Ergo Mouse report descriptor\n"); rdesc[62] = 0x09; rdesc[64] = 0x04; rdesc[66] = 0x07; diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c index b629fba5a057..aef4104da141 100644 --- a/drivers/hid/hid-lg.c +++ b/drivers/hid/hid-lg.c @@ -53,23 +53,22 @@ static __u8 *lg_report_fixup(struct hid_device *hdev, __u8 *rdesc, if ((quirks & LG_RDESC) && *rsize >= 90 && rdesc[83] == 0x26 && rdesc[84] == 0x8c && rdesc[85] == 0x02) { - dev_info(&hdev->dev, "fixing up Logitech keyboard report " - "descriptor\n"); + hid_info(hdev, + "fixing up Logitech keyboard report descriptor\n"); rdesc[84] = rdesc[89] = 0x4d; rdesc[85] = rdesc[90] = 0x10; } if ((quirks & LG_RDESC_REL_ABS) && *rsize >= 50 
&& rdesc[32] == 0x81 && rdesc[33] == 0x06 && rdesc[49] == 0x81 && rdesc[50] == 0x06) { - dev_info(&hdev->dev, "fixing up rel/abs in Logitech " - "report descriptor\n"); + hid_info(hdev, + "fixing up rel/abs in Logitech report descriptor\n"); rdesc[33] = rdesc[50] = 0x02; } if ((quirks & LG_FF4) && *rsize >= 101 && rdesc[41] == 0x95 && rdesc[42] == 0x0B && rdesc[47] == 0x05 && rdesc[48] == 0x09) { - dev_info(&hdev->dev, "fixing up Logitech Speed Force Wireless " - "button descriptor\n"); + hid_info(hdev, "fixing up Logitech Speed Force Wireless button descriptor\n"); rdesc[41] = 0x05; rdesc[42] = 0x09; rdesc[47] = 0x95; @@ -288,7 +287,7 @@ static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } @@ -297,7 +296,7 @@ static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_hw_start(hdev, connect_mask); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-lg2ff.c b/drivers/hid/hid-lg2ff.c index 4258253c36b3..3c31bc650e5d 100644 --- a/drivers/hid/hid-lg2ff.c +++ b/drivers/hid/hid-lg2ff.c @@ -72,18 +72,18 @@ int lg2ff_init(struct hid_device *hid) int error; if (list_empty(report_list)) { - dev_err(&hid->dev, "no output report found\n"); + hid_err(hid, "no output report found\n"); return -ENODEV; } report = list_entry(report_list->next, struct hid_report, list); if (report->maxfield < 1) { - dev_err(&hid->dev, "output report is empty\n"); + hid_err(hid, "output report is empty\n"); return -ENODEV; } if (report->field[0]->report_count < 7) { - dev_err(&hid->dev, "not enough values in the field\n"); + hid_err(hid, "not enough values in the field\n"); return -ENODEV; } @@ -110,8 +110,7 @@ int lg2ff_init(struct hid_device *hid) usbhid_submit_report(hid, report, USB_DIR_OUT); - dev_info(&hid->dev, "Force feedback for Logitech RumblePad/Rumblepad 2 by " - "Anssi Hannula \n"); + hid_info(hid, "Force feedback for Logitech RumblePad/Rumblepad 2 by Anssi Hannula \n"); return 0; } diff --git a/drivers/hid/hid-lg3ff.c b/drivers/hid/hid-lg3ff.c index 4002832ee4af..f98644c26c1d 100644 --- a/drivers/hid/hid-lg3ff.c +++ b/drivers/hid/hid-lg3ff.c @@ -141,20 +141,20 @@ int lg3ff_init(struct hid_device *hid) /* Find the report to use */ if (list_empty(report_list)) { - err_hid("No output report found"); + hid_err(hid, "No output report found\n"); return -1; } /* Check that the report looks ok */ report = list_entry(report_list->next, struct hid_report, list); if (!report) { - err_hid("NULL output report"); + hid_err(hid, "NULL output report\n"); return -1; } field = report->field[0]; if (!field) { - err_hid("NULL field"); + hid_err(hid, "NULL field\n"); return -1; } @@ -169,8 +169,7 @@ int lg3ff_init(struct hid_device *hid) if (test_bit(FF_AUTOCENTER, dev->ffbit)) dev->ff->set_autocenter = hid_lg3ff_set_autocenter; - dev_info(&hid->dev, "Force feedback for Logitech Flight System G940 by " - "Gary Stein \n"); + hid_info(hid, "Force feedback for Logitech Flight System G940 by Gary Stein \n"); return 0; } diff --git a/drivers/hid/hid-lg4ff.c b/drivers/hid/hid-lg4ff.c index 7eef5a2ce948..fa550c8e1d1b 100644 --- a/drivers/hid/hid-lg4ff.c +++ b/drivers/hid/hid-lg4ff.c @@ -101,20 +101,20 @@ int lg4ff_init(struct hid_device *hid) /* Find the report to use */ if (list_empty(report_list)) { - err_hid("No output report found"); + hid_err(hid, "No output report 
found\n"); return -1; } /* Check that the report looks ok */ report = list_entry(report_list->next, struct hid_report, list); if (!report) { - err_hid("NULL output report"); + hid_err(hid, "NULL output report\n"); return -1; } field = report->field[0]; if (!field) { - err_hid("NULL field"); + hid_err(hid, "NULL field\n"); return -1; } @@ -129,8 +129,7 @@ int lg4ff_init(struct hid_device *hid) if (test_bit(FF_AUTOCENTER, dev->ffbit)) dev->ff->set_autocenter = hid_lg4ff_set_autocenter; - dev_info(&hid->dev, "Force feedback for Logitech Speed Force Wireless by " - "Simon Wood \n"); + hid_info(hid, "Force feedback for Logitech Speed Force Wireless by Simon Wood \n"); return 0; } diff --git a/drivers/hid/hid-lgff.c b/drivers/hid/hid-lgff.c index 61142b76a9b1..90d0ef2c92be 100644 --- a/drivers/hid/hid-lgff.c +++ b/drivers/hid/hid-lgff.c @@ -27,6 +27,8 @@ * e-mail - mail your message to */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -146,7 +148,7 @@ int lgff_init(struct hid_device* hid) /* Find the report to use */ if (list_empty(report_list)) { - err_hid("No output report found"); + hid_err(hid, "No output report found\n"); return -1; } @@ -154,7 +156,7 @@ int lgff_init(struct hid_device* hid) report = list_entry(report_list->next, struct hid_report, list); field = report->field[0]; if (!field) { - err_hid("NULL field"); + hid_err(hid, "NULL field\n"); return -1; } @@ -176,7 +178,7 @@ int lgff_init(struct hid_device* hid) if ( test_bit(FF_AUTOCENTER, dev->ffbit) ) dev->ff->set_autocenter = hid_lgff_set_autocenter; - printk(KERN_INFO "Force feedback for Logitech force feedback devices by Johann Deneux \n"); + pr_info("Force feedback for Logitech force feedback devices by Johann Deneux \n"); return 0; } diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c index e6dc15171664..11306b3d9c56 100644 --- a/drivers/hid/hid-magicmouse.c +++ b/drivers/hid/hid-magicmouse.c @@ -12,6 +12,8 @@ * any later version. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -446,7 +448,7 @@ static int magicmouse_probe(struct hid_device *hdev, msc = kzalloc(sizeof(*msc), GFP_KERNEL); if (msc == NULL) { - dev_err(&hdev->dev, "can't alloc magicmouse descriptor\n"); + hid_err(hdev, "can't alloc magicmouse descriptor\n"); return -ENOMEM; } @@ -459,13 +461,13 @@ static int magicmouse_probe(struct hid_device *hdev, ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "magicmouse hid parse failed\n"); + hid_err(hdev, "magicmouse hid parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { - dev_err(&hdev->dev, "magicmouse hw start failed\n"); + hid_err(hdev, "magicmouse hw start failed\n"); goto err_free; } @@ -486,7 +488,7 @@ static int magicmouse_probe(struct hid_device *hdev, } if (!report) { - dev_err(&hdev->dev, "unable to register touch report\n"); + hid_err(hdev, "unable to register touch report\n"); ret = -ENOMEM; goto err_stop_hw; } @@ -495,8 +497,7 @@ static int magicmouse_probe(struct hid_device *hdev, ret = hdev->hid_output_raw_report(hdev, feature, sizeof(feature), HID_FEATURE_REPORT); if (ret != sizeof(feature)) { - dev_err(&hdev->dev, "unable to request touch data (%d)\n", - ret); + hid_err(hdev, "unable to request touch data (%d)\n", ret); goto err_stop_hw; } @@ -540,7 +541,7 @@ static int __init magicmouse_init(void) ret = hid_register_driver(&magicmouse_driver); if (ret) - printk(KERN_ERR "can't register magicmouse driver\n"); + pr_err("can't register magicmouse driver\n"); return ret; } diff --git a/drivers/hid/hid-microsoft.c b/drivers/hid/hid-microsoft.c index dc618c33d0a2..0f6fc54dc196 100644 --- a/drivers/hid/hid-microsoft.c +++ b/drivers/hid/hid-microsoft.c @@ -40,8 +40,7 @@ static __u8 *ms_report_fixup(struct hid_device *hdev, __u8 *rdesc, if ((quirks & MS_RDESC) && *rsize == 571 && rdesc[557] == 0x19 && rdesc[559] == 0x29) { - dev_info(&hdev->dev, "fixing up Microsoft Wireless Receiver " - "Model 1028 report descriptor\n"); + hid_info(hdev, "fixing up Microsoft Wireless Receiver Model 1028 report descriptor\n"); rdesc[557] = 0x35; rdesc[559] = 0x45; } @@ -155,14 +154,14 @@ static int ms_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT | ((quirks & MS_HIDINPUT) ? 
HID_CONNECT_HIDINPUT_FORCE : 0)); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-monterey.c b/drivers/hid/hid-monterey.c index c95c31e2d869..dedf757781ae 100644 --- a/drivers/hid/hid-monterey.c +++ b/drivers/hid/hid-monterey.c @@ -26,8 +26,7 @@ static __u8 *mr_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize >= 30 && rdesc[29] == 0x05 && rdesc[30] == 0x09) { - dev_info(&hdev->dev, "fixing up button/consumer in HID report " - "descriptor\n"); + hid_info(hdev, "fixing up button/consumer in HID report descriptor\n"); rdesc[30] = 0x0c; } return rdesc; diff --git a/drivers/hid/hid-mosart.c b/drivers/hid/hid-mosart.c index ac5421d568f1..0668685380d5 100644 --- a/drivers/hid/hid-mosart.c +++ b/drivers/hid/hid-mosart.c @@ -199,7 +199,7 @@ static int mosart_probe(struct hid_device *hdev, const struct hid_device_id *id) td = kmalloc(sizeof(struct mosart_data), GFP_KERNEL); if (!td) { - dev_err(&hdev->dev, "cannot allocate MosArt data\n"); + hid_err(hdev, "cannot allocate MosArt data\n"); return -ENOMEM; } td->valid = false; diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c index 69169efa1e16..beb403421e72 100644 --- a/drivers/hid/hid-ntrig.c +++ b/drivers/hid/hid-ntrig.c @@ -130,8 +130,7 @@ static void ntrig_report_version(struct hid_device *hdev) if (ret == 8) { ret = ntrig_version_string(&data[2], buf); - dev_info(&hdev->dev, - "Firmware version: %s (%02x%02x %02x%02x)\n", + hid_info(hdev, "Firmware version: %s (%02x%02x %02x%02x)\n", buf, data[2], data[3], data[4], data[5]); } @@ -831,7 +830,7 @@ static int ntrig_probe(struct hid_device *hdev, const struct hid_device_id *id) nd = kmalloc(sizeof(struct ntrig_data), GFP_KERNEL); if (!nd) { - dev_err(&hdev->dev, "cannot allocate N-Trig data\n"); + hid_err(hdev, "cannot allocate N-Trig data\n"); return -ENOMEM; } @@ -850,13 +849,13 @@ static int ntrig_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-ortek.c b/drivers/hid/hid-ortek.c index 2e79716dca31..e90edfc63051 100644 --- a/drivers/hid/hid-ortek.c +++ b/drivers/hid/hid-ortek.c @@ -23,8 +23,7 @@ static __u8 *ortek_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize >= 56 && rdesc[54] == 0x25 && rdesc[55] == 0x01) { - dev_info(&hdev->dev, "Fixing up Ortek WKB-2000 " - "report descriptor.\n"); + hid_info(hdev, "Fixing up Ortek WKB-2000 report descriptor\n"); rdesc[55] = 0x92; } return rdesc; diff --git a/drivers/hid/hid-petalynx.c b/drivers/hid/hid-petalynx.c index 308d6ae48a3e..f1ea3ff8a98d 100644 --- a/drivers/hid/hid-petalynx.c +++ b/drivers/hid/hid-petalynx.c @@ -29,8 +29,7 @@ static __u8 *pl_report_fixup(struct hid_device *hdev, __u8 *rdesc, if (*rsize >= 60 && rdesc[39] == 0x2a && rdesc[40] == 0xf5 && rdesc[41] == 0x00 && rdesc[59] == 0x26 && rdesc[60] == 0xf9 && rdesc[61] == 0x00) { - dev_info(&hdev->dev, "fixing up Petalynx Maxter Remote report " - "descriptor\n"); + hid_info(hdev, "fixing up Petalynx Maxter Remote report descriptor\n"); rdesc[60] = 0xfa; rdesc[40] = 0xfa; } @@ -77,13 +76,13 @@ static int pl_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = 
hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-picolcd.c b/drivers/hid/hid-picolcd.c index 38565fee7bf1..e4ce47156106 100644 --- a/drivers/hid/hid-picolcd.c +++ b/drivers/hid/hid-picolcd.c @@ -253,7 +253,7 @@ static struct hid_report *picolcd_report(int id, struct hid_device *hdev, int di if (report->id == id) return report; } - dev_warn(&hdev->dev, "No report with id 0x%x found\n", id); + hid_warn(hdev, "No report with id 0x%x found\n", id); return NULL; } @@ -1329,7 +1329,7 @@ static int picolcd_check_version(struct hid_device *hdev) verinfo = picolcd_send_and_wait(hdev, REPORT_VERSION, NULL, 0); if (!verinfo) { - dev_err(&hdev->dev, "no version response from PicoLCD"); + hid_err(hdev, "no version response from PicoLCD\n"); return -ENODEV; } @@ -1337,14 +1337,14 @@ static int picolcd_check_version(struct hid_device *hdev) data->version[0] = verinfo->raw_data[1]; data->version[1] = verinfo->raw_data[0]; if (data->status & PICOLCD_BOOTLOADER) { - dev_info(&hdev->dev, "PicoLCD, bootloader version %d.%d\n", - verinfo->raw_data[1], verinfo->raw_data[0]); + hid_info(hdev, "PicoLCD, bootloader version %d.%d\n", + verinfo->raw_data[1], verinfo->raw_data[0]); } else { - dev_info(&hdev->dev, "PicoLCD, firmware version %d.%d\n", - verinfo->raw_data[1], verinfo->raw_data[0]); + hid_info(hdev, "PicoLCD, firmware version %d.%d\n", + verinfo->raw_data[1], verinfo->raw_data[0]); } } else { - dev_err(&hdev->dev, "confused, got unexpected version response from PicoLCD\n"); + hid_err(hdev, "confused, got unexpected version response from PicoLCD\n"); ret = -EINVAL; } kfree(verinfo); @@ -2328,8 +2328,7 @@ static void picolcd_init_devfs(struct picolcd_data *data, (flash_w ? S_IWUSR : 0) | (flash_r ? 
S_IRUSR : 0), hdev->debug_dir, data, &picolcd_debug_flash_fops); } else if (flash_r || flash_w) - dev_warn(&hdev->dev, "Unexpected FLASH access reports, " - "please submit rdesc for review\n"); + hid_warn(hdev, "Unexpected FLASH access reports, please submit rdesc for review\n"); } static void picolcd_exit_devfs(struct picolcd_data *data) @@ -2457,13 +2456,13 @@ static int picolcd_init_keys(struct picolcd_data *data, return -ENODEV; if (report->maxfield != 1 || report->field[0]->report_count != 2 || report->field[0]->report_size != 8) { - dev_err(&hdev->dev, "unsupported KEY_STATE report"); + hid_err(hdev, "unsupported KEY_STATE report\n"); return -EINVAL; } idev = input_allocate_device(); if (idev == NULL) { - dev_err(&hdev->dev, "failed to allocate input device"); + hid_err(hdev, "failed to allocate input device\n"); return -ENOMEM; } input_set_drvdata(idev, hdev); @@ -2485,7 +2484,7 @@ static int picolcd_init_keys(struct picolcd_data *data, input_set_capability(idev, EV_KEY, data->keycode[i]); error = input_register_device(idev); if (error) { - dev_err(&hdev->dev, "error registering the input device"); + hid_err(hdev, "error registering the input device\n"); input_free_device(idev); return error; } @@ -2522,9 +2521,8 @@ static int picolcd_probe_lcd(struct hid_device *hdev, struct picolcd_data *data) return error; if (data->version[0] != 0 && data->version[1] != 3) - dev_info(&hdev->dev, "Device with untested firmware revision, " - "please submit /sys/kernel/debug/hid/%s/rdesc for this device.\n", - dev_name(&hdev->dev)); + hid_info(hdev, "Device with untested firmware revision, please submit /sys/kernel/debug/hid/%s/rdesc for this device.\n", + dev_name(&hdev->dev)); /* Setup keypad input device */ error = picolcd_init_keys(data, picolcd_in_report(REPORT_KEY_STATE, hdev)); @@ -2581,9 +2579,8 @@ static int picolcd_probe_bootloader(struct hid_device *hdev, struct picolcd_data return error; if (data->version[0] != 1 && data->version[1] != 0) - dev_info(&hdev->dev, "Device with untested bootloader revision, " - "please submit /sys/kernel/debug/hid/%s/rdesc for this device.\n", - dev_name(&hdev->dev)); + hid_info(hdev, "Device with untested bootloader revision, please submit /sys/kernel/debug/hid/%s/rdesc for this device.\n", + dev_name(&hdev->dev)); picolcd_init_devfs(data, NULL, NULL, picolcd_out_report(REPORT_BL_READ_MEMORY, hdev), @@ -2605,7 +2602,7 @@ static int picolcd_probe(struct hid_device *hdev, */ data = kzalloc(sizeof(struct picolcd_data), GFP_KERNEL); if (data == NULL) { - dev_err(&hdev->dev, "can't allocate space for Minibox PicoLCD device data\n"); + hid_err(hdev, "can't allocate space for Minibox PicoLCD device data\n"); error = -ENOMEM; goto err_no_cleanup; } @@ -2621,7 +2618,7 @@ static int picolcd_probe(struct hid_device *hdev, /* Parse the device reports and start it up */ error = hid_parse(hdev); if (error) { - dev_err(&hdev->dev, "device report parse failed\n"); + hid_err(hdev, "device report parse failed\n"); goto err_cleanup_data; } @@ -2631,25 +2628,25 @@ static int picolcd_probe(struct hid_device *hdev, error = hid_hw_start(hdev, 0); hdev->claimed = 0; if (error) { - dev_err(&hdev->dev, "hardware start failed\n"); + hid_err(hdev, "hardware start failed\n"); goto err_cleanup_data; } error = hid_hw_open(hdev); if (error) { - dev_err(&hdev->dev, "failed to open input interrupt pipe for key and IR events\n"); + hid_err(hdev, "failed to open input interrupt pipe for key and IR events\n"); goto err_cleanup_hid_hw; } error = device_create_file(&hdev->dev, 
&dev_attr_operation_mode_delay); if (error) { - dev_err(&hdev->dev, "failed to create sysfs attributes\n"); + hid_err(hdev, "failed to create sysfs attributes\n"); goto err_cleanup_hid_ll; } error = device_create_file(&hdev->dev, &dev_attr_operation_mode); if (error) { - dev_err(&hdev->dev, "failed to create sysfs attributes\n"); + hid_err(hdev, "failed to create sysfs attributes\n"); goto err_cleanup_sysfs1; } diff --git a/drivers/hid/hid-pl.c b/drivers/hid/hid-pl.c index 9f41e2bd8483..06e5300d43d2 100644 --- a/drivers/hid/hid-pl.c +++ b/drivers/hid/hid-pl.c @@ -103,7 +103,7 @@ static int plff_init(struct hid_device *hid) */ if (list_empty(report_list)) { - dev_err(&hid->dev, "no output reports found\n"); + hid_err(hid, "no output reports found\n"); return -ENODEV; } @@ -112,14 +112,13 @@ static int plff_init(struct hid_device *hid) report_ptr = report_ptr->next; if (report_ptr == report_list) { - dev_err(&hid->dev, "required output report is " - "missing\n"); + hid_err(hid, "required output report is missing\n"); return -ENODEV; } report = list_entry(report_ptr, struct hid_report, list); if (report->maxfield < 1) { - dev_err(&hid->dev, "no fields in the report\n"); + hid_err(hid, "no fields in the report\n"); return -ENODEV; } @@ -137,7 +136,7 @@ static int plff_init(struct hid_device *hid) weak = &report->field[3]->value[0]; debug("detected 4-field device"); } else { - dev_err(&hid->dev, "not enough fields or values\n"); + hid_err(hid, "not enough fields or values\n"); return -ENODEV; } @@ -164,8 +163,7 @@ static int plff_init(struct hid_device *hid) usbhid_submit_report(hid, plff->report, USB_DIR_OUT); } - dev_info(&hid->dev, "Force feedback for PantherLord/GreenAsia " - "devices by Anssi Hannula \n"); + hid_info(hid, "Force feedback for PantherLord/GreenAsia devices by Anssi Hannula \n"); return 0; } @@ -185,13 +183,13 @@ static int pl_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err; } diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c index 5f0fa6c1bf87..ab19f2905d27 100644 --- a/drivers/hid/hid-prodikeys.c +++ b/drivers/hid/hid-prodikeys.c @@ -16,6 +16,8 @@ * any later version. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -285,11 +287,11 @@ static int pcmidi_get_output_report(struct pcmidi_snd *pm) continue; if (report->maxfield < 1) { - dev_err(&hdev->dev, "output report is empty\n"); + hid_err(hdev, "output report is empty\n"); break; } if (report->field[0]->report_count != 2) { - dev_err(&hdev->dev, "field count too low\n"); + hid_err(hdev, "field count too low\n"); break; } pm->pcmidi_report6 = report; @@ -746,8 +748,8 @@ static __u8 *pk_report_fixup(struct hid_device *hdev, __u8 *rdesc, if (*rsize == 178 && rdesc[111] == 0x06 && rdesc[112] == 0x00 && rdesc[113] == 0xff) { - dev_info(&hdev->dev, "fixing up pc-midi keyboard report " - "descriptor\n"); + hid_info(hdev, + "fixing up pc-midi keyboard report descriptor\n"); rdesc[144] = 0x18; /* report 4: was 0x10 report count */ } @@ -805,7 +807,7 @@ static int pk_probe(struct hid_device *hdev, const struct hid_device_id *id) pk = kzalloc(sizeof(*pk), GFP_KERNEL); if (pk == NULL) { - dev_err(&hdev->dev, "prodikeys: can't alloc descriptor\n"); + hid_err(hdev, "can't alloc descriptor\n"); return -ENOMEM; } @@ -813,8 +815,7 @@ static int pk_probe(struct hid_device *hdev, const struct hid_device_id *id) pm = kzalloc(sizeof(*pm), GFP_KERNEL); if (pm == NULL) { - dev_err(&hdev->dev, - "prodikeys: can't alloc descriptor\n"); + hid_err(hdev, "can't alloc descriptor\n"); ret = -ENOMEM; goto err_free; } @@ -827,7 +828,7 @@ static int pk_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "prodikeys: hid parse failed\n"); + hid_err(hdev, "hid parse failed\n"); goto err_free; } @@ -837,7 +838,7 @@ static int pk_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { - dev_err(&hdev->dev, "prodikeys: hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } @@ -896,7 +897,7 @@ static int pk_init(void) ret = hid_register_driver(&pk_driver); if (ret) - printk(KERN_ERR "can't register prodikeys driver\n"); + pr_err("can't register prodikeys driver\n"); return ret; } diff --git a/drivers/hid/hid-quanta.c b/drivers/hid/hid-quanta.c index 54d3db50605b..87a54df4d4ac 100644 --- a/drivers/hid/hid-quanta.c +++ b/drivers/hid/hid-quanta.c @@ -195,7 +195,7 @@ static int quanta_probe(struct hid_device *hdev, const struct hid_device_id *id) td = kmalloc(sizeof(struct quanta_data), GFP_KERNEL); if (!td) { - dev_err(&hdev->dev, "cannot allocate Quanta Touch data\n"); + hid_err(hdev, "cannot allocate Quanta Touch data\n"); return -ENOMEM; } td->valid = false; diff --git a/drivers/hid/hid-roccat-kone.c b/drivers/hid/hid-roccat-kone.c index f77695762cb5..73199de2e37f 100644 --- a/drivers/hid/hid-roccat-kone.c +++ b/drivers/hid/hid-roccat-kone.c @@ -90,8 +90,7 @@ static int kone_check_write(struct usb_device *usb_dev) kfree(data); return 0; } else { /* unknown answer */ - dev_err(&usb_dev->dev, "got retval %d when checking write\n", - *data); + hid_err(usb_dev, "got retval %d when checking write\n", *data); kfree(data); return -EIO; } @@ -556,7 +555,7 @@ static ssize_t kone_sysfs_set_tcu(struct device *dev, retval = kone_set_settings(usb_dev, &kone->settings); if (retval) { - dev_err(&usb_dev->dev, "couldn't set tcu state\n"); + hid_err(usb_dev, "couldn't set tcu state\n"); /* * try to reread valid settings into buffer overwriting * first error code @@ -570,7 +569,7 @@ static ssize_t kone_sysfs_set_tcu(struct device *dev, retval = size; exit_no_settings: - 
dev_err(&usb_dev->dev, "couldn't read settings\n"); + hid_err(usb_dev, "couldn't read settings\n"); exit_unlock: mutex_unlock(&kone->kone_lock); return retval; @@ -818,21 +817,20 @@ static int kone_init_specials(struct hid_device *hdev) kone = kzalloc(sizeof(*kone), GFP_KERNEL); if (!kone) { - dev_err(&hdev->dev, "can't alloc device descriptor\n"); + hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, kone); retval = kone_init_kone_device_struct(usb_dev, kone); if (retval) { - dev_err(&hdev->dev, - "couldn't init struct kone_device\n"); + hid_err(hdev, "couldn't init struct kone_device\n"); goto exit_free; } retval = roccat_connect(hdev); if (retval < 0) { - dev_err(&hdev->dev, "couldn't init char dev\n"); + hid_err(hdev, "couldn't init char dev\n"); /* be tolerant about not getting chrdev */ } else { kone->roccat_claimed = 1; @@ -841,7 +839,7 @@ static int kone_init_specials(struct hid_device *hdev) retval = kone_create_sysfs_attributes(intf); if (retval) { - dev_err(&hdev->dev, "cannot create sysfs files\n"); + hid_err(hdev, "cannot create sysfs files\n"); goto exit_free; } } else { @@ -876,19 +874,19 @@ static int kone_probe(struct hid_device *hdev, const struct hid_device_id *id) retval = hid_parse(hdev); if (retval) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto exit; } retval = kone_init_specials(hdev); if (retval) { - dev_err(&hdev->dev, "couldn't install mouse\n"); + hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } diff --git a/drivers/hid/hid-roccat-pyra.c b/drivers/hid/hid-roccat-pyra.c index 9bf23047892a..e7b4affe2664 100644 --- a/drivers/hid/hid-roccat-pyra.c +++ b/drivers/hid/hid-roccat-pyra.c @@ -87,9 +87,8 @@ static int pyra_receive_control_status(struct usb_device *usb_dev) control.value == 1) return 0; else { - dev_err(&usb_dev->dev, "receive control status: " - "unknown response 0x%x 0x%x\n", - control.request, control.value); + hid_err(usb_dev, "receive control status: unknown response 0x%x 0x%x\n", + control.request, control.value); return -EINVAL; } } @@ -770,21 +769,20 @@ static int pyra_init_specials(struct hid_device *hdev) pyra = kzalloc(sizeof(*pyra), GFP_KERNEL); if (!pyra) { - dev_err(&hdev->dev, "can't alloc device descriptor\n"); + hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, pyra); retval = pyra_init_pyra_device_struct(usb_dev, pyra); if (retval) { - dev_err(&hdev->dev, - "couldn't init struct pyra_device\n"); + hid_err(hdev, "couldn't init struct pyra_device\n"); goto exit_free; } retval = roccat_connect(hdev); if (retval < 0) { - dev_err(&hdev->dev, "couldn't init char dev\n"); + hid_err(hdev, "couldn't init char dev\n"); } else { pyra->chrdev_minor = retval; pyra->roccat_claimed = 1; @@ -792,7 +790,7 @@ static int pyra_init_specials(struct hid_device *hdev) retval = pyra_create_sysfs_attributes(intf); if (retval) { - dev_err(&hdev->dev, "cannot create sysfs files\n"); + hid_err(hdev, "cannot create sysfs files\n"); goto exit_free; } } else { @@ -826,19 +824,19 @@ static int pyra_probe(struct hid_device *hdev, const struct hid_device_id *id) retval = hid_parse(hdev); if (retval) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { - dev_err(&hdev->dev, "hw start 
failed\n"); + hid_err(hdev, "hw start failed\n"); goto exit; } retval = pyra_init_specials(hdev); if (retval) { - dev_err(&hdev->dev, "couldn't install mouse\n"); + hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c index a9d9b29ff47f..4bc7a93dc1f5 100644 --- a/drivers/hid/hid-roccat.c +++ b/drivers/hid/hid-roccat.c @@ -21,6 +21,8 @@ * It is inspired by hidraw, but uses only one circular buffer for all readers. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -165,8 +167,7 @@ static int roccat_open(struct inode *inode, struct file *file) mutex_lock(&device->readers_lock); if (!device) { - printk(KERN_EMERG "roccat device with minor %d doesn't exist\n", - minor); + pr_emerg("roccat device with minor %d doesn't exist\n", minor); error = -ENODEV; goto exit_err; } @@ -214,8 +215,7 @@ static int roccat_release(struct inode *inode, struct file *file) device = devices[minor]; if (!device) { mutex_unlock(&devices_lock); - printk(KERN_EMERG "roccat device with minor %d doesn't exist\n", - minor); + pr_emerg("roccat device with minor %d doesn't exist\n", minor); return -ENODEV; } @@ -392,7 +392,7 @@ static int __init roccat_init(void) roccat_major = MAJOR(dev_id); if (retval < 0) { - printk(KERN_WARNING "roccat: can't get major number\n"); + pr_warn("can't get major number\n"); return retval; } diff --git a/drivers/hid/hid-samsung.c b/drivers/hid/hid-samsung.c index 35894444e000..3c1fd8af5e0c 100644 --- a/drivers/hid/hid-samsung.c +++ b/drivers/hid/hid-samsung.c @@ -57,8 +57,8 @@ static inline void samsung_irda_dev_trace(struct hid_device *hdev, unsigned int rsize) { - dev_info(&hdev->dev, "fixing up Samsung IrDA %d byte report " - "descriptor\n", rsize); + hid_info(hdev, "fixing up Samsung IrDA %d byte report descriptor\n", + rsize); } static __u8 *samsung_irda_report_fixup(struct hid_device *hdev, __u8 *rdesc, @@ -160,7 +160,7 @@ static int samsung_probe(struct hid_device *hdev, ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } @@ -174,7 +174,7 @@ static int samsung_probe(struct hid_device *hdev, ret = hid_hw_start(hdev, cmask); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-sjoy.c b/drivers/hid/hid-sjoy.c index e10a7687ebf2..16f7cafc9695 100644 --- a/drivers/hid/hid-sjoy.c +++ b/drivers/hid/hid-sjoy.c @@ -74,26 +74,25 @@ static int sjoyff_init(struct hid_device *hid) int error; if (list_empty(report_list)) { - dev_err(&hid->dev, "no output reports found\n"); + hid_err(hid, "no output reports found\n"); return -ENODEV; } report_ptr = report_ptr->next; if (report_ptr == report_list) { - dev_err(&hid->dev, "required output report is " - "missing\n"); + hid_err(hid, "required output report is missing\n"); return -ENODEV; } report = list_entry(report_ptr, struct hid_report, list); if (report->maxfield < 1) { - dev_err(&hid->dev, "no fields in the report\n"); + hid_err(hid, "no fields in the report\n"); return -ENODEV; } if (report->field[0]->report_count < 3) { - dev_err(&hid->dev, "not enough values in the field\n"); + hid_err(hid, "not enough values in the field\n"); return -ENODEV; } @@ -117,8 +116,7 @@ static int sjoyff_init(struct hid_device *hid) sjoyff->report->field[0]->value[2] = 0x00; usbhid_submit_report(hid, sjoyff->report, USB_DIR_OUT); - dev_info(&hid->dev, - "Force feedback for SmartJoy PLUS PS2/USB 
adapter\n"); + hid_info(hid, "Force feedback for SmartJoy PLUS PS2/USB adapter\n"); return 0; } @@ -135,13 +133,13 @@ static int sjoy_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err; } diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 677bb3da10e8..68d7b36e31e4 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -40,8 +40,7 @@ static __u8 *sony_report_fixup(struct hid_device *hdev, __u8 *rdesc, if ((sc->quirks & VAIO_RDESC_CONSTANT) && *rsize >= 56 && rdesc[54] == 0x81 && rdesc[55] == 0x07) { - dev_info(&hdev->dev, "Fixing up Sony Vaio VGX report " - "descriptor\n"); + hid_info(hdev, "Fixing up Sony Vaio VGX report descriptor\n"); rdesc[55] = 0x06; } return rdesc; @@ -89,7 +88,7 @@ static int sixaxis_set_operational_usb(struct hid_device *hdev) (3 << 8) | 0xf2, ifnum, buf, 17, USB_CTRL_GET_TIMEOUT); if (ret < 0) - dev_err(&hdev->dev, "can't set operational mode\n"); + hid_err(hdev, "can't set operational mode\n"); kfree(buf); @@ -110,7 +109,7 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id) sc = kzalloc(sizeof(*sc), GFP_KERNEL); if (sc == NULL) { - dev_err(&hdev->dev, "can't alloc sony descriptor\n"); + hid_err(hdev, "can't alloc sony descriptor\n"); return -ENOMEM; } @@ -119,14 +118,14 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT | HID_CONNECT_HIDDEV_FORCE); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hid-stantum.c b/drivers/hid/hid-stantum.c index 3171be28c3d5..b2be1d11916b 100644 --- a/drivers/hid/hid-stantum.c +++ b/drivers/hid/hid-stantum.c @@ -222,7 +222,7 @@ static int stantum_probe(struct hid_device *hdev, sd = kmalloc(sizeof(struct stantum_data), GFP_KERNEL); if (!sd) { - dev_err(&hdev->dev, "cannot allocate Stantum data\n"); + hid_err(hdev, "cannot allocate Stantum data\n"); return -ENOMEM; } sd->valid = false; diff --git a/drivers/hid/hid-sunplus.c b/drivers/hid/hid-sunplus.c index 164ed568f6cf..d484a0043dd4 100644 --- a/drivers/hid/hid-sunplus.c +++ b/drivers/hid/hid-sunplus.c @@ -27,8 +27,7 @@ static __u8 *sp_report_fixup(struct hid_device *hdev, __u8 *rdesc, { if (*rsize >= 107 && rdesc[104] == 0x26 && rdesc[105] == 0x80 && rdesc[106] == 0x03) { - dev_info(&hdev->dev, "fixing up Sunplus Wireless Desktop " - "report descriptor\n"); + hid_info(hdev, "fixing up Sunplus Wireless Desktop report descriptor\n"); rdesc[105] = rdesc[110] = 0x03; rdesc[106] = rdesc[111] = 0x21; } diff --git a/drivers/hid/hid-tmff.c b/drivers/hid/hid-tmff.c index 15434c814793..356a98fcb365 100644 --- a/drivers/hid/hid-tmff.c +++ b/drivers/hid/hid-tmff.c @@ -151,28 +151,23 @@ static int tmff_init(struct hid_device *hid, const signed short *ff_bits) switch (field->usage[0].hid) { case THRUSTMASTER_USAGE_FF: if (field->report_count < 2) { - dev_warn(&hid->dev, "ignoring FF field " - "with report_count < 2\n"); + hid_warn(hid, "ignoring FF field with report_count < 2\n"); continue; } if (field->logical_maximum == field->logical_minimum) { - 
dev_warn(&hid->dev, "ignoring FF field " - "with logical_maximum " - "== logical_minimum\n"); + hid_warn(hid, "ignoring FF field with logical_maximum == logical_minimum\n"); continue; } if (tmff->report && tmff->report != report) { - dev_warn(&hid->dev, "ignoring FF field " - "in other report\n"); + hid_warn(hid, "ignoring FF field in other report\n"); continue; } if (tmff->ff_field && tmff->ff_field != field) { - dev_warn(&hid->dev, "ignoring " - "duplicate FF field\n"); + hid_warn(hid, "ignoring duplicate FF field\n"); continue; } @@ -185,16 +180,15 @@ static int tmff_init(struct hid_device *hid, const signed short *ff_bits) break; default: - dev_warn(&hid->dev, "ignoring unknown output " - "usage %08x\n", - field->usage[0].hid); + hid_warn(hid, "ignoring unknown output usage %08x\n", + field->usage[0].hid); continue; } } } if (!tmff->report) { - dev_err(&hid->dev, "can't find FF field in output reports\n"); + hid_err(hid, "can't find FF field in output reports\n"); error = -ENODEV; goto fail; } @@ -203,8 +197,7 @@ static int tmff_init(struct hid_device *hid, const signed short *ff_bits) if (error) goto fail; - dev_info(&hid->dev, "force feedback for ThrustMaster devices by Zinx " - "Verituse "); + hid_info(hid, "force feedback for ThrustMaster devices by Zinx Verituse \n"); return 0; fail: @@ -224,13 +217,13 @@ static int tm_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err; } diff --git a/drivers/hid/hid-wacom.c b/drivers/hid/hid-wacom.c index 94caae731381..06888323828c 100644 --- a/drivers/hid/hid-wacom.c +++ b/drivers/hid/hid-wacom.c @@ -18,6 +18,8 @@ * any later version. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -141,8 +143,8 @@ static void wacom_poke(struct hid_device *hdev, u8 speed) * Note that if the raw queries fail, it's not a hard failure and it * is safe to continue */ - dev_warn(&hdev->dev, "failed to poke device, command %d, err %d\n", - rep_data[0], ret); + hid_warn(hdev, "failed to poke device, command %d, err %d\n", + rep_data[0], ret); return; } @@ -312,7 +314,7 @@ static int wacom_probe(struct hid_device *hdev, wdata = kzalloc(sizeof(*wdata), GFP_KERNEL); if (wdata == NULL) { - dev_err(&hdev->dev, "can't alloc wacom descriptor\n"); + hid_err(hdev, "can't alloc wacom descriptor\n"); return -ENOMEM; } @@ -321,20 +323,20 @@ static int wacom_probe(struct hid_device *hdev, /* Parse the HID report now */ ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } ret = device_create_file(&hdev->dev, &dev_attr_speed); if (ret) - dev_warn(&hdev->dev, - "can't create sysfs speed attribute err: %d\n", ret); + hid_warn(hdev, + "can't create sysfs speed attribute err: %d\n", ret); /* Set Wacom mode 2 with high reporting speed */ wacom_poke(hdev, 1); @@ -349,8 +351,8 @@ static int wacom_probe(struct hid_device *hdev, ret = power_supply_register(&hdev->dev, &wdata->battery); if (ret) { - dev_warn(&hdev->dev, - "can't create sysfs battery attribute, err: %d\n", ret); + hid_warn(hdev, "can't create sysfs battery attribute, err: %d\n", + ret); /* * battery attribute is not critical for the tablet, but if it * failed then there is no need to create ac attribute @@ -367,8 +369,8 @@ static int wacom_probe(struct hid_device *hdev, ret = power_supply_register(&hdev->dev, &wdata->ac); if (ret) { - dev_warn(&hdev->dev, - "can't create ac battery attribute, err: %d\n", ret); + hid_warn(hdev, + "can't create ac battery attribute, err: %d\n", ret); /* * ac attribute is not critical for the tablet, but if it * failed then we don't want to battery attribute to exist @@ -454,7 +456,7 @@ static int __init wacom_init(void) ret = hid_register_driver(&wacom_driver); if (ret) - printk(KERN_ERR "can't register wacom driver\n"); + pr_err("can't register wacom driver\n"); return ret; } diff --git a/drivers/hid/hid-zpff.c b/drivers/hid/hid-zpff.c index b7acceabba80..f31fab012f2f 100644 --- a/drivers/hid/hid-zpff.c +++ b/drivers/hid/hid-zpff.c @@ -75,14 +75,14 @@ static int zpff_init(struct hid_device *hid) int error; if (list_empty(report_list)) { - dev_err(&hid->dev, "no output report found\n"); + hid_err(hid, "no output report found\n"); return -ENODEV; } report = list_entry(report_list->next, struct hid_report, list); if (report->maxfield < 4) { - dev_err(&hid->dev, "not enough fields in report\n"); + hid_err(hid, "not enough fields in report\n"); return -ENODEV; } @@ -105,8 +105,7 @@ static int zpff_init(struct hid_device *hid) zpff->report->field[3]->value[0] = 0x00; usbhid_submit_report(hid, zpff->report, USB_DIR_OUT); - dev_info(&hid->dev, "force feedback for Zeroplus based devices by " - "Anssi Hannula \n"); + hid_info(hid, "force feedback for Zeroplus based devices by Anssi Hannula \n"); return 0; } @@ -123,13 +122,13 @@ static int zp_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err; 
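Several of the hunks above (hid-prodikeys.c, hid-roccat.c, hid-wacom.c) add a pr_fmt() definition so that bare printk(KERN_ERR ...) calls in module init paths can be converted to pr_err()/pr_warn() while still carrying a module-name prefix. The following minimal module sketch illustrates that pattern; it is not part of the patch, and the module name and messages are invented for illustration only.

/* pr_fmt() must be defined before printk.h is pulled in, or the default (no prefix) is used */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/printk.h>

static int __init logdemo_init(void)
{
        /* Logged as "logdemo: can't register demo driver" */
        pr_err("can't register demo driver\n");
        return 0;
}

static void __exit logdemo_exit(void)
{
        pr_info("demo driver unloaded\n");
}

module_init(logdemo_init);
module_exit(logdemo_exit);
MODULE_LICENSE("GPL");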
} ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { - dev_err(&hdev->dev, "hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err; } diff --git a/drivers/hid/hid-zydacron.c b/drivers/hid/hid-zydacron.c index aac1f9273149..e90371508fd2 100644 --- a/drivers/hid/hid-zydacron.c +++ b/drivers/hid/hid-zydacron.c @@ -34,9 +34,8 @@ static __u8 *zc_report_fixup(struct hid_device *hdev, __u8 *rdesc, rdesc[0x96] == 0xbc && rdesc[0x97] == 0xff && rdesc[0xca] == 0xbc && rdesc[0xcb] == 0xff && rdesc[0xe1] == 0xbc && rdesc[0xe2] == 0xff) { - dev_info(&hdev->dev, - "fixing up zydacron remote control report " - "descriptor\n"); + hid_info(hdev, + "fixing up zydacron remote control report descriptor\n"); rdesc[0x96] = rdesc[0xca] = rdesc[0xe1] = 0x0c; rdesc[0x97] = rdesc[0xcb] = rdesc[0xe2] = 0x00; } @@ -172,7 +171,7 @@ static int zc_probe(struct hid_device *hdev, const struct hid_device_id *id) zc = kzalloc(sizeof(*zc), GFP_KERNEL); if (zc == NULL) { - dev_err(&hdev->dev, "zydacron: can't alloc descriptor\n"); + hid_err(hdev, "can't alloc descriptor\n"); return -ENOMEM; } @@ -180,13 +179,13 @@ static int zc_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = hid_parse(hdev); if (ret) { - dev_err(&hdev->dev, "zydacron: parse failed\n"); + hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { - dev_err(&hdev->dev, "zydacron: hw start failed\n"); + hid_err(hdev, "hw start failed\n"); goto err_free; } diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 5aefe73cf795..eb16cd143e2a 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -19,6 +19,8 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -123,15 +125,15 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t } if (count > HID_MAX_BUFFER_SIZE) { - printk(KERN_WARNING "hidraw: pid %d passed too large report\n", - task_pid_nr(current)); + hid_warn(dev, "pid %d passed too large report\n", + task_pid_nr(current)); ret = -EINVAL; goto out; } if (count < 2) { - printk(KERN_WARNING "hidraw: pid %d passed too short report\n", - task_pid_nr(current)); + hid_warn(dev, "pid %d passed too short report\n", + task_pid_nr(current)); ret = -EINVAL; goto out; } @@ -450,7 +452,7 @@ int __init hidraw_init(void) hidraw_major = MAJOR(dev_id); if (result < 0) { - printk(KERN_WARNING "hidraw: can't get major number\n"); + pr_warn("can't get major number\n"); result = 0; goto out; } diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 5489eab3a6bd..276758f53ab5 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -136,10 +136,10 @@ static void hid_reset(struct work_struct *work) hid_io_error(hid); break; default: - err_hid("can't reset device, %s-%s/input%d, status %d", - hid_to_usb_dev(hid)->bus->bus_name, - hid_to_usb_dev(hid)->devpath, - usbhid->ifnum, rc); + hid_err(hid, "can't reset device, %s-%s/input%d, status %d\n", + hid_to_usb_dev(hid)->bus->bus_name, + hid_to_usb_dev(hid)->devpath, + usbhid->ifnum, rc); /* FALLTHROUGH */ case -EHOSTUNREACH: case -ENODEV: @@ -278,18 +278,18 @@ static void hid_irq_in(struct urb *urb) hid_io_error(hid); return; default: /* error */ - dev_warn(&urb->dev->dev, "input irq status %d " - "received\n", urb->status); + hid_warn(urb->dev, "input irq status %d received\n", + urb->status); } status = usb_submit_urb(urb, GFP_ATOMIC); if (status) { 
clear_bit(HID_IN_RUNNING, &usbhid->iofl); if (status != -EPERM) { - err_hid("can't resubmit intr, %s-%s/input%d, status %d", - hid_to_usb_dev(hid)->bus->bus_name, - hid_to_usb_dev(hid)->devpath, - usbhid->ifnum, status); + hid_err(hid, "can't resubmit intr, %s-%s/input%d, status %d\n", + hid_to_usb_dev(hid)->bus->bus_name, + hid_to_usb_dev(hid)->devpath, + usbhid->ifnum, status); hid_io_error(hid); } } @@ -313,7 +313,7 @@ static int hid_submit_out(struct hid_device *hid) dbg_hid("submitting out urb\n"); if (usb_submit_urb(usbhid->urbout, GFP_ATOMIC)) { - err_hid("usb_submit_urb(out) failed"); + hid_err(hid, "usb_submit_urb(out) failed\n"); return -1; } usbhid->last_out = jiffies; @@ -375,7 +375,7 @@ static int hid_submit_ctrl(struct hid_device *hid) usbhid->cr->wValue, usbhid->cr->wIndex, usbhid->cr->wLength); if (usb_submit_urb(usbhid->urbctrl, GFP_ATOMIC)) { - err_hid("usb_submit_urb(ctrl) failed"); + hid_err(hid, "usb_submit_urb(ctrl) failed\n"); return -1; } usbhid->last_ctrl = jiffies; @@ -413,8 +413,8 @@ static void hid_irq_out(struct urb *urb) case -ENOENT: break; default: /* error */ - dev_warn(&urb->dev->dev, "output irq status %d " - "received\n", urb->status); + hid_warn(urb->dev, "output irq status %d received\n", + urb->status); } spin_lock_irqsave(&usbhid->lock, flags); @@ -466,8 +466,7 @@ static void hid_ctrl(struct urb *urb) case -EPIPE: /* report not available */ break; default: /* error */ - dev_warn(&urb->dev->dev, "ctrl urb status %d " - "received\n", status); + hid_warn(urb->dev, "ctrl urb status %d received\n", status); } if (unplug) @@ -501,13 +500,13 @@ static void __usbhid_submit_report(struct hid_device *hid, struct hid_report *re if (usbhid->urbout && dir == USB_DIR_OUT && report->type == HID_OUTPUT_REPORT) { if ((head = (usbhid->outhead + 1) & (HID_OUTPUT_FIFO_SIZE - 1)) == usbhid->outtail) { - dev_warn(&hid->dev, "output queue full\n"); + hid_warn(hid, "output queue full\n"); return; } usbhid->out[usbhid->outhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->out[usbhid->outhead].raw_report) { - dev_warn(&hid->dev, "output queueing failed\n"); + hid_warn(hid, "output queueing failed\n"); return; } hid_output_report(report, usbhid->out[usbhid->outhead].raw_report); @@ -532,14 +531,14 @@ static void __usbhid_submit_report(struct hid_device *hid, struct hid_report *re } if ((head = (usbhid->ctrlhead + 1) & (HID_CONTROL_FIFO_SIZE - 1)) == usbhid->ctrltail) { - dev_warn(&hid->dev, "control queue full\n"); + hid_warn(hid, "control queue full\n"); return; } if (dir == USB_DIR_OUT) { usbhid->ctrl[usbhid->ctrlhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->ctrl[usbhid->ctrlhead].raw_report) { - dev_warn(&hid->dev, "control queueing failed\n"); + hid_warn(hid, "control queueing failed\n"); return; } hid_output_report(report, usbhid->ctrl[usbhid->ctrlhead].raw_report); @@ -590,7 +589,7 @@ static int usb_hidinput_input_event(struct input_dev *dev, unsigned int type, un return -1; if ((offset = hidinput_find_field(hid, type, code, &field)) == -1) { - dev_warn(&dev->dev, "event field not found\n"); + hid_warn(dev, "event field not found\n"); return -1; } @@ -722,7 +721,7 @@ void usbhid_init_reports(struct hid_device *hid) } if (err) - dev_warn(&hid->dev, "timeout initializing reports\n"); + hid_warn(hid, "timeout initializing reports\n"); } /* @@ -1140,8 +1139,7 @@ static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id * if (usb_endpoint_is_int_in(&interface->endpoint[n].desc)) has_in++; if (!has_in) { - dev_err(&intf->dev, "couldn't 
find an input interrupt " - "endpoint\n"); + hid_err(intf, "couldn't find an input interrupt endpoint\n"); return -ENODEV; } @@ -1213,7 +1211,7 @@ static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id * ret = hid_add_device(hid); if (ret) { if (ret != -ENODEV) - dev_err(&intf->dev, "can't add hid device: %d\n", ret); + hid_err(intf, "can't add hid device: %d\n", ret); goto err_free; } diff --git a/drivers/hid/usbhid/hid-pidff.c b/drivers/hid/usbhid/hid-pidff.c index ef381d79cfa8..f91c136821f7 100644 --- a/drivers/hid/usbhid/hid-pidff.c +++ b/drivers/hid/usbhid/hid-pidff.c @@ -22,7 +22,7 @@ /* #define DEBUG */ -#define debug(format, arg...) pr_debug("hid-pidff: " format "\n" , ## arg) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -220,7 +220,7 @@ static int pidff_rescale_signed(int i, struct hid_field *field) static void pidff_set(struct pidff_usage *usage, u16 value) { usage->value[0] = pidff_rescale(value, 0xffff, usage->field); - debug("calculated from %d to %d", value, usage->value[0]); + pr_debug("calculated from %d to %d\n", value, usage->value[0]); } static void pidff_set_signed(struct pidff_usage *usage, s16 value) @@ -235,7 +235,7 @@ static void pidff_set_signed(struct pidff_usage *usage, s16 value) usage->value[0] = pidff_rescale(value, 0x7fff, usage->field); } - debug("calculated from %d to %d", value, usage->value[0]); + pr_debug("calculated from %d to %d\n", value, usage->value[0]); } /* @@ -259,8 +259,9 @@ static void pidff_set_envelope_report(struct pidff_device *pidff, pidff->set_envelope[PID_ATTACK_TIME].value[0] = envelope->attack_length; pidff->set_envelope[PID_FADE_TIME].value[0] = envelope->fade_length; - debug("attack %u => %d", envelope->attack_level, - pidff->set_envelope[PID_ATTACK_LEVEL].value[0]); + hid_dbg(pidff->hid, "attack %u => %d\n", + envelope->attack_level, + pidff->set_envelope[PID_ATTACK_LEVEL].value[0]); usbhid_submit_report(pidff->hid, pidff->reports[PID_SET_ENVELOPE], USB_DIR_OUT); @@ -466,33 +467,33 @@ static int pidff_request_effect_upload(struct pidff_device *pidff, int efnum) pidff->create_new_effect_type->value[0] = efnum; usbhid_submit_report(pidff->hid, pidff->reports[PID_CREATE_NEW_EFFECT], USB_DIR_OUT); - debug("create_new_effect sent, type: %d", efnum); + hid_dbg(pidff->hid, "create_new_effect sent, type: %d\n", efnum); pidff->block_load[PID_EFFECT_BLOCK_INDEX].value[0] = 0; pidff->block_load_status->value[0] = 0; usbhid_wait_io(pidff->hid); for (j = 0; j < 60; j++) { - debug("pid_block_load requested"); + hid_dbg(pidff->hid, "pid_block_load requested\n"); usbhid_submit_report(pidff->hid, pidff->reports[PID_BLOCK_LOAD], USB_DIR_IN); usbhid_wait_io(pidff->hid); if (pidff->block_load_status->value[0] == pidff->status_id[PID_BLOCK_LOAD_SUCCESS]) { - debug("device reported free memory: %d bytes", - pidff->block_load[PID_RAM_POOL_AVAILABLE].value ? - pidff->block_load[PID_RAM_POOL_AVAILABLE].value[0] : -1); + hid_dbg(pidff->hid, "device reported free memory: %d bytes\n", + pidff->block_load[PID_RAM_POOL_AVAILABLE].value ? + pidff->block_load[PID_RAM_POOL_AVAILABLE].value[0] : -1); return 0; } if (pidff->block_load_status->value[0] == pidff->status_id[PID_BLOCK_LOAD_FULL]) { - debug("not enough memory free: %d bytes", - pidff->block_load[PID_RAM_POOL_AVAILABLE].value ? + hid_dbg(pidff->hid, "not enough memory free: %d bytes\n", + pidff->block_load[PID_RAM_POOL_AVAILABLE].value ? 
pidff->block_load[PID_RAM_POOL_AVAILABLE].value[0] : -1); return -ENOSPC; } } - printk(KERN_ERR "hid-pidff: pid_block_load failed 60 times\n"); + hid_err(pidff->hid, "pid_block_load failed 60 times\n"); return -EIO; } @@ -546,7 +547,8 @@ static int pidff_erase_effect(struct input_dev *dev, int effect_id) struct pidff_device *pidff = dev->ff->private; int pid_id = pidff->pid_id[effect_id]; - debug("starting to erase %d/%d", effect_id, pidff->pid_id[effect_id]); + hid_dbg(pidff->hid, "starting to erase %d/%d\n", + effect_id, pidff->pid_id[effect_id]); /* Wait for the queue to clear. We do not want a full fifo to prevent the effect removal. */ usbhid_wait_io(pidff->hid); @@ -604,8 +606,7 @@ static int pidff_upload_effect(struct input_dev *dev, struct ff_effect *effect, type_id = PID_SAW_DOWN; break; default: - printk(KERN_ERR - "hid-pidff: invalid waveform\n"); + hid_err(pidff->hid, "invalid waveform\n"); return -EINVAL; } @@ -696,7 +697,7 @@ static int pidff_upload_effect(struct input_dev *dev, struct ff_effect *effect, break; default: - printk(KERN_ERR "hid-pidff: invalid type\n"); + hid_err(pidff->hid, "invalid type\n"); return -EINVAL; } @@ -704,7 +705,7 @@ static int pidff_upload_effect(struct input_dev *dev, struct ff_effect *effect, pidff->pid_id[effect->id] = pidff->block_load[PID_EFFECT_BLOCK_INDEX].value[0]; - debug("uploaded"); + hid_dbg(pidff->hid, "uploaded\n"); return 0; } @@ -770,14 +771,14 @@ static int pidff_find_fields(struct pidff_usage *usage, const u8 *table, for (i = 0; i < report->maxfield; i++) { if (report->field[i]->maxusage != report->field[i]->report_count) { - debug("maxusage and report_count do not match, " - "skipping"); + pr_debug("maxusage and report_count do not match, skipping\n"); continue; } for (j = 0; j < report->field[i]->maxusage; j++) { if (report->field[i]->usage[j].hid == (HID_UP_PID | table[k])) { - debug("found %d at %d->%d", k, i, j); + pr_debug("found %d at %d->%d\n", + k, i, j); usage[k].field = report->field[i]; usage[k].value = &report->field[i]->value[j]; @@ -789,7 +790,7 @@ static int pidff_find_fields(struct pidff_usage *usage, const u8 *table, break; } if (!found && strict) { - debug("failed to locate %d", k); + pr_debug("failed to locate %d\n", k); return -1; } } @@ -826,8 +827,8 @@ static void pidff_find_reports(struct hid_device *hid, int report_type, continue; ret = pidff_check_usage(report->field[0]->logical); if (ret != -1) { - debug("found usage 0x%02x from field->logical", - pidff_reports[ret]); + hid_dbg(hid, "found usage 0x%02x from field->logical\n", + pidff_reports[ret]); pidff->reports[ret] = report; continue; } @@ -845,8 +846,9 @@ static void pidff_find_reports(struct hid_device *hid, int report_type, continue; ret = pidff_check_usage(hid->collection[i - 1].usage); if (ret != -1 && !pidff->reports[ret]) { - debug("found usage 0x%02x from collection array", - pidff_reports[ret]); + hid_dbg(hid, + "found usage 0x%02x from collection array\n", + pidff_reports[ret]); pidff->reports[ret] = report; } } @@ -861,7 +863,7 @@ static int pidff_reports_ok(struct pidff_device *pidff) for (i = 0; i <= PID_REQUIRED_REPORTS; i++) { if (!pidff->reports[i]) { - debug("%d missing", i); + hid_dbg(pidff->hid, "%d missing\n", i); return 0; } } @@ -884,8 +886,7 @@ static struct hid_field *pidff_find_special_field(struct hid_report *report, report->field[i]->logical_minimum == 1) return report->field[i]; else { - printk(KERN_ERR "hid-pidff: logical_minimum " - "is not 1 as it should be\n"); + pr_err("logical_minimum is not 1 as it should be\n"); 
return NULL; } } @@ -924,7 +925,7 @@ static int pidff_find_special_keys(int *keys, struct hid_field *fld, */ static int pidff_find_special_fields(struct pidff_device *pidff) { - debug("finding special fields"); + hid_dbg(pidff->hid, "finding special fields\n"); pidff->create_new_effect_type = pidff_find_special_field(pidff->reports[PID_CREATE_NEW_EFFECT], @@ -945,32 +946,30 @@ static int pidff_find_special_fields(struct pidff_device *pidff) pidff_find_special_field(pidff->reports[PID_EFFECT_OPERATION], 0x78, 1); - debug("search done"); + hid_dbg(pidff->hid, "search done\n"); if (!pidff->create_new_effect_type || !pidff->set_effect_type) { - printk(KERN_ERR "hid-pidff: effect lists not found\n"); + hid_err(pidff->hid, "effect lists not found\n"); return -1; } if (!pidff->effect_direction) { - printk(KERN_ERR "hid-pidff: direction field not found\n"); + hid_err(pidff->hid, "direction field not found\n"); return -1; } if (!pidff->device_control) { - printk(KERN_ERR "hid-pidff: device control field not found\n"); + hid_err(pidff->hid, "device control field not found\n"); return -1; } if (!pidff->block_load_status) { - printk(KERN_ERR - "hid-pidff: block load status field not found\n"); + hid_err(pidff->hid, "block load status field not found\n"); return -1; } if (!pidff->effect_operation_status) { - printk(KERN_ERR - "hid-pidff: effect operation field not found\n"); + hid_err(pidff->hid, "effect operation field not found\n"); return -1; } @@ -982,23 +981,22 @@ static int pidff_find_special_fields(struct pidff_device *pidff) if (!PIDFF_FIND_SPECIAL_KEYS(type_id, create_new_effect_type, effect_types)) { - printk(KERN_ERR "hid-pidff: no effect types found\n"); + hid_err(pidff->hid, "no effect types found\n"); return -1; } if (PIDFF_FIND_SPECIAL_KEYS(status_id, block_load_status, block_load_status) != sizeof(pidff_block_load_status)) { - printk(KERN_ERR - "hidpidff: block load status identifiers not found\n"); + hid_err(pidff->hid, + "block load status identifiers not found\n"); return -1; } if (PIDFF_FIND_SPECIAL_KEYS(operation_id, effect_operation_status, effect_operation_status) != sizeof(pidff_effect_operation_status)) { - printk(KERN_ERR - "hidpidff: effect operation identifiers not found\n"); + hid_err(pidff->hid, "effect operation identifiers not found\n"); return -1; } @@ -1017,8 +1015,8 @@ static int pidff_find_effects(struct pidff_device *pidff, int pidff_type = pidff->type_id[i]; if (pidff->set_effect_type->usage[pidff_type].hid != pidff->create_new_effect_type->usage[pidff_type].hid) { - printk(KERN_ERR "hid-pidff: " - "effect type number %d is invalid\n", i); + hid_err(pidff->hid, + "effect type number %d is invalid\n", i); return -1; } } @@ -1073,27 +1071,23 @@ static int pidff_init_fields(struct pidff_device *pidff, struct input_dev *dev) int envelope_ok = 0; if (PIDFF_FIND_FIELDS(set_effect, PID_SET_EFFECT, 1)) { - printk(KERN_ERR - "hid-pidff: unknown set_effect report layout\n"); + hid_err(pidff->hid, "unknown set_effect report layout\n"); return -ENODEV; } PIDFF_FIND_FIELDS(block_load, PID_BLOCK_LOAD, 0); if (!pidff->block_load[PID_EFFECT_BLOCK_INDEX].value) { - printk(KERN_ERR - "hid-pidff: unknown pid_block_load report layout\n"); + hid_err(pidff->hid, "unknown pid_block_load report layout\n"); return -ENODEV; } if (PIDFF_FIND_FIELDS(effect_operation, PID_EFFECT_OPERATION, 1)) { - printk(KERN_ERR - "hid-pidff: unknown effect_operation report layout\n"); + hid_err(pidff->hid, "unknown effect_operation report layout\n"); return -ENODEV; } if (PIDFF_FIND_FIELDS(block_free, 
PID_BLOCK_FREE, 1)) { - printk(KERN_ERR - "hid-pidff: unknown pid_block_free report layout\n"); + hid_err(pidff->hid, "unknown pid_block_free report layout\n"); return -ENODEV; } @@ -1105,27 +1099,26 @@ static int pidff_init_fields(struct pidff_device *pidff, struct input_dev *dev) if (!envelope_ok) { if (test_and_clear_bit(FF_CONSTANT, dev->ffbit)) - printk(KERN_WARNING "hid-pidff: " - "has constant effect but no envelope\n"); + hid_warn(pidff->hid, + "has constant effect but no envelope\n"); if (test_and_clear_bit(FF_RAMP, dev->ffbit)) - printk(KERN_WARNING "hid-pidff: " - "has ramp effect but no envelope\n"); + hid_warn(pidff->hid, + "has ramp effect but no envelope\n"); if (test_and_clear_bit(FF_PERIODIC, dev->ffbit)) - printk(KERN_WARNING "hid-pidff: " - "has periodic effect but no envelope\n"); + hid_warn(pidff->hid, + "has periodic effect but no envelope\n"); } if (test_bit(FF_CONSTANT, dev->ffbit) && PIDFF_FIND_FIELDS(set_constant, PID_SET_CONSTANT, 1)) { - printk(KERN_WARNING - "hid-pidff: unknown constant effect layout\n"); + hid_warn(pidff->hid, "unknown constant effect layout\n"); clear_bit(FF_CONSTANT, dev->ffbit); } if (test_bit(FF_RAMP, dev->ffbit) && PIDFF_FIND_FIELDS(set_ramp, PID_SET_RAMP, 1)) { - printk(KERN_WARNING "hid-pidff: unknown ramp effect layout\n"); + hid_warn(pidff->hid, "unknown ramp effect layout\n"); clear_bit(FF_RAMP, dev->ffbit); } @@ -1134,8 +1127,7 @@ static int pidff_init_fields(struct pidff_device *pidff, struct input_dev *dev) test_bit(FF_FRICTION, dev->ffbit) || test_bit(FF_INERTIA, dev->ffbit)) && PIDFF_FIND_FIELDS(set_condition, PID_SET_CONDITION, 1)) { - printk(KERN_WARNING - "hid-pidff: unknown condition effect layout\n"); + hid_warn(pidff->hid, "unknown condition effect layout\n"); clear_bit(FF_SPRING, dev->ffbit); clear_bit(FF_DAMPER, dev->ffbit); clear_bit(FF_FRICTION, dev->ffbit); @@ -1144,8 +1136,7 @@ static int pidff_init_fields(struct pidff_device *pidff, struct input_dev *dev) if (test_bit(FF_PERIODIC, dev->ffbit) && PIDFF_FIND_FIELDS(set_periodic, PID_SET_PERIODIC, 1)) { - printk(KERN_WARNING - "hid-pidff: unknown periodic effect layout\n"); + hid_warn(pidff->hid, "unknown periodic effect layout\n"); clear_bit(FF_PERIODIC, dev->ffbit); } @@ -1184,12 +1175,12 @@ static void pidff_reset(struct pidff_device *pidff) if (pidff->pool[PID_SIMULTANEOUS_MAX].value) { while (pidff->pool[PID_SIMULTANEOUS_MAX].value[0] < 2) { if (i++ > 20) { - printk(KERN_WARNING "hid-pidff: device reports " - "%d simultaneous effects\n", - pidff->pool[PID_SIMULTANEOUS_MAX].value[0]); + hid_warn(pidff->hid, + "device reports %d simultaneous effects\n", + pidff->pool[PID_SIMULTANEOUS_MAX].value[0]); break; } - debug("pid_pool requested again"); + hid_dbg(pidff->hid, "pid_pool requested again\n"); usbhid_submit_report(hid, pidff->reports[PID_POOL], USB_DIR_IN); usbhid_wait_io(hid); @@ -1215,7 +1206,7 @@ static int pidff_check_autocenter(struct pidff_device *pidff, error = pidff_request_effect_upload(pidff, 1); if (error) { - printk(KERN_ERR "hid-pidff: upload request failed\n"); + hid_err(pidff->hid, "upload request failed\n"); return error; } @@ -1224,8 +1215,8 @@ static int pidff_check_autocenter(struct pidff_device *pidff, pidff_autocenter(pidff, 0xffff); set_bit(FF_AUTOCENTER, dev->ffbit); } else { - printk(KERN_NOTICE "hid-pidff: " - "device has unknown autocenter control method\n"); + hid_notice(pidff->hid, + "device has unknown autocenter control method\n"); } pidff_erase_pid(pidff, @@ -1248,10 +1239,10 @@ int hid_pidff_init(struct hid_device *hid) int 
max_effects; int error; - debug("starting pid init"); + hid_dbg(hid, "starting pid init\n"); if (list_empty(&hid->report_enum[HID_OUTPUT_REPORT].report_list)) { - debug("not a PID device, no output report"); + hid_dbg(hid, "not a PID device, no output report\n"); return -ENODEV; } @@ -1265,7 +1256,7 @@ int hid_pidff_init(struct hid_device *hid) pidff_find_reports(hid, HID_FEATURE_REPORT, pidff); if (!pidff_reports_ok(pidff)) { - debug("reports not ok, aborting"); + hid_dbg(hid, "reports not ok, aborting\n"); error = -ENODEV; goto fail; } @@ -1278,8 +1269,8 @@ int hid_pidff_init(struct hid_device *hid) if (test_bit(FF_GAIN, dev->ffbit)) { pidff_set(&pidff->device_gain[PID_DEVICE_GAIN_FIELD], 0xffff); - usbhid_submit_report(pidff->hid, pidff->reports[PID_DEVICE_GAIN], - USB_DIR_OUT); + usbhid_submit_report(hid, pidff->reports[PID_DEVICE_GAIN], + USB_DIR_OUT); } error = pidff_check_autocenter(pidff, dev); @@ -1290,23 +1281,23 @@ int hid_pidff_init(struct hid_device *hid) pidff->block_load[PID_EFFECT_BLOCK_INDEX].field->logical_maximum - pidff->block_load[PID_EFFECT_BLOCK_INDEX].field->logical_minimum + 1; - debug("max effects is %d", max_effects); + hid_dbg(hid, "max effects is %d\n", max_effects); if (max_effects > PID_EFFECTS_MAX) max_effects = PID_EFFECTS_MAX; if (pidff->pool[PID_SIMULTANEOUS_MAX].value) - debug("max simultaneous effects is %d", - pidff->pool[PID_SIMULTANEOUS_MAX].value[0]); + hid_dbg(hid, "max simultaneous effects is %d\n", + pidff->pool[PID_SIMULTANEOUS_MAX].value[0]); if (pidff->pool[PID_RAM_POOL_SIZE].value) - debug("device memory size is %d bytes", - pidff->pool[PID_RAM_POOL_SIZE].value[0]); + hid_dbg(hid, "device memory size is %d bytes\n", + pidff->pool[PID_RAM_POOL_SIZE].value[0]); if (pidff->pool[PID_DEVICE_MANAGED_POOL].value && pidff->pool[PID_DEVICE_MANAGED_POOL].value[0] == 0) { - printk(KERN_NOTICE "hid-pidff: " - "device does not support device managed pool\n"); + hid_notice(hid, + "device does not support device managed pool\n"); goto fail; } @@ -1322,8 +1313,7 @@ int hid_pidff_init(struct hid_device *hid) ff->set_autocenter = pidff_set_autocenter; ff->playback = pidff_playback; - printk(KERN_INFO "Force feedback for USB HID PID devices by " - "Anssi Hannula \n"); + hid_info(dev, "Force feedback for USB HID PID devices by Anssi Hannula \n"); return 0; diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index a9935f6709f7..fb78f75d49a9 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -906,7 +906,7 @@ int hiddev_connect(struct hid_device *hid, unsigned int force) hiddev->exist = 1; retval = usb_register_dev(usbhid->intf, &hiddev_class); if (retval) { - err_hid("Not able to get a minor for this device."); + hid_err(hid, "Not able to get a minor for this device\n"); hid->hiddev = NULL; kfree(hiddev); return -1; diff --git a/drivers/hid/usbhid/usbkbd.c b/drivers/hid/usbhid/usbkbd.c index a948605564fb..065817329f03 100644 --- a/drivers/hid/usbhid/usbkbd.c +++ b/drivers/hid/usbhid/usbkbd.c @@ -24,6 +24,8 @@ * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -104,16 +106,18 @@ static void usb_kbd_irq(struct urb *urb) if (usb_kbd_keycode[kbd->old[i]]) input_report_key(kbd->dev, usb_kbd_keycode[kbd->old[i]], 0); else - dev_info(&urb->dev->dev, - "Unknown key (scancode %#x) released.\n", kbd->old[i]); + hid_info(urb->dev, + "Unknown key (scancode %#x) released.\n", + kbd->old[i]); } if (kbd->new[i] > 3 && memscan(kbd->old + 2, 
kbd->new[i], 6) == kbd->old + 8) { if (usb_kbd_keycode[kbd->new[i]]) input_report_key(kbd->dev, usb_kbd_keycode[kbd->new[i]], 1); else - dev_info(&urb->dev->dev, - "Unknown key (scancode %#x) released.\n", kbd->new[i]); + hid_info(urb->dev, + "Unknown key (scancode %#x) released.\n", + kbd->new[i]); } } @@ -124,9 +128,9 @@ static void usb_kbd_irq(struct urb *urb) resubmit: i = usb_submit_urb (urb, GFP_ATOMIC); if (i) - err_hid ("can't resubmit intr, %s-%s/input0, status %d", - kbd->usbdev->bus->bus_name, - kbd->usbdev->devpath, i); + hid_err(urb->dev, "can't resubmit intr, %s-%s/input0, status %d", + kbd->usbdev->bus->bus_name, + kbd->usbdev->devpath, i); } static int usb_kbd_event(struct input_dev *dev, unsigned int type, @@ -150,7 +154,7 @@ static int usb_kbd_event(struct input_dev *dev, unsigned int type, *(kbd->leds) = kbd->newleds; kbd->led->dev = kbd->usbdev; if (usb_submit_urb(kbd->led, GFP_ATOMIC)) - err_hid("usb_submit_urb(leds) failed"); + pr_err("usb_submit_urb(leds) failed\n"); return 0; } @@ -160,7 +164,7 @@ static void usb_kbd_led(struct urb *urb) struct usb_kbd *kbd = urb->context; if (urb->status) - dev_warn(&urb->dev->dev, "led urb status %d received\n", + hid_warn(urb->dev, "led urb status %d received\n", urb->status); if (*(kbd->leds) == kbd->newleds) @@ -169,7 +173,7 @@ static void usb_kbd_led(struct urb *urb) *(kbd->leds) = kbd->newleds; kbd->led->dev = kbd->usbdev; if (usb_submit_urb(kbd->led, GFP_ATOMIC)) - err_hid("usb_submit_urb(leds) failed"); + hid_err(urb->dev, "usb_submit_urb(leds) failed\n"); } static int usb_kbd_open(struct input_dev *dev) diff --git a/include/linux/hid.h b/include/linux/hid.h index e2af195d8b46..20b9801f669b 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -881,12 +881,32 @@ int hid_pidff_init(struct hid_device *hid); #define hid_pidff_init NULL #endif -#define dbg_hid(format, arg...) if (hid_debug) \ - printk(KERN_DEBUG "%s: " format ,\ - __FILE__ , ## arg) -#define err_hid(format, arg...) printk(KERN_ERR "%s: " format "\n" , \ - __FILE__ , ## arg) -#endif /* HID_FF */ +#define dbg_hid(format, arg...) \ +do { \ + if (hid_debug) \ + printk(KERN_DEBUG "%s: " format, __FILE__, ##arg); \ +} while (0) + +#define hid_printk(level, hid, fmt, arg...) \ + dev_printk(level, &(hid)->dev, fmt, ##arg) +#define hid_emerg(hid, fmt, arg...) \ + dev_emerg(&(hid)->dev, fmt, ##arg) +#define hid_crit(hid, fmt, arg...) \ + dev_crit(&(hid)->dev, fmt, ##arg) +#define hid_alert(hid, fmt, arg...) \ + dev_alert(&(hid)->dev, fmt, ##arg) +#define hid_err(hid, fmt, arg...) \ + dev_err(&(hid)->dev, fmt, ##arg) +#define hid_notice(hid, fmt, arg...) \ + dev_notice(&(hid)->dev, fmt, ##arg) +#define hid_warn(hid, fmt, arg...) \ + dev_warn(&(hid)->dev, fmt, ##arg) +#define hid_info(hid, fmt, arg...) \ + dev_info(&(hid)->dev, fmt, ##arg) +#define hid_dbg(hid, fmt, arg...) \ + dev_dbg(&(hid)->dev, fmt, ##arg) + +#endif /* __KERNEL__ */ #endif -- cgit v1.2.3-71-gd317 From 6610e0893b8bc6f59b14fed7f089c5997f035f88 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 23 Sep 2010 15:07:34 -0700 Subject: RTC: Rework RTC code to use timerqueue for events This patch reworks a large portion of the generic RTC code to in-effect virtualize the rtc interrupt code. The current RTC interface is very much a raw hardware interface. 
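The hid_err(), hid_warn(), hid_info() and hid_dbg() wrappers added to include/linux/hid.h in the hunk above expand to the matching dev_*() helpers on &(hid)->dev, so callers pass the struct hid_device directly instead of spelling out &hdev->dev at every call site. A hypothetical probe function written against these helpers, mirroring the conversions made throughout this series (the function name and messages are illustrative only, and the hid_driver registration boilerplate is omitted):

#include <linux/hid.h>
#include <linux/module.h>

static int example_probe(struct hid_device *hdev,
                         const struct hid_device_id *id)
{
        int ret;

        ret = hid_parse(hdev);
        if (ret) {
                /* Previously: dev_err(&hdev->dev, "parse failed\n"); */
                hid_err(hdev, "parse failed\n");
                return ret;
        }

        ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT);
        if (ret) {
                hid_err(hdev, "hw start failed\n");
                return ret;
        }

        hid_info(hdev, "example device bound\n");
        return 0;
}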
Via the proc, /dev/, or sysfs interfaces, applications can set the hardware to trigger interrupts in one of three modes: AIE: Alarm interrupt UIE: Update interrupt (i.e. once per second) PIE: Periodic interrupt (sub-second irqs) The problem with this interface is that it limits the RTC hardware so it can only be used by one application at a time. The purpose of this patch is to extend the RTC code so that we can multiplex multiple applications' event needs onto a single RTC device. This is done by utilizing the timerqueue infrastructure to manage a list of events, which cause the RTC hardware to be programmed to fire an interrupt for the next event in the list. In order to preserve the functionality of the existing proc, /dev/ and sysfs interfaces, we emulate the different interrupt modes as follows: AIE: We create an rtc_timer dedicated to AIE mode interrupts. There is only one per device, so we don't change existing interface semantics. UIE: Again, a dedicated rtc_timer, set for periodic mode, is used to emulate UIE interrupts. Again, only one per device. PIE: Since PIE mode interrupts fire faster than the RTC's clock read granularity, we emulate PIE mode interrupts using an hrtimer. Again, one per device. With this patch, the rtctest.c application in Documentation/rtc.txt passes fine on x86 hardware. However, there may very well still be bugs, so I'd greatly appreciate any feedback or testing! Signed-off-by: John Stultz LKML Reference: <1290136329-18291-4-git-send-email-john.stultz@linaro.org> Acked-by: Alessandro Zummo Reviewed-by: Thomas Gleixner CC: Alessandro Zummo CC: Thomas Gleixner CC: Richard Cochran --- drivers/rtc/class.c | 13 ++ drivers/rtc/interface.c | 574 +++++++++++++++++++++++++++++------------------- drivers/rtc/rtc-lib.c | 28 +++ include/linux/rtc.h | 43 +++- 4 files changed, 428 insertions(+), 230 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 565562ba6ac9..95d2b82762d6 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "rtc-core.h" @@ -152,6 +153,18 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, spin_lock_init(&rtc->irq_task_lock); init_waitqueue_head(&rtc->irq_queue); + /* Init timerqueue */ + timerqueue_init_head(&rtc->timerqueue); + INIT_WORK(&rtc->irqwork, rtctimer_do_work); + /* Init aie timer */ + rtctimer_init(&rtc->aie_timer, rtc_aie_update_irq, (void *)rtc); + /* Init uie timer */ + rtctimer_init(&rtc->uie_rtctimer, rtc_uie_update_irq, (void *)rtc); + /* Init pie timer */ + hrtimer_init(&rtc->pie_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rtc->pie_timer.function = rtc_pie_update_irq; + rtc->pie_enabled = 0; + strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE); dev_set_name(&rtc->dev, "rtc%d", id); diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index a0c816238aa9..c81c50b497b7 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -14,15 +14,11 @@ #include #include #include +#include -int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) +static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; - - err = mutex_lock_interruptible(&rtc->ops_lock); - if (err) - return err; - if (!rtc->ops) err = -ENODEV; else if (!rtc->ops->read_time) @@ -31,7 +27,18 @@ int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) memset(tm, 0, sizeof(struct rtc_time)); err = rtc->ops->read_time(rtc->dev.parent, tm); } + return err; +} + +int rtc_read_time(struct
rtc_device *rtc, struct rtc_time *tm) +{ + int err; + err = mutex_lock_interruptible(&rtc->ops_lock); + if (err) + return err; + + err = __rtc_read_time(rtc, tm); mutex_unlock(&rtc->ops_lock); return err; } @@ -106,188 +113,54 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) } EXPORT_SYMBOL_GPL(rtc_set_mmss); -static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; - - if (rtc->ops == NULL) - err = -ENODEV; - else if (!rtc->ops->read_alarm) - err = -EINVAL; - else { - memset(alarm, 0, sizeof(struct rtc_wkalrm)); - err = rtc->ops->read_alarm(rtc->dev.parent, alarm); - } - + alarm->enabled = rtc->aie_timer.enabled; + if (alarm->enabled) + alarm->time = rtc_ktime_to_tm(rtc->aie_timer.node.expires); mutex_unlock(&rtc->ops_lock); - return err; + + return 0; } +EXPORT_SYMBOL_GPL(rtc_read_alarm); -int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { + struct rtc_time tm; + long now, scheduled; int err; - struct rtc_time before, now; - int first_time = 1; - unsigned long t_now, t_alm; - enum { none, day, month, year } missing = none; - unsigned days; - - /* The lower level RTC driver may return -1 in some fields, - * creating invalid alarm->time values, for reasons like: - * - * - The hardware may not be capable of filling them in; - * many alarms match only on time-of-day fields, not - * day/month/year calendar data. - * - * - Some hardware uses illegal values as "wildcard" match - * values, which non-Linux firmware (like a BIOS) may try - * to set up as e.g. "alarm 15 minutes after each hour". - * Linux uses only oneshot alarms. - * - * When we see that here, we deal with it by using values from - * a current RTC timestamp for any missing (-1) values. The - * RTC driver prevents "periodic alarm" modes. - * - * But this can be racey, because some fields of the RTC timestamp - * may have wrapped in the interval since we read the RTC alarm, - * which would lead to us inserting inconsistent values in place - * of the -1 fields. - * - * Reading the alarm and timestamp in the reverse sequence - * would have the same race condition, and not solve the issue. - * - * So, we must first read the RTC timestamp, - * then read the RTC alarm value, - * and then read a second RTC timestamp. - * - * If any fields of the second timestamp have changed - * when compared with the first timestamp, then we know - * our timestamp may be inconsistent with that used by - * the low-level rtc_read_alarm_internal() function. - * - * So, when the two timestamps disagree, we just loop and do - * the process again to get a fully consistent set of values. - * - * This could all instead be done in the lower level driver, - * but since more than one lower level RTC implementation needs it, - * then it's probably best best to do it here instead of there.. 
- */ - /* Get the "before" timestamp */ - err = rtc_read_time(rtc, &before); - if (err < 0) + err = rtc_valid_tm(&alarm->time); + if (err) return err; - do { - if (!first_time) - memcpy(&before, &now, sizeof(struct rtc_time)); - first_time = 0; - - /* get the RTC alarm values, which may be incomplete */ - err = rtc_read_alarm_internal(rtc, alarm); - if (err) - return err; - if (!alarm->enabled) - return 0; - - /* full-function RTCs won't have such missing fields */ - if (rtc_valid_tm(&alarm->time) == 0) - return 0; - - /* get the "after" timestamp, to detect wrapped fields */ - err = rtc_read_time(rtc, &now); - if (err < 0) - return err; - - /* note that tm_sec is a "don't care" value here: */ - } while ( before.tm_min != now.tm_min - || before.tm_hour != now.tm_hour - || before.tm_mon != now.tm_mon - || before.tm_year != now.tm_year); - - /* Fill in the missing alarm fields using the timestamp; we - * know there's at least one since alarm->time is invalid. - */ - if (alarm->time.tm_sec == -1) - alarm->time.tm_sec = now.tm_sec; - if (alarm->time.tm_min == -1) - alarm->time.tm_min = now.tm_min; - if (alarm->time.tm_hour == -1) - alarm->time.tm_hour = now.tm_hour; - - /* For simplicity, only support date rollover for now */ - if (alarm->time.tm_mday == -1) { - alarm->time.tm_mday = now.tm_mday; - missing = day; - } - if (alarm->time.tm_mon == -1) { - alarm->time.tm_mon = now.tm_mon; - if (missing == none) - missing = month; - } - if (alarm->time.tm_year == -1) { - alarm->time.tm_year = now.tm_year; - if (missing == none) - missing = year; - } - - /* with luck, no rollover is needed */ - rtc_tm_to_time(&now, &t_now); - rtc_tm_to_time(&alarm->time, &t_alm); - if (t_now < t_alm) - goto done; - - switch (missing) { + rtc_tm_to_time(&alarm->time, &scheduled); - /* 24 hour rollover ... if it's now 10am Monday, an alarm that - * that will trigger at 5am will do so at 5am Tuesday, which - * could also be in the next month or year. This is a common - * case, especially for PCs. - */ - case day: - dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day"); - t_alm += 24 * 60 * 60; - rtc_time_to_tm(t_alm, &alarm->time); - break; - - /* Month rollover ... if it's the 31th, an alarm on the 3rd will - * be next month. An alarm matching on the 30th, 29th, or 28th - * may end up in the month after that! Many newer PCs support - * this type of alarm. + /* Make sure we're not setting alarms in the past */ + err = __rtc_read_time(rtc, &tm); + rtc_tm_to_time(&tm, &now); + if (scheduled <= now) + return -ETIME; + /* + * XXX - We just checked to make sure the alarm time is not + * in the past, but there is still a race window where if + * the is alarm set for the next second and the second ticks + * over right here, before we set the alarm. */ - case month: - dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month"); - do { - if (alarm->time.tm_mon < 11) - alarm->time.tm_mon++; - else { - alarm->time.tm_mon = 0; - alarm->time.tm_year++; - } - days = rtc_month_days(alarm->time.tm_mon, - alarm->time.tm_year); - } while (days < alarm->time.tm_mday); - break; - - /* Year rollover ... easy except for leap years! 
*/ - case year: - dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year"); - do { - alarm->time.tm_year++; - } while (rtc_valid_tm(&alarm->time) != 0); - break; - - default: - dev_warn(&rtc->dev, "alarm rollover not handled\n"); - } -done: - return 0; + if (!rtc->ops) + err = -ENODEV; + else if (!rtc->ops->set_alarm) + err = -EINVAL; + else + err = rtc->ops->set_alarm(rtc->dev.parent, alarm); + + return err; } -EXPORT_SYMBOL_GPL(rtc_read_alarm); int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { @@ -300,16 +173,18 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; - - if (!rtc->ops) - err = -ENODEV; - else if (!rtc->ops->set_alarm) - err = -EINVAL; - else - err = rtc->ops->set_alarm(rtc->dev.parent, alarm); - + if (rtc->aie_timer.enabled) { + rtctimer_remove(rtc, &rtc->aie_timer); + rtc->aie_timer.enabled = 0; + } + rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); + rtc->aie_timer.period = ktime_set(0, 0); + if (alarm->enabled) { + rtc->aie_timer.enabled = 1; + rtctimer_enqueue(rtc, &rtc->aie_timer); + } mutex_unlock(&rtc->ops_lock); - return err; + return 0; } EXPORT_SYMBOL_GPL(rtc_set_alarm); @@ -319,6 +194,16 @@ int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) if (err) return err; + if (rtc->aie_timer.enabled != enabled) { + if (enabled) { + rtc->aie_timer.enabled = 1; + rtctimer_enqueue(rtc, &rtc->aie_timer); + } else { + rtctimer_remove(rtc, &rtc->aie_timer); + rtc->aie_timer.enabled = 0; + } + } + if (!rtc->ops) err = -ENODEV; else if (!rtc->ops->alarm_irq_enable) @@ -337,52 +222,53 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) if (err) return err; -#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL - if (enabled == 0 && rtc->uie_irq_active) { - mutex_unlock(&rtc->ops_lock); - return rtc_dev_update_irq_enable_emul(rtc, enabled); + /* make sure we're changing state */ + if (rtc->uie_rtctimer.enabled == enabled) + goto out; + + if (enabled) { + struct rtc_time tm; + ktime_t now, onesec; + + __rtc_read_time(rtc, &tm); + onesec = ktime_set(1, 0); + now = rtc_tm_to_ktime(tm); + rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); + rtc->uie_rtctimer.period = ktime_set(1, 0); + rtc->uie_rtctimer.enabled = 1; + rtctimer_enqueue(rtc, &rtc->uie_rtctimer); + } else { + rtctimer_remove(rtc, &rtc->uie_rtctimer); + rtc->uie_rtctimer.enabled = 0; } -#endif - - if (!rtc->ops) - err = -ENODEV; - else if (!rtc->ops->update_irq_enable) - err = -EINVAL; - else - err = rtc->ops->update_irq_enable(rtc->dev.parent, enabled); +out: mutex_unlock(&rtc->ops_lock); - -#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL - /* - * Enable emulation if the driver did not provide - * the update_irq_enable function pointer or if returned - * -EINVAL to signal that it has been configured without - * interrupts or that are not available at the moment. - */ - if (err == -EINVAL) - err = rtc_dev_update_irq_enable_emul(rtc, enabled); -#endif return err; + } EXPORT_SYMBOL_GPL(rtc_update_irq_enable); + /** - * rtc_update_irq - report RTC periodic, alarm, and/or update irqs - * @rtc: the rtc device - * @num: how many irqs are being reported (usually one) - * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF - * Context: any + * rtc_handle_legacy_irq - AIE, UIE and PIE event hook + * @rtc: pointer to the rtc device + * + * This function is called when an AIE, UIE or PIE mode interrupt + * has occured (or been emulated). 
+ * + * Triggers the registered irq_task function callback. */ -void rtc_update_irq(struct rtc_device *rtc, - unsigned long num, unsigned long events) +static void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode) { unsigned long flags; + /* mark one irq of the appropriate mode */ spin_lock_irqsave(&rtc->irq_lock, flags); - rtc->irq_data = (rtc->irq_data + (num << 8)) | events; + rtc->irq_data = (rtc->irq_data + (num << 8)) | (RTC_IRQF|mode); spin_unlock_irqrestore(&rtc->irq_lock, flags); + /* call the task func */ spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task) rtc->irq_task->func(rtc->irq_task->private_data); @@ -391,6 +277,69 @@ void rtc_update_irq(struct rtc_device *rtc, wake_up_interruptible(&rtc->irq_queue); kill_fasync(&rtc->async_queue, SIGIO, POLL_IN); } + + +/** + * rtc_aie_update_irq - AIE mode rtctimer hook + * @private: pointer to the rtc_device + * + * This functions is called when the aie_timer expires. + */ +void rtc_aie_update_irq(void *private) +{ + struct rtc_device *rtc = (struct rtc_device *)private; + rtc_handle_legacy_irq(rtc, 1, RTC_AF); +} + + +/** + * rtc_uie_update_irq - UIE mode rtctimer hook + * @private: pointer to the rtc_device + * + * This functions is called when the uie_timer expires. + */ +void rtc_uie_update_irq(void *private) +{ + struct rtc_device *rtc = (struct rtc_device *)private; + rtc_handle_legacy_irq(rtc, 1, RTC_UF); +} + + +/** + * rtc_pie_update_irq - PIE mode hrtimer hook + * @timer: pointer to the pie mode hrtimer + * + * This function is used to emulate PIE mode interrupts + * using an hrtimer. This function is called when the periodic + * hrtimer expires. + */ +enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer) +{ + struct rtc_device *rtc; + ktime_t period; + int count; + rtc = container_of(timer, struct rtc_device, pie_timer); + + period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); + count = hrtimer_forward_now(timer, period); + + rtc_handle_legacy_irq(rtc, count, RTC_PF); + + return HRTIMER_RESTART; +} + +/** + * rtc_update_irq - Triggered when a RTC interrupt occurs. 
+ * @rtc: the rtc device + * @num: how many irqs are being reported (usually one) + * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF + * Context: any + */ +void rtc_update_irq(struct rtc_device *rtc, + unsigned long num, unsigned long events) +{ + schedule_work(&rtc->irqwork); +} EXPORT_SYMBOL_GPL(rtc_update_irq); static int __rtc_match(struct device *dev, void *data) @@ -477,18 +426,20 @@ int rtc_irq_set_state(struct rtc_device *rtc, struct rtc_task *task, int enabled int err = 0; unsigned long flags; - if (rtc->ops->irq_set_state == NULL) - return -ENXIO; - spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task != NULL && task == NULL) err = -EBUSY; if (rtc->irq_task != task) err = -EACCES; - spin_unlock_irqrestore(&rtc->irq_task_lock, flags); - if (err == 0) - err = rtc->ops->irq_set_state(rtc->dev.parent, enabled); + if (enabled) { + ktime_t period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); + hrtimer_start(&rtc->pie_timer, period, HRTIMER_MODE_REL); + } else { + hrtimer_cancel(&rtc->pie_timer); + } + rtc->pie_enabled = enabled; + spin_unlock_irqrestore(&rtc->irq_task_lock, flags); return err; } @@ -509,21 +460,194 @@ int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq) int err = 0; unsigned long flags; - if (rtc->ops->irq_set_freq == NULL) - return -ENXIO; - spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task != NULL && task == NULL) err = -EBUSY; if (rtc->irq_task != task) err = -EACCES; - spin_unlock_irqrestore(&rtc->irq_task_lock, flags); - if (err == 0) { - err = rtc->ops->irq_set_freq(rtc->dev.parent, freq); - if (err == 0) - rtc->irq_freq = freq; + rtc->irq_freq = freq; + if (rtc->pie_enabled) { + ktime_t period; + hrtimer_cancel(&rtc->pie_timer); + period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); + hrtimer_start(&rtc->pie_timer, period, + HRTIMER_MODE_REL); + } } + spin_unlock_irqrestore(&rtc->irq_task_lock, flags); return err; } EXPORT_SYMBOL_GPL(rtc_irq_set_freq); + +/** + * rtctimer_enqueue - Adds a rtc_timer to the rtc_device timerqueue + * @rtc rtc device + * @timer timer being added. + * + * Enqueues a timer onto the rtc devices timerqueue and sets + * the next alarm event appropriately. + * + * Must hold ops_lock for proper serialization of timerqueue + */ +void rtctimer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) +{ + timerqueue_add(&rtc->timerqueue, &timer->node); + if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) { + struct rtc_wkalrm alarm; + int err; + alarm.time = rtc_ktime_to_tm(timer->node.expires); + alarm.enabled = 1; + err = __rtc_set_alarm(rtc, &alarm); + if (err == -ETIME) + schedule_work(&rtc->irqwork); + } +} + +/** + * rtctimer_remove - Removes a rtc_timer from the rtc_device timerqueue + * @rtc rtc device + * @timer timer being removed. + * + * Removes a timer onto the rtc devices timerqueue and sets + * the next alarm event appropriately. 
+ * + * Must hold ops_lock for proper serialization of timerqueue + */ +void rtctimer_remove(struct rtc_device *rtc, struct rtc_timer *timer) +{ + struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); + timerqueue_del(&rtc->timerqueue, &timer->node); + + if (next == &timer->node) { + struct rtc_wkalrm alarm; + int err; + next = timerqueue_getnext(&rtc->timerqueue); + if (!next) + return; + alarm.time = rtc_ktime_to_tm(next->expires); + alarm.enabled = 1; + err = __rtc_set_alarm(rtc, &alarm); + if (err == -ETIME) + schedule_work(&rtc->irqwork); + } +} + +/** + * rtctimer_do_work - Expires rtc timers + * @rtc rtc device + * @timer timer being removed. + * + * Expires rtc timers. Reprograms next alarm event if needed. + * Called via worktask. + * + * Serializes access to timerqueue via ops_lock mutex + */ +void rtctimer_do_work(struct work_struct *work) +{ + struct rtc_timer *timer; + struct timerqueue_node *next; + ktime_t now; + struct rtc_time tm; + + struct rtc_device *rtc = + container_of(work, struct rtc_device, irqwork); + + mutex_lock(&rtc->ops_lock); +again: + __rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + while ((next = timerqueue_getnext(&rtc->timerqueue))) { + if (next->expires.tv64 > now.tv64) + break; + + /* expire timer */ + timer = container_of(next, struct rtc_timer, node); + timerqueue_del(&rtc->timerqueue, &timer->node); + timer->enabled = 0; + if (timer->task.func) + timer->task.func(timer->task.private_data); + + /* Re-add/fwd periodic timers */ + if (ktime_to_ns(timer->period)) { + timer->node.expires = ktime_add(timer->node.expires, + timer->period); + timer->enabled = 1; + timerqueue_add(&rtc->timerqueue, &timer->node); + } + } + + /* Set next alarm */ + if (next) { + struct rtc_wkalrm alarm; + int err; + alarm.time = rtc_ktime_to_tm(next->expires); + alarm.enabled = 1; + err = __rtc_set_alarm(rtc, &alarm); + if (err == -ETIME) + goto again; + } + + mutex_unlock(&rtc->ops_lock); +} + + +/* rtctimer_init - Initializes an rtc_timer + * @timer: timer to be intiialized + * @f: function pointer to be called when timer fires + * @data: private data passed to function pointer + * + * Kernel interface to initializing an rtc_timer. 
+ */ +void rtctimer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) +{ + timerqueue_init(&timer->node); + timer->enabled = 0; + timer->task.func = f; + timer->task.private_data = data; +} + +/* rtctimer_start - Sets an rtc_timer to fire in the future + * @ rtc: rtc device to be used + * @ timer: timer being set + * @ expires: time at which to expire the timer + * @ period: period that the timer will recur + * + * Kernel interface to set an rtc_timer + */ +int rtctimer_start(struct rtc_device *rtc, struct rtc_timer* timer, + ktime_t expires, ktime_t period) +{ + int ret = 0; + mutex_lock(&rtc->ops_lock); + if (timer->enabled) + rtctimer_remove(rtc, timer); + + timer->node.expires = expires; + timer->period = period; + + timer->enabled = 1; + rtctimer_enqueue(rtc, timer); + + mutex_unlock(&rtc->ops_lock); + return ret; +} + +/* rtctimer_cancel - Stops an rtc_timer + * @ rtc: rtc device to be used + * @ timer: timer being set + * + * Kernel interface to cancel an rtc_timer + */ +int rtctimer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) +{ + int ret = 0; + mutex_lock(&rtc->ops_lock); + if (timer->enabled) + rtctimer_remove(rtc, timer); + timer->enabled = 0; + mutex_unlock(&rtc->ops_lock); + return ret; +} + + diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c index 773851f338b8..075f1708deae 100644 --- a/drivers/rtc/rtc-lib.c +++ b/drivers/rtc/rtc-lib.c @@ -117,4 +117,32 @@ int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time) } EXPORT_SYMBOL(rtc_tm_to_time); +/* + * Convert rtc_time to ktime + */ +ktime_t rtc_tm_to_ktime(struct rtc_time tm) +{ + time_t time; + rtc_tm_to_time(&tm, &time); + return ktime_set(time, 0); +} +EXPORT_SYMBOL_GPL(rtc_tm_to_ktime); + +/* + * Convert ktime to rtc_time + */ +struct rtc_time rtc_ktime_to_tm(ktime_t kt) +{ + struct timespec ts; + struct rtc_time ret; + + ts = ktime_to_timespec(kt); + /* Round up any ns */ + if (ts.tv_nsec) + ts.tv_sec++; + rtc_time_to_tm(ts.tv_sec, &ret); + return ret; +} +EXPORT_SYMBOL_GPL(rtc_ktime_to_tm); + MODULE_LICENSE("GPL"); diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 14dbc83ded20..a3421abca703 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -107,12 +107,17 @@ extern int rtc_year_days(unsigned int day, unsigned int month, unsigned int year extern int rtc_valid_tm(struct rtc_time *tm); extern int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time); extern void rtc_time_to_tm(unsigned long time, struct rtc_time *tm); +ktime_t rtc_tm_to_ktime(struct rtc_time tm); +struct rtc_time rtc_ktime_to_tm(ktime_t kt); + #include #include #include #include #include +#include +#include extern struct class *rtc_class; @@ -151,7 +156,19 @@ struct rtc_class_ops { }; #define RTC_DEVICE_NAME_SIZE 20 -struct rtc_task; +typedef struct rtc_task { + void (*func)(void *private_data); + void *private_data; +} rtc_task_t; + + +struct rtc_timer { + struct rtc_task task; + struct timerqueue_node node; + ktime_t period; + int enabled; +}; + /* flags */ #define RTC_DEV_BUSY 0 @@ -179,6 +196,15 @@ struct rtc_device spinlock_t irq_task_lock; int irq_freq; int max_user_freq; + + struct timerqueue_head timerqueue; + struct rtc_timer aie_timer; + struct rtc_timer uie_rtctimer; + struct hrtimer pie_timer; /* sub second exp, so needs hrtimer */ + int pie_enabled; + struct work_struct irqwork; + + #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL struct work_struct uie_task; struct timer_list uie_timer; @@ -224,15 +250,22 @@ extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern 
int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, unsigned int enabled); -typedef struct rtc_task { - void (*func)(void *private_data); - void *private_data; -} rtc_task_t; +void rtc_aie_update_irq(void *private); +void rtc_uie_update_irq(void *private); +enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer); int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); +void rtctimer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); +void rtctimer_remove(struct rtc_device *rtc, struct rtc_timer *timer); +void rtctimer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); +int rtctimer_start(struct rtc_device *rtc, struct rtc_timer* timer, + ktime_t expires, ktime_t period); +int rtctimer_cancel(struct rtc_device *rtc, struct rtc_timer* timer); +void rtctimer_do_work(struct work_struct *work); + static inline bool is_leap_year(unsigned int year) { return (!(year % 4) && (year % 100)) || !(year % 400); -- cgit v1.2.3-71-gd317 From 042620a018afcfba1d678062b62e463b9e43a68d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Oct 2010 16:22:33 -0700 Subject: RTC: Remove UIE emulation Since we provide UIE interrupts via a rtc_timer, the old emulation code can be removed. Signed-off-by: John Stultz LKML Reference: <1290136329-18291-5-git-send-email-john.stultz@linaro.org> Acked-by: Alessandro Zummo Reviewed-by: Thomas Gleixner CC: Alessandro Zummo CC: Thomas Gleixner CC: Richard Cochran --- drivers/rtc/rtc-dev.c | 104 -------------------------------------------------- include/linux/rtc.h | 12 ------ 2 files changed, 116 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c index 62227cd52410..212b16edafc0 100644 --- a/drivers/rtc/rtc-dev.c +++ b/drivers/rtc/rtc-dev.c @@ -46,105 +46,6 @@ static int rtc_dev_open(struct inode *inode, struct file *file) return err; } -#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL -/* - * Routine to poll RTC seconds field for change as often as possible, - * after first RTC_UIE use timer to reduce polling - */ -static void rtc_uie_task(struct work_struct *work) -{ - struct rtc_device *rtc = - container_of(work, struct rtc_device, uie_task); - struct rtc_time tm; - int num = 0; - int err; - - err = rtc_read_time(rtc, &tm); - - spin_lock_irq(&rtc->irq_lock); - if (rtc->stop_uie_polling || err) { - rtc->uie_task_active = 0; - } else if (rtc->oldsecs != tm.tm_sec) { - num = (tm.tm_sec + 60 - rtc->oldsecs) % 60; - rtc->oldsecs = tm.tm_sec; - rtc->uie_timer.expires = jiffies + HZ - (HZ/10); - rtc->uie_timer_active = 1; - rtc->uie_task_active = 0; - add_timer(&rtc->uie_timer); - } else if (schedule_work(&rtc->uie_task) == 0) { - rtc->uie_task_active = 0; - } - spin_unlock_irq(&rtc->irq_lock); - if (num) - rtc_update_irq(rtc, num, RTC_UF | RTC_IRQF); -} -static void rtc_uie_timer(unsigned long data) -{ - struct rtc_device *rtc = (struct rtc_device *)data; - unsigned long flags; - - spin_lock_irqsave(&rtc->irq_lock, flags); - rtc->uie_timer_active = 0; - rtc->uie_task_active = 1; - if ((schedule_work(&rtc->uie_task) == 0)) - rtc->uie_task_active = 0; - spin_unlock_irqrestore(&rtc->irq_lock, flags); -} - -static int clear_uie(struct rtc_device *rtc) -{ - spin_lock_irq(&rtc->irq_lock); - if (rtc->uie_irq_active) { - rtc->stop_uie_polling = 1; - if (rtc->uie_timer_active) { - spin_unlock_irq(&rtc->irq_lock); - del_timer_sync(&rtc->uie_timer); - spin_lock_irq(&rtc->irq_lock); - rtc->uie_timer_active = 0; - } - if 
(rtc->uie_task_active) { - spin_unlock_irq(&rtc->irq_lock); - flush_scheduled_work(); - spin_lock_irq(&rtc->irq_lock); - } - rtc->uie_irq_active = 0; - } - spin_unlock_irq(&rtc->irq_lock); - return 0; -} - -static int set_uie(struct rtc_device *rtc) -{ - struct rtc_time tm; - int err; - - err = rtc_read_time(rtc, &tm); - if (err) - return err; - spin_lock_irq(&rtc->irq_lock); - if (!rtc->uie_irq_active) { - rtc->uie_irq_active = 1; - rtc->stop_uie_polling = 0; - rtc->oldsecs = tm.tm_sec; - rtc->uie_task_active = 1; - if (schedule_work(&rtc->uie_task) == 0) - rtc->uie_task_active = 0; - } - rtc->irq_data = 0; - spin_unlock_irq(&rtc->irq_lock); - return 0; -} - -int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, unsigned int enabled) -{ - if (enabled) - return set_uie(rtc); - else - return clear_uie(rtc); -} -EXPORT_SYMBOL(rtc_dev_update_irq_enable_emul); - -#endif /* CONFIG_RTC_INTF_DEV_UIE_EMUL */ static ssize_t rtc_dev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -493,11 +394,6 @@ void rtc_dev_prepare(struct rtc_device *rtc) rtc->dev.devt = MKDEV(MAJOR(rtc_devt), rtc->id); -#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL - INIT_WORK(&rtc->uie_task, rtc_uie_task); - setup_timer(&rtc->uie_timer, rtc_uie_timer, (unsigned long)rtc); -#endif - cdev_init(&rtc->char_dev, &rtc_dev_fops); rtc->char_dev.owner = rtc->owner; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index a3421abca703..44e18a2523a3 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -203,18 +203,6 @@ struct rtc_device struct hrtimer pie_timer; /* sub second exp, so needs hrtimer */ int pie_enabled; struct work_struct irqwork; - - -#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL - struct work_struct uie_task; - struct timer_list uie_timer; - /* Those fields are protected by rtc->irq_lock */ - unsigned int oldsecs; - unsigned int uie_irq_active:1; - unsigned int stop_uie_polling:1; - unsigned int uie_task_active:1; - unsigned int uie_timer_active:1; -#endif }; #define to_rtc_device(d) container_of(d, struct rtc_device, dev) -- cgit v1.2.3-71-gd317 From 96c8f06a0fb359a9a89701a7afab6d837e466ab0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Dec 2010 22:45:48 +0100 Subject: rtc: Namespace fixup rtctimer_* is already occupied by sound/core/rtctimer.c. Instead of fiddling with that, rename the new functions to rtc_timer_* which reads nicer anyway. 
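As a usage illustration only (not part of this patch), the sketch below arms a one-shot timer through the renamed interface. The example_* function and callback names are hypothetical; rtc_timer_init(), rtc_timer_start(), rtc_read_time() and rtc_tm_to_ktime() are the helpers this series introduces, and expiry times are absolute RTC times expressed as ktime.

/*
 * Hedged sketch: hypothetical in-kernel user of the renamed rtc_timer API.
 * Arms a one-shot timer roughly ten seconds past the current RTC time.
 */
static void example_rtc_timer_fn(void *private)
{
	struct rtc_device *rtc = private;

	dev_info(&rtc->dev, "example rtc_timer expired\n");
}

static int example_arm_oneshot(struct rtc_device *rtc, struct rtc_timer *timer)
{
	struct rtc_time tm;
	ktime_t expires;
	int err;

	rtc_timer_init(timer, example_rtc_timer_fn, rtc);

	err = rtc_read_time(rtc, &tm);
	if (err)
		return err;

	/* expires is an absolute RTC time; a zero period means one-shot */
	expires = ktime_add(rtc_tm_to_ktime(tm), ktime_set(10, 0));
	return rtc_timer_start(rtc, timer, expires, ktime_set(0, 0));
}
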
Signed-off-by: Thomas Gleixner Cc: John Stultz --- drivers/rtc/class.c | 6 +++--- drivers/rtc/interface.c | 42 +++++++++++++++++++++--------------------- include/linux/rtc.h | 12 ++++++------ 3 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 95d2b82762d6..3243832a17cd 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -155,11 +155,11 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, /* Init timerqueue */ timerqueue_init_head(&rtc->timerqueue); - INIT_WORK(&rtc->irqwork, rtctimer_do_work); + INIT_WORK(&rtc->irqwork, rtc_timer_do_work); /* Init aie timer */ - rtctimer_init(&rtc->aie_timer, rtc_aie_update_irq, (void *)rtc); + rtc_timer_init(&rtc->aie_timer, rtc_aie_update_irq, (void *)rtc); /* Init uie timer */ - rtctimer_init(&rtc->uie_rtctimer, rtc_uie_update_irq, (void *)rtc); + rtc_timer_init(&rtc->uie_rtctimer, rtc_uie_update_irq, (void *)rtc); /* Init pie timer */ hrtimer_init(&rtc->pie_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rtc->pie_timer.function = rtc_pie_update_irq; diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index c81c50b497b7..90384b9f6b2c 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -174,14 +174,14 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) if (err) return err; if (rtc->aie_timer.enabled) { - rtctimer_remove(rtc, &rtc->aie_timer); + rtc_timer_remove(rtc, &rtc->aie_timer); rtc->aie_timer.enabled = 0; } rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = ktime_set(0, 0); if (alarm->enabled) { rtc->aie_timer.enabled = 1; - rtctimer_enqueue(rtc, &rtc->aie_timer); + rtc_timer_enqueue(rtc, &rtc->aie_timer); } mutex_unlock(&rtc->ops_lock); return 0; @@ -197,9 +197,9 @@ int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) if (rtc->aie_timer.enabled != enabled) { if (enabled) { rtc->aie_timer.enabled = 1; - rtctimer_enqueue(rtc, &rtc->aie_timer); + rtc_timer_enqueue(rtc, &rtc->aie_timer); } else { - rtctimer_remove(rtc, &rtc->aie_timer); + rtc_timer_remove(rtc, &rtc->aie_timer); rtc->aie_timer.enabled = 0; } } @@ -236,9 +236,9 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); rtc->uie_rtctimer.period = ktime_set(1, 0); rtc->uie_rtctimer.enabled = 1; - rtctimer_enqueue(rtc, &rtc->uie_rtctimer); + rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); } else { - rtctimer_remove(rtc, &rtc->uie_rtctimer); + rtc_timer_remove(rtc, &rtc->uie_rtctimer); rtc->uie_rtctimer.enabled = 0; } @@ -481,7 +481,7 @@ int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq) EXPORT_SYMBOL_GPL(rtc_irq_set_freq); /** - * rtctimer_enqueue - Adds a rtc_timer to the rtc_device timerqueue + * rtc_timer_enqueue - Adds a rtc_timer to the rtc_device timerqueue * @rtc rtc device * @timer timer being added. 
* @@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(rtc_irq_set_freq); * * Must hold ops_lock for proper serialization of timerqueue */ -void rtctimer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) +void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) { timerqueue_add(&rtc->timerqueue, &timer->node); if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) { @@ -505,7 +505,7 @@ void rtctimer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) } /** - * rtctimer_remove - Removes a rtc_timer from the rtc_device timerqueue + * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue * @rtc rtc device * @timer timer being removed. * @@ -514,7 +514,7 @@ void rtctimer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) * * Must hold ops_lock for proper serialization of timerqueue */ -void rtctimer_remove(struct rtc_device *rtc, struct rtc_timer *timer) +void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) { struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); timerqueue_del(&rtc->timerqueue, &timer->node); @@ -534,7 +534,7 @@ void rtctimer_remove(struct rtc_device *rtc, struct rtc_timer *timer) } /** - * rtctimer_do_work - Expires rtc timers + * rtc_timer_do_work - Expires rtc timers * @rtc rtc device * @timer timer being removed. * @@ -543,7 +543,7 @@ void rtctimer_remove(struct rtc_device *rtc, struct rtc_timer *timer) * * Serializes access to timerqueue via ops_lock mutex */ -void rtctimer_do_work(struct work_struct *work) +void rtc_timer_do_work(struct work_struct *work) { struct rtc_timer *timer; struct timerqueue_node *next; @@ -592,14 +592,14 @@ again: } -/* rtctimer_init - Initializes an rtc_timer +/* rtc_timer_init - Initializes an rtc_timer * @timer: timer to be intiialized * @f: function pointer to be called when timer fires * @data: private data passed to function pointer * * Kernel interface to initializing an rtc_timer. 
*/ -void rtctimer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) +void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) { timerqueue_init(&timer->node); timer->enabled = 0; @@ -607,7 +607,7 @@ void rtctimer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) timer->task.private_data = data; } -/* rtctimer_start - Sets an rtc_timer to fire in the future +/* rtc_timer_start - Sets an rtc_timer to fire in the future * @ rtc: rtc device to be used * @ timer: timer being set * @ expires: time at which to expire the timer @@ -615,36 +615,36 @@ void rtctimer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) * * Kernel interface to set an rtc_timer */ -int rtctimer_start(struct rtc_device *rtc, struct rtc_timer* timer, +int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, ktime_t expires, ktime_t period) { int ret = 0; mutex_lock(&rtc->ops_lock); if (timer->enabled) - rtctimer_remove(rtc, timer); + rtc_timer_remove(rtc, timer); timer->node.expires = expires; timer->period = period; timer->enabled = 1; - rtctimer_enqueue(rtc, timer); + rtc_timer_enqueue(rtc, timer); mutex_unlock(&rtc->ops_lock); return ret; } -/* rtctimer_cancel - Stops an rtc_timer +/* rtc_timer_cancel - Stops an rtc_timer * @ rtc: rtc device to be used * @ timer: timer being set * * Kernel interface to cancel an rtc_timer */ -int rtctimer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) +int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) { int ret = 0; mutex_lock(&rtc->ops_lock); if (timer->enabled) - rtctimer_remove(rtc, timer); + rtc_timer_remove(rtc, timer); timer->enabled = 0; mutex_unlock(&rtc->ops_lock); return ret; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 44e18a2523a3..3c995b4d742c 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -246,13 +246,13 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); -void rtctimer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); -void rtctimer_remove(struct rtc_device *rtc, struct rtc_timer *timer); -void rtctimer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); -int rtctimer_start(struct rtc_device *rtc, struct rtc_timer* timer, +void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); +void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); +void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); +int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, ktime_t expires, ktime_t period); -int rtctimer_cancel(struct rtc_device *rtc, struct rtc_timer* timer); -void rtctimer_do_work(struct work_struct *work); +int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer); +void rtc_timer_do_work(struct work_struct *work); static inline bool is_leap_year(unsigned int year) { -- cgit v1.2.3-71-gd317 From c9aa308fd5c373faeda588cfb02b04f116904613 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 7 Dec 2010 10:22:29 +0800 Subject: Add CPER PCIe error section structure and constants definition On some machine, PCIe error is reported via APEI (ACPI Platform Error Interface). The error data is passed from firmware to Linux via CPER PCIe error section structure. This patch adds CPER PCIe error section structure and constants definition. 
Signed-off-by: Huang Ying Signed-off-by: Len Brown --- include/linux/cper.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cper.h b/include/linux/cper.h index bf972f81e2a7..3104aaff5dd0 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -39,10 +39,12 @@ * Severity difinition for error_severity in struct cper_record_header * and section_severity in struct cper_section_descriptor */ -#define CPER_SEV_RECOVERABLE 0x0 -#define CPER_SEV_FATAL 0x1 -#define CPER_SEV_CORRECTED 0x2 -#define CPER_SEV_INFORMATIONAL 0x3 +enum { + CPER_SEV_RECOVERABLE, + CPER_SEV_FATAL, + CPER_SEV_CORRECTED, + CPER_SEV_INFORMATIONAL, +}; /* * Validation bits difinition for validation_bits in struct @@ -201,6 +203,47 @@ UUID_LE(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ 0xDF, 0xAA, 0x84, 0xEC) +#define CPER_PROC_VALID_TYPE 0x0001 +#define CPER_PROC_VALID_ISA 0x0002 +#define CPER_PROC_VALID_ERROR_TYPE 0x0004 +#define CPER_PROC_VALID_OPERATION 0x0008 +#define CPER_PROC_VALID_FLAGS 0x0010 +#define CPER_PROC_VALID_LEVEL 0x0020 +#define CPER_PROC_VALID_VERSION 0x0040 +#define CPER_PROC_VALID_BRAND_INFO 0x0080 +#define CPER_PROC_VALID_ID 0x0100 +#define CPER_PROC_VALID_TARGET_ADDRESS 0x0200 +#define CPER_PROC_VALID_REQUESTOR_ID 0x0400 +#define CPER_PROC_VALID_RESPONDER_ID 0x0800 +#define CPER_PROC_VALID_IP 0x1000 + +#define CPER_MEM_VALID_ERROR_STATUS 0x0001 +#define CPER_MEM_VALID_PHYSICAL_ADDRESS 0x0002 +#define CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK 0x0004 +#define CPER_MEM_VALID_NODE 0x0008 +#define CPER_MEM_VALID_CARD 0x0010 +#define CPER_MEM_VALID_MODULE 0x0020 +#define CPER_MEM_VALID_BANK 0x0040 +#define CPER_MEM_VALID_DEVICE 0x0080 +#define CPER_MEM_VALID_ROW 0x0100 +#define CPER_MEM_VALID_COLUMN 0x0200 +#define CPER_MEM_VALID_BIT_POSITION 0x0400 +#define CPER_MEM_VALID_REQUESTOR_ID 0x0800 +#define CPER_MEM_VALID_RESPONDER_ID 0x1000 +#define CPER_MEM_VALID_TARGET_ID 0x2000 +#define CPER_MEM_VALID_ERROR_TYPE 0x4000 + +#define CPER_PCIE_VALID_PORT_TYPE 0x0001 +#define CPER_PCIE_VALID_VERSION 0x0002 +#define CPER_PCIE_VALID_COMMAND_STATUS 0x0004 +#define CPER_PCIE_VALID_DEVICE_ID 0x0008 +#define CPER_PCIE_VALID_SERIAL_NUMBER 0x0010 +#define CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS 0x0020 +#define CPER_PCIE_VALID_CAPABILITY 0x0040 +#define CPER_PCIE_VALID_AER_INFO 0x0080 + +#define CPER_PCIE_SLOT_SHIFT 3 + /* * All tables and structs must be byte-packed to match CPER * specification, since the tables are provided by the system BIOS @@ -306,6 +349,41 @@ struct cper_sec_mem_err { __u8 error_type; }; +struct cper_sec_pcie { + __u64 validation_bits; + __u32 port_type; + struct { + __u8 minor; + __u8 major; + __u8 reserved[2]; + } version; + __u16 command; + __u16 status; + __u32 reserved; + struct { + __u16 vendor_id; + __u16 device_id; + __u8 class_code[3]; + __u8 function; + __u8 device; + __u16 segment; + __u8 bus; + __u8 secondary_bus; + __u16 slot; + __u8 reserved; + } device_id; + struct { + __u32 lower; + __u32 upper; + } serial_number; + struct { + __u16 secondary_status; + __u16 control; + } bridge; + __u8 capability[60]; + __u8 aer_info[96]; +}; + /* Reset to default packing */ #pragma pack() -- cgit v1.2.3-71-gd317 From 16f4232ce4d6855361b4eb56262f4a202295c978 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Wed, 8 Dec 2010 10:10:16 +0800 Subject: IPMI: Add one interface to get more info of low-level IPMI device The IPMI smi_watcher will be used to catch the IPMI interface as they 
come or go. In order to communicate with the correct IPMI device, it should be confirmed whether it is what we wanted especially on the system with multiple IPMI devices. But the new_smi callback function of smi_watcher provides very limited info(only the interface number and dev pointer) and there is no detailed info about the low level interface. For example: which mechansim registers the IPMI interface(ACPI, PCI, DMI and so on). This is to add one interface that can get more info of low-level IPMI device. For example: the ACPI device handle will be returned for the pnp_acpi IPMI device. Signed-off-by: Zhao Yakui Signed-off-by: Corey Minyard Signed-off-by: Len Brown --- drivers/char/ipmi/ipmi_msghandler.c | 27 ++++++++++++++++++++++++++ drivers/char/ipmi/ipmi_si_intf.c | 20 +++++++++++++++---- include/linux/ipmi.h | 38 +++++++++++++++++++++++++++++++++++++ include/linux/ipmi_smi.h | 8 ++++++++ 4 files changed, 89 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 2fe72f8edf44..38223e93aa98 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -970,6 +970,33 @@ out_kfree: } EXPORT_SYMBOL(ipmi_create_user); +int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data) +{ + int rv = 0; + ipmi_smi_t intf; + struct ipmi_smi_handlers *handlers; + + mutex_lock(&ipmi_interfaces_mutex); + list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { + if (intf->intf_num == if_num) + goto found; + } + /* Not found, return an error */ + rv = -EINVAL; + mutex_unlock(&ipmi_interfaces_mutex); + return rv; + +found: + handlers = intf->handlers; + rv = -ENOSYS; + if (handlers->get_smi_info) + rv = handlers->get_smi_info(intf->send_info, data); + mutex_unlock(&ipmi_interfaces_mutex); + + return rv; +} +EXPORT_SYMBOL(ipmi_get_smi_info); + static void free_user(struct kref *ref) { ipmi_user_t user = container_of(ref, struct ipmi_user, refcount); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 035da9e64a17..945ae4d5d21f 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include "ipmi_si_sm.h" @@ -107,10 +108,6 @@ enum si_type { }; static char *si_to_str[] = { "kcs", "smic", "bt" }; -enum ipmi_addr_src { - SI_INVALID = 0, SI_HOTMOD, SI_HARDCODED, SI_SPMI, SI_ACPI, SI_SMBIOS, - SI_PCI, SI_DEVICETREE, SI_DEFAULT -}; static char *ipmi_addr_src_to_str[] = { NULL, "hotmod", "hardcoded", "SPMI", "ACPI", "SMBIOS", "PCI", "device-tree", "default" }; @@ -291,6 +288,7 @@ struct smi_info { struct task_struct *thread; struct list_head link; + union ipmi_smi_info_union addr_info; }; #define smi_inc_stat(smi, stat) \ @@ -1186,6 +1184,18 @@ static int smi_start_processing(void *send_info, return 0; } +static int get_smi_info(void *send_info, struct ipmi_smi_info *data) +{ + struct smi_info *smi = send_info; + + data->addr_src = smi->addr_source; + data->dev = smi->dev; + data->addr_info = smi->addr_info; + get_device(smi->dev); + + return 0; +} + static void set_maintenance_mode(void *send_info, int enable) { struct smi_info *smi_info = send_info; @@ -1197,6 +1207,7 @@ static void set_maintenance_mode(void *send_info, int enable) static struct ipmi_smi_handlers handlers = { .owner = THIS_MODULE, .start_processing = smi_start_processing, + .get_smi_info = get_smi_info, .sender = sender, .request_events = request_events, .set_maintenance_mode = 
set_maintenance_mode, @@ -2156,6 +2167,7 @@ static int __devinit ipmi_pnp_probe(struct pnp_dev *dev, printk(KERN_INFO PFX "probing via ACPI\n"); handle = acpi_dev->handle; + info->addr_info.acpi_info.acpi_handle = handle; /* _IFT tells us the interface type: KCS, BT, etc */ status = acpi_evaluate_integer(handle, "_IFT", NULL, &tmp); diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 65aae34759de..045f2f275cd0 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -454,6 +454,44 @@ unsigned int ipmi_addr_length(int addr_type); /* Validate that the given IPMI address is valid. */ int ipmi_validate_addr(struct ipmi_addr *addr, int len); +/* + * How did the IPMI driver find out about the device? + */ +enum ipmi_addr_src { + SI_INVALID = 0, SI_HOTMOD, SI_HARDCODED, SI_SPMI, SI_ACPI, SI_SMBIOS, + SI_PCI, SI_DEVICETREE, SI_DEFAULT +}; + +union ipmi_smi_info_union { + /* + * the acpi_info element is defined for the SI_ACPI + * address type + */ + struct { + void *acpi_handle; + } acpi_info; +}; + +struct ipmi_smi_info { + enum ipmi_addr_src addr_src; + + /* + * Base device for the interface. Don't forget to put this when + * you are done. + */ + struct device *dev; + + /* + * The addr_info provides more detailed info for some IPMI + * devices, depending on the addr_src. Currently only SI_ACPI + * info is provided. + */ + union ipmi_smi_info_union addr_info; +}; + +/* This is to get the private info of ipmi_smi_t */ +extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); + #endif /* __KERNEL__ */ diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 4b48318ac542..906590aa6907 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -39,6 +39,7 @@ #include #include #include +#include /* This files describes the interface for IPMI system management interface drivers to bind into the IPMI message handler. */ @@ -86,6 +87,13 @@ struct ipmi_smi_handlers { int (*start_processing)(void *send_info, ipmi_smi_t new_intf); + /* + * Get the detailed private info of the low level interface and store + * it into the structure of ipmi_smi_data. For example: the + * ACPI device handle will be returned for the pnp_acpi IPMI device. + */ + int (*get_smi_info)(void *send_info, struct ipmi_smi_info *data); + /* Called to enqueue an SMI message to be sent. This operation is not allowed to fail. If an error occurs, it should report back the error in a received message. It may -- cgit v1.2.3-71-gd317 From ba9effa2ecdc08325bd297d541b4207b2df38184 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 13 Dec 2010 19:10:29 +0200 Subject: kbuild: export linux/{a.out,kvm,kvm_para}.h on headers_install_all Export linux/a.out.h, linux/kvm.h and linux/kvm_para.h on headers_install_all if at least one architecture has appropriate files in arch-dependent headers. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Michal Marek --- include/linux/Kbuild | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 97319a8fc1e0..ae31f9521a6d 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -20,15 +20,18 @@ header-y += wimax/ objhdr-y += version.h ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \ - $(srctree)/include/asm-$(SRCARCH)/a.out.h),) + $(srctree)/include/asm-$(SRCARCH)/a.out.h \ + $(INSTALL_HDR_PATH)/include/asm-*/a.out.h),) header-y += a.out.h endif ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \ - $(srctree)/include/asm-$(SRCARCH)/kvm.h),) + $(srctree)/include/asm-$(SRCARCH)/kvm.h \ + $(INSTALL_HDR_PATH)/include/asm-*/kvm.h),) header-y += kvm.h endif ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \ - $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),) + $(srctree)/include/asm-$(SRCARCH)/kvm_para.h \ + $(INSTALL_HDR_PATH)/include/asm-*/kvm_para.h),) header-y += kvm_para.h endif -- cgit v1.2.3-71-gd317 From dddd9dc340ae1a41d90e084529ca979c77c4ecfe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:35 +0100 Subject: block: kill genhd_media_change_notify() There's no user of the facility. Kill it. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 25 ------------------------- include/linux/genhd.h | 1 - 2 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44a72ff..0905ab22c8cf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1110,29 +1110,6 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ -static void media_change_notify_thread(struct work_struct *work) -{ - struct gendisk *gd = container_of(work, struct gendisk, async_notify); - char event[] = "MEDIA_CHANGE=1"; - char *envp[] = { event, NULL }; - - /* - * set enviroment vars to indicate which event this is for - * so that user space will know to go check the media status. - */ - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); - put_device(gd->driverfs_dev); -} - -#if 0 -void genhd_media_change_notify(struct gendisk *disk) -{ - get_device(disk->driverfs_dev); - schedule_work(&disk->async_notify); -} -EXPORT_SYMBOL_GPL(genhd_media_change_notify); -#endif /* 0 */ - dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1198,8 +1175,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); - INIT_WORK(&disk->async_notify, - media_change_notify_thread); } return disk; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1644e4..5e4e6928387c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -173,7 +173,6 @@ struct gendisk { struct timer_rand_state *random; atomic_t sync_io; /* RAID */ - struct work_struct async_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif -- cgit v1.2.3-71-gd317 From d2bf1b6723ed0eab378363649d15b7893bf14e91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:36 +0100 Subject: block: move register_disk() and del_gendisk() to block/genhd.c There's no reason for register_disk() and del_gendisk() to be in fs/partitions/check.c. Move both to genhd.c. 
While at it, collapse unlink_gendisk(), which was artificially in a separate function due to genhd.c / check.c split, into del_gendisk(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/partitions/check.c | 89 ------------------------------------------------- include/linux/blkdev.h | 1 - include/linux/genhd.h | 1 - 4 files changed, 87 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 0905ab22c8cf..2e5e4c0a1133 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -502,6 +502,64 @@ static int exact_lock(dev_t devt, void *data) return 0; } +void register_disk(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; + int err; + + ddev->parent = disk->driverfs_dev; + + dev_set_name(ddev, disk->disk_name); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* No minors to use for partitions */ + if (!disk_partitionable(disk)) + goto exit; + + /* No such device (e.g., media were just removed) */ + if (!get_capacity(disk)) + goto exit; + + bdev = bdget_disk(disk, 0); + if (!bdev) + goto exit; + + bdev->bd_invalidated = 1; + err = blkdev_get(bdev, FMODE_READ, NULL); + if (err < 0) + goto exit; + blkdev_put(bdev, FMODE_READ); + +exit: + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -552,17 +610,43 @@ void add_disk(struct gendisk *disk) "bdi"); WARN_ON(retval); } - EXPORT_SYMBOL(add_disk); -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ -void unlink_gendisk(struct gendisk *disk) +void del_gendisk(struct gendisk *disk) { + struct disk_part_iter piter; + struct hd_struct *part; + + /* invalidate stuff */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); + } + disk_part_iter_exit(&piter); + + invalidate_partition(disk, 0); + blk_free_devt(disk_to_dev(disk)->devt); + set_capacity(disk, 0); + disk->flags &= ~GENHD_FL_UP; + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + disk->driverfs_dev = NULL; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + device_del(disk_to_dev(disk)); } +EXPORT_SYMBOL(del_gendisk); /** * get_gendisk - get partitioning information for a given device diff --git a/fs/partitions/check.c 
b/fs/partitions/check.c index bdf8d3cc95a4..9a48d65d9855 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -516,65 +516,6 @@ out_put: return ERR_PTR(err); } -/* Not exported, helper to add_disk(). */ -void register_disk(struct gendisk *disk) -{ - struct device *ddev = disk_to_dev(disk); - struct block_device *bdev; - struct disk_part_iter piter; - struct hd_struct *part; - int err; - - ddev->parent = disk->driverfs_dev; - - dev_set_name(ddev, disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - /* No minors to use for partitions */ - if (!disk_partitionable(disk)) - goto exit; - - /* No such device (e.g., media were just removed) */ - if (!get_capacity(disk)) - goto exit; - - bdev = bdget_disk(disk, 0); - if (!bdev) - goto exit; - - bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ, NULL); - if (err < 0) - goto exit; - blkdev_put(bdev, FMODE_READ); - -exit: - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); -} - static bool disk_unlock_native_capacity(struct gendisk *disk) { const struct block_device_operations *bdops = disk->fops; @@ -737,33 +678,3 @@ fail: } EXPORT_SYMBOL(read_dev_sector); - -void del_gendisk(struct gendisk *disk) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); - while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); - delete_partition(disk, part->partno); - } - disk_part_iter_exit(&piter); - - invalidate_partition(disk, 0); - blk_free_devt(disk_to_dev(disk)->devt); - set_capacity(disk, 0); - disk->flags &= ~GENHD_FL_UP; - unlink_gendisk(disk); - part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; - if (!sysfs_deprecated) - sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); - device_del(disk_to_dev(disk)); -} diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..83031bcf8366 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -643,7 +643,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern void register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5e4e6928387c..56e17ed24816 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -394,7 +394,6 @@ extern void part_round_stats(int cpu, struct hd_struct *part); /* block/genhd.c */ extern void add_disk(struct gendisk *disk); extern void 
del_gendisk(struct gendisk *gp); -extern void unlink_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -- cgit v1.2.3-71-gd317 From 77ea887e433ad8389d416826936c110fa7910f80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:37 +0100 Subject: implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. 
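To make the driver-facing side concrete, here is a hedged sketch (not taken from any in-tree driver) of what a converted removable-media driver could register. The example_* names and the hardware query helper are hypothetical; ->check_events(), disk->events, disk->async_events and the DISK_EVENT_* flags are what this patch introduces.

/*
 * Hedged sketch: hypothetical driver advertising media-change events and
 * implementing the new ->check_events() callback.
 */
static unsigned int example_check_events(struct gendisk *disk,
					 unsigned int clearing)
{
	/*
	 * @clearing is only a hint about which events are being consumed.
	 * example_hw_media_changed() stands in for a real hardware query.
	 */
	bool changed = example_hw_media_changed(disk->private_data);

	return changed ? DISK_EVENT_MEDIA_CHANGE : 0;
}

static const struct block_device_operations example_fops = {
	.owner		= THIS_MODULE,
	.check_events	= example_check_events,
};

static void example_setup_disk(struct gendisk *disk)
{
	disk->fops = &example_fops;
	/* supported events; none can be reported without polling */
	disk->events = DISK_EVENT_MEDIA_CHANGE;
	disk->async_events = 0;
	add_disk(disk);
}

Since disk->async_events is left at zero here, such a device is only polled when block.events_dfl_poll_msecs or its events_poll_msecs attribute is set to a non-zero interval, exactly as the commit message describes.
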
Signed-off-by: Tejun Heo Cc: Kay Sievers Cc: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 429 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/block_dev.c | 41 ++++- include/linux/blkdev.h | 3 + include/linux/fs.h | 1 + include/linux/genhd.h | 18 ++- 5 files changed, 484 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 2e5e4c0a1133..5465a824d489 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "blk.h" @@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -609,6 +614,8 @@ void add_disk(struct gendisk *disk) retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); + + disk_add_events(disk); } EXPORT_SYMBOL(add_disk); @@ -617,6 +624,8 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + disk_del_events(disk); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -1089,6 +1098,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); @@ -1350,3 +1360,422 @@ int invalidate_partition(struct gendisk *disk, int partno) } EXPORT_SYMBOL(invalidate_partition); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs = 0; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll iff there are events which + * can't be monitored asynchronously. 
+ */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->events & ~disk->async_events) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +static void __disk_block_events(struct gendisk *disk, bool sync) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) { + if (sync) + cancel_delayed_work_sync(&disk->ev->dwork); + else + cancel_delayed_work(&disk->ev->dwork); + } +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + /* + * Not exactly a latency critical operation, set poll timer + * slack to 25% and kick event check. + */ + intv = disk_events_poll_jiffies(disk); + set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_block_events(disk, true); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, true); +} + +/** + * disk_check_events - schedule immediate event checking + * @disk: disk to check events for + * + * Schedule immediate event checking on @disk if not blocked. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_check_events(struct gendisk *disk) +{ + if (disk->ev) { + __disk_block_events(disk, false); + __disk_unblock_events(disk, true); + } +} +EXPORT_SYMBOL_GPL(disk_check_events); + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and clearted + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. 
+ */ +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + const struct block_device_operations *bdops = disk->fops; + struct disk_events *ev = disk->ev; + unsigned int pending; + + if (!ev) { + /* for drivers still using the old ->media_changed method */ + if ((mask & DISK_EVENT_MEDIA_CHANGE) && + bdops->media_changed && bdops->media_changed(disk)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; + } + + /* tell the workfn about the events being cleared */ + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + spin_unlock_irq(&ev->lock); + + /* uncondtionally schedule event check and wait for it to finish */ + __disk_block_events(disk, true); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + flush_delayed_work(&ev->dwork); + __disk_unblock_events(disk, false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + + return pending; +} + +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = ev->clearing; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + ev->clearing &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* tell userland about new events */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. 
+ * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->async_events, buf); +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + __disk_block_events(disk, true); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + + return count; +} + +static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + + list_for_each_entry(ev, &disk_events, node) + disk_check_events(ev->disk); + + mutex_unlock(&disk_events_mutex); + + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{add|del|release}_events - initialize and destroy disk_events. 
+ */ +static void disk_add_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !(disk->events | disk->async_events)) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + if (sysfs_create_files(&disk_to_dev(disk)->kobj, + disk_events_attrs) < 0) { + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + kfree(ev); + return; + } + + disk->ev = ev; + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + mutex_lock(&disk_events_mutex); + list_add_tail(&ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +static void disk_del_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + __disk_block_events(disk, true); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +static void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/fs/block_dev.c b/fs/block_dev.c index c1c1b8c3fb99..6017389711ee 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -948,10 +948,11 @@ int check_disk_change(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; const struct block_device_operations *bdops = disk->fops; + unsigned int events; - if (!bdops->media_changed) - return 0; - if (!bdops->media_changed(bdev->bd_disk)) + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; flush_disk(bdev); @@ -1158,9 +1159,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) if (whole) { /* finish claiming */ + mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); - if (res == 0) { + if (!res) { BUG_ON(!bd_may_claim(bdev, whole, holder)); /* * Note that for a whole device bd_holders @@ -1180,6 +1182,20 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) wake_up_bit(&whole->bd_claiming, 0); spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims. Any write + * holder makes the write_holder state stick until all + * are released. This is good enough and tracking + * individual writeable reference is too fragile given + * the way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + bdev->bd_write_holder = true; + disk_block_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); bdput(whole); } @@ -1353,12 +1369,23 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) spin_unlock(&bdev_lock); - /* if this was the last claim, holder link should go too */ - if (bdev_free) + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. 
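A rough sketch of the write-holder rule spelled out in the two fs/block_dev.c hunks above, with the exclusive-claim bookkeeping collapsed into a plain open counter (illustration only, not the kernel's locking or data structures): the first writable open blocks event polling, and the state sticks until the last opener goes away.

/*
 * Illustrative-only model of the write-holder behaviour: first write
 * open blocks event polling, last release unblocks it.  The real code
 * tracks exclusive claims under bd_mutex/bdev_lock; this does not.
 */
#include <assert.h>
#include <stdbool.h>

struct bdev_model {
	int	holders;
	bool	write_holder;
	int	ev_block;	/* event-blocking depth */
};

static void open_bdev(struct bdev_model *b, bool for_write)
{
	b->holders++;
	if (for_write && !b->write_holder) {
		b->write_holder = true;
		b->ev_block++;		/* disk_block_events() */
	}
}

static void release_bdev(struct bdev_model *b)
{
	if (--b->holders == 0 && b->write_holder) {
		b->write_holder = false;
		b->ev_block--;		/* disk_unblock_events() */
	}
}

int main(void)
{
	struct bdev_model b = { 0, false, 0 };

	open_bdev(&b, true);	/* writer arrives, events blocked */
	open_bdev(&b, false);	/* additional read-only opener */
	release_bdev(&b);
	assert(b.ev_block == 1);	/* still blocked: a holder remains */
	release_bdev(&b);
	assert(b.ev_block == 0);	/* last release unblocks events */
	return 0;
}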
+ */ + if (bdev_free) { bd_unlink_disk_holder(bdev); + if (bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } else + disk_check_events(bdev->bd_disk); + } mutex_unlock(&bdev->bd_mutex); - } + } else + disk_check_events(bdev->bd_disk); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83031bcf8366..05667e6989f1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1251,6 +1251,9 @@ struct block_device_operations { int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); + unsigned int (*check_events) (struct gendisk *disk, + unsigned int clearing); + /* ->media_changed() is DEPRECATED, use ->check_events() instead */ int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); diff --git a/include/linux/fs.h b/include/linux/fs.h index f48501563917..997d22efdef0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -662,6 +662,7 @@ struct block_device { void * bd_claiming; void * bd_holder; int bd_holders; + bool bd_write_holder; #ifdef CONFIG_SYSFS struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 56e17ed24816..13893aa2ac9d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -127,6 +127,11 @@ struct hd_struct { #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ #define GENHD_FL_NATIVE_CAPACITY 128 +enum { + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ +}; + #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -143,6 +148,8 @@ struct disk_part_tbl { struct hd_struct __rcu *part[]; }; +struct disk_events; + struct gendisk { /* major, first_minor and minors are input parameters only, * don't use directly. Use disk_devt() and disk_max_parts(). @@ -154,6 +161,10 @@ struct gendisk { char disk_name[DISK_NAME_LEN]; /* name of major driver */ char *(*devnode)(struct gendisk *gd, mode_t *mode); + + unsigned int events; /* supported events */ + unsigned int async_events; /* async events, subset of all */ + /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. 
Always access through @@ -171,8 +182,8 @@ struct gendisk { struct kobject *slave_dir; struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ + struct disk_events *ev; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif @@ -405,6 +416,11 @@ static inline int get_disk_ro(struct gendisk *disk) return disk->part0.policy; } +extern void disk_block_events(struct gendisk *disk); +extern void disk_unblock_events(struct gendisk *disk); +extern void disk_check_events(struct gendisk *disk); +extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); + /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk); extern void rand_initialize_disk(struct gendisk *disk); -- cgit v1.2.3-71-gd317 From 2d9217296bfa6fdc0d3707264076e5296faffdbd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:38 +0100 Subject: cdrom: add ->check_events() support In principle, cdrom just needs to pass through ->check_events() but CDROM_MEDIA_CHANGED ioctl makes things a bit more complex. Just as with ->media_changed() support, cdrom code needs to buffer the events and serve them to ioctl and vfs as requested. As the code has to deal with both ->check_events() and ->media_changed(), and vfs and ioctl event buffering, this patch adds check_events caching on top of the existing cdi->mc_flags buffering. It may be a good idea to deprecate CDROM_MEDIA_CHANGED ioctl and remove all this mess. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/cdrom.h | 6 ++++++ 2 files changed, 58 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index af13c62dc473..1ea8f8d363c6 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1348,7 +1348,10 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) if (!CDROM_CAN(CDC_SELECT_DISC)) return -EDRIVE_CANT_DO_THIS; - (void) cdi->ops->media_changed(cdi, slot); + if (cdi->ops->check_events) + cdi->ops->check_events(cdi, 0, slot); + else + cdi->ops->media_changed(cdi, slot); if (slot == CDSL_NONE) { /* set media changed bits, on both queues */ @@ -1392,6 +1395,41 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) return slot; } +/* + * As cdrom implements an extra ioctl consumer for media changed + * event, it needs to buffer ->check_events() output, such that event + * is not lost for both the usual VFS and ioctl paths. + * cdi->{vfs|ioctl}_events are used to buffer pending events for each + * path. + * + * XXX: Locking is non-existent. cdi->ops->check_events() can be + * called in parallel and buffering fields are accessed without any + * exclusion. The original media_changed code had the same problem. + * It might be better to simply deprecate CDROM_MEDIA_CHANGED ioctl + * and remove this cruft altogether. It doesn't have much usefulness + * at this point. + */ +static void cdrom_update_events(struct cdrom_device_info *cdi, + unsigned int clearing) +{ + unsigned int events; + + events = cdi->ops->check_events(cdi, clearing, CDSL_CURRENT); + cdi->vfs_events |= events; + cdi->ioctl_events |= events; +} + +unsigned int cdrom_check_events(struct cdrom_device_info *cdi, + unsigned int clearing) +{ + unsigned int events; + + cdrom_update_events(cdi, clearing); + events = cdi->vfs_events; + cdi->vfs_events = 0; + return events; +} + /* We want to make media_changed accessible to the user through an * ioctl. 
The main problem now is that we must double-buffer the * low-level implementation, to assure that the VFS and the user both @@ -1403,15 +1441,26 @@ int media_changed(struct cdrom_device_info *cdi, int queue) { unsigned int mask = (1 << (queue & 1)); int ret = !!(cdi->mc_flags & mask); + bool changed; if (!CDROM_CAN(CDC_MEDIA_CHANGED)) - return ret; + return ret; + /* changed since last call? */ - if (cdi->ops->media_changed(cdi, CDSL_CURRENT)) { + if (cdi->ops->check_events) { + BUG_ON(!queue); /* shouldn't be called from VFS path */ + cdrom_update_events(cdi, DISK_EVENT_MEDIA_CHANGE); + changed = cdi->ioctl_events & DISK_EVENT_MEDIA_CHANGE; + cdi->ioctl_events = 0; + } else + changed = cdi->ops->media_changed(cdi, CDSL_CURRENT); + + if (changed) { cdi->mc_flags = 0x3; /* set bit on both queues */ ret |= 1; cdi->media_written = 0; } + cdi->mc_flags &= ~mask; /* clear bit */ return ret; } diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 78e904796622..35eae4b67503 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -946,6 +946,8 @@ struct cdrom_device_info { /* device-related storage */ unsigned int options : 30; /* options flags */ unsigned mc_flags : 2; /* media change buffer flags */ + unsigned int vfs_events; /* cached events for vfs path */ + unsigned int ioctl_events; /* cached events for ioctl path */ int use_count; /* number of times device opened */ char name[20]; /* name of the device type */ /* per-device flags */ @@ -965,6 +967,8 @@ struct cdrom_device_ops { int (*open) (struct cdrom_device_info *, int); void (*release) (struct cdrom_device_info *); int (*drive_status) (struct cdrom_device_info *, int); + unsigned int (*check_events) (struct cdrom_device_info *cdi, + unsigned int clearing, int slot); int (*media_changed) (struct cdrom_device_info *, int); int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); @@ -993,6 +997,8 @@ extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode); extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, + unsigned int clearing); extern int cdrom_media_changed(struct cdrom_device_info *); extern int register_cdrom(struct cdrom_device_info *cdi); -- cgit v1.2.3-71-gd317 From d9c407b138926132e1f93c01fb2dee50eb0bb615 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 14:55:50 +0000 Subject: NFS: Introduce new-style XDR encoding functions for NFSv3 We're interested in taking advantage of the safety benefits of xdr_streams. These data structures allow more careful checking for buffer overflow while encoding. More careful type checking is also introduced in the new functions. For efficiency, we also eventually want to be able to pass xdr_streams from call_encode() to all XDR encoding functions, rather than building an xdr_stream in every XDR encoding function in the kernel. To do this means all encoders must be ready to handle a passed-in xdr_stream. The new encoders follow the modern paradigm for XDR encoders: BUG on error, and always return a zero status code. Static helper functions are left without the "inline" directive. This allows the compiler to choose automatically how to optimize these for size or speed. Signed-off-by: Chuck Lever Tested-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs3xdr.c | 833 ++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nfs3.h | 2 + 2 files changed, 832 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index f6cc60f06dac..3d1043f7667c 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -37,6 +37,7 @@ #define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) #define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) #define NFS3_fattr_sz (21) +#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) #define NFS3_wcc_attr_sz (6) #define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) #define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) @@ -59,7 +60,8 @@ #define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) #define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) #define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) -#define NFS3_readdirargs_sz (NFS3_fh_sz+2) +#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4) #define NFS3_commitargs_sz (NFS3_fh_sz+3) #define NFS3_attrstat_sz (1+NFS3_fattr_sz) @@ -107,6 +109,22 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) func, xdr->end - xdr->p); } +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + + /* * Common NFS XDR functions as inlines */ @@ -153,7 +171,7 @@ out_overflow: * Encode/decode time. */ static inline __be32 * -xdr_encode_time3(__be32 *p, struct timespec *timep) +xdr_encode_time3(__be32 *p, const struct timespec *timep) { *p++ = htonl(timep->tv_sec); *p++ = htonl(timep->tv_nsec); @@ -205,7 +223,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) } static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +xdr_encode_sattr(__be32 *p, const struct iattr *attr) { if (attr->ia_valid & ATTR_MODE) { *p++ = xdr_one; @@ -306,6 +324,243 @@ xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr) return xdr_decode_post_op_attr(p, fattr); } + +/* + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. 
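The encoders that follow all share one shape: reserve a 4-byte-aligned slot in the send buffer, then fill it with big-endian words. The sketch below imitates that reserve-then-fill pattern on a flat userspace buffer; it only mirrors xdr_reserve_space()/encode_uint32() in spirit and is not the kernel xdr_stream API.

/*
 * Userspace sketch of the XDR reserve-then-fill pattern.  The struct
 * and helpers here are invented for illustration; they are not the
 * kernel's xdr_stream interface.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct xdr_model {
	uint8_t	data[64];
	size_t	pos;
};

static void *reserve(struct xdr_model *x, size_t nbytes)
{
	void *p = x->data + x->pos;

	x->pos += (nbytes + 3) & ~(size_t)3;	/* XDR pads to 4-byte units */
	return p;
}

static void encode_u32(struct xdr_model *x, uint32_t v)
{
	uint32_t be = htonl(v);

	memcpy(reserve(x, 4), &be, 4);
}

static void encode_opaque(struct xdr_model *x, const void *d, uint32_t len)
{
	encode_u32(x, len);			/* length word first */
	memcpy(reserve(x, len), d, len);	/* then the padded payload */
}

int main(void)
{
	struct xdr_model x = { .pos = 0 };

	encode_opaque(&x, "file.txt", 8);	/* shaped like a filename3 */
	printf("encoded %zu bytes\n", x.pos);	/* 4 + 8 = 12 */
	return 0;
}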
+ */ + +static void encode_uint32(struct xdr_stream *xdr, u32 value) +{ + __be32 *p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * filename3 + * + * typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS3_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * nfspath3 + * + * typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, + const u32 length) +{ + BUG_ON(length > NFS3_MAXPATHLEN); + encode_uint32(xdr, length); + xdr_write_pages(xdr, pages, 0, length); +} + +/* + * cookie3 + * + * typedef uint64 cookie3 + */ +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) +{ + return xdr_encode_hyper(p, cookie); +} + +/* + * cookieverf3 + * + * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; + */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ + memcpy(p, verifier, NFS3_COOKIEVERFSIZE); + return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +/* + * createverf3 + * + * typedef opaque createverf3[NFS3_CREATEVERFSIZE]; + */ +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); + memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +/* + * ftype3 + * + * enum ftype3 { + * NF3REG = 1, + * NF3DIR = 2, + * NF3BLK = 3, + * NF3CHR = 4, + * NF3LNK = 5, + * NF3SOCK = 6, + * NF3FIFO = 7 + * }; + */ +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) +{ + BUG_ON(type > NF3FIFO); + encode_uint32(xdr, type); +} + +/* + * specdata3 + * + * struct specdata3 { + * uint32 specdata1; + * uint32 specdata2; + * }; + */ +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(MAJOR(rdev)); + *p = cpu_to_be32(MINOR(rdev)); +} + +/* + * nfs_fh3 + * + * struct nfs_fh3 { + * opaque data; + * }; + */ +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size > NFS3_FHSIZE); + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); +} + +/* + * sattr3 + * + * enum time_how { + * DONT_CHANGE = 0, + * SET_TO_SERVER_TIME = 1, + * SET_TO_CLIENT_TIME = 2 + * }; + * + * union set_mode3 switch (bool set_it) { + * case TRUE: + * mode3 mode; + * default: + * void; + * }; + * + * union set_uid3 switch (bool set_it) { + * case TRUE: + * uid3 uid; + * default: + * void; + * }; + * + * union set_gid3 switch (bool set_it) { + * case TRUE: + * gid3 gid; + * default: + * void; + * }; + * + * union set_size3 switch (bool set_it) { + * case TRUE: + * size3 size; + * default: + * void; + * }; + * + * union set_atime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 atime; + * default: + * void; + * }; + * + * union set_mtime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 mtime; + * default: + * void; + * }; + * + * struct sattr3 { + * set_mode3 mode; + * set_uid3 uid; + * set_gid3 gid; + * set_size3 size; + * set_atime atime; + * set_mtime mtime; + * }; + */ +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +{ + u32 nbytes; + __be32 *p; + + /* + * In order to make only a single xdr_reserve_space() call, + * pre-compute the total number of bytes to be reserved. 
+ * Six boolean values, one for each set_foo field, are always + * present in the encoded result, so start there. + */ + nbytes = 6 * 4; + if (attr->ia_valid & ATTR_MODE) + nbytes += 4; + if (attr->ia_valid & ATTR_UID) + nbytes += 4; + if (attr->ia_valid & ATTR_GID) + nbytes += 4; + if (attr->ia_valid & ATTR_SIZE) + nbytes += 8; + if (attr->ia_valid & ATTR_ATIME_SET) + nbytes += 8; + if (attr->ia_valid & ATTR_MTIME_SET) + nbytes += 8; + p = xdr_reserve_space(xdr, nbytes); + + xdr_encode_sattr(p, attr); +} + +/* + * diropargs3 + * + * struct diropargs3 { + * nfs_fh3 dir; + * filename3 name; + * }; + */ +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_nfs_fh3(xdr, fh); + encode_filename3(xdr, name, length); +} + + /* * NFS encode functions */ @@ -321,6 +576,23 @@ nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) return 0; } +/* + * 3.3.1 GETATTR3args + * + * struct GETATTR3args { + * nfs_fh3 object; + * }; + */ +static int nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, __be32 *p, + const struct nfs_fh *fh) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_nfs_fh3(&xdr, fh); + return 0; +} + /* * Encode SETATTR arguments */ @@ -336,6 +608,49 @@ nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) return 0; } +/* + * 3.3.2 SETATTR3args + * + * union sattrguard3 switch (bool check) { + * case TRUE: + * nfstime3 obj_ctime; + * case FALSE: + * void; + * }; + * + * struct SETATTR3args { + * nfs_fh3 object; + * sattr3 new_attributes; + * sattrguard3 guard; + * }; + */ +static void encode_sattrguard3(struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + __be32 *p; + + if (args->guard) { + p = xdr_reserve_space(xdr, 4 + 8); + *p++ = xdr_one; + xdr_encode_time3(p, &args->guardtime); + } else { + p = xdr_reserve_space(xdr, 4); + *p = xdr_zero; + } +} + +static int nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_sattrargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_nfs_fh3(&xdr, args->fh); + encode_sattr3(&xdr, args->sattr); + encode_sattrguard3(&xdr, args); + return 0; +} + /* * Encode directory ops argument */ @@ -348,6 +663,23 @@ nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) return 0; } +/* + * 3.3.3 LOOKUP3args + * + * struct LOOKUP3args { + * diropargs3 what; + * }; + */ +static int nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_diropargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_diropargs3(&xdr, args->fh, args->name, args->len); + return 0; +} + /* * Encode REMOVE argument */ @@ -372,6 +704,50 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg return 0; } +/* + * 3.3.4 ACCESS3args + * + * struct ACCESS3args { + * nfs_fh3 object; + * uint32 access; + * }; + */ +static void encode_access3args(struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->access); +} + +static int nfs3_xdr_enc_access3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_accessargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_access3args(&xdr, args); + return 0; +} + +/* + * 3.3.5 READLINK3args + * + * struct READLINK3args { + * nfs_fh3 symlink; + * }; + */ +static int nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, 
__be32 *p, + const struct nfs3_readlinkargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_nfs_fh3(&xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); + return 0; +} + /* * Arguments to a READ call. Since we read data directly into the page * cache, we also set up the reply iovec here so that iov[1] points @@ -397,6 +773,40 @@ nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) return 0; } +/* + * 3.3.6 READ3args + * + * struct READ3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_read3args(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static int nfs3_xdr_enc_read3args(struct rpc_rqst *req, __be32 *p, + const struct nfs_readargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_read3args(&xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS3_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; + return 0; +} + /* * Write arguments. Splice the buffer to be written into the iovec. */ @@ -419,6 +829,52 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) return 0; } +/* + * 3.3.7 WRITE3args + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * stable_how stable; + * opaque data<>; + * }; + */ +static void encode_write3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->count); + + BUG_ON(args->stable > NFS_FILE_SYNC); + *p++ = cpu_to_be32(args->stable); + + *p = cpu_to_be32(args->count); + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static int nfs3_xdr_enc_write3args(struct rpc_rqst *req, __be32 *p, + const struct nfs_writeargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_write3args(&xdr, args); + xdr.buf->flags |= XDRBUF_WRITE; + return 0; +} + /* * Encode CREATE arguments */ @@ -439,6 +895,56 @@ nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *arg return 0; } +/* + * 3.3.8 CREATE3args + * + * enum createmode3 { + * UNCHECKED = 0, + * GUARDED = 1, + * EXCLUSIVE = 2 + * }; + * + * union createhow3 switch (createmode3 mode) { + * case UNCHECKED: + * case GUARDED: + * sattr3 obj_attributes; + * case EXCLUSIVE: + * createverf3 verf; + * }; + * + * struct CREATE3args { + * diropargs3 where; + * createhow3 how; + * }; + */ +static void encode_createhow3(struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + encode_sattr3(xdr, args->sattr); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); + break; + default: + BUG(); + } +} + +static int nfs3_xdr_enc_create3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_createargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_diropargs3(&xdr, args->fh, args->name, args->len); + 
encode_createhow3(&xdr, args); + return 0; +} + /* * Encode MKDIR arguments */ @@ -452,6 +958,25 @@ nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args) return 0; } +/* + * 3.3.9 MKDIR3args + * + * struct MKDIR3args { + * diropargs3 where; + * sattr3 attributes; + * }; + */ +static int nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_mkdirargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_diropargs3(&xdr, args->fh, args->name, args->len); + encode_sattr3(&xdr, args->sattr); + return 0; +} + /* * Encode SYMLINK arguments */ @@ -469,6 +994,37 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *a return 0; } +/* + * 3.3.10 SYMLINK3args + * + * struct symlinkdata3 { + * sattr3 symlink_attributes; + * nfspath3 symlink_data; + * }; + * + * struct SYMLINK3args { + * diropargs3 where; + * symlinkdata3 symlink; + * }; + */ +static void encode_symlinkdata3(struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_nfspath3(xdr, args->pages, args->pathlen); +} + +static int nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_symlinkargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_diropargs3(&xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(&xdr, args); + return 0; +} + /* * Encode MKNOD arguments */ @@ -488,6 +1044,86 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) return 0; } +/* + * 3.3.11 MKNOD3args + * + * struct devicedata3 { + * sattr3 dev_attributes; + * specdata3 spec; + * }; + * + * union mknoddata3 switch (ftype3 type) { + * case NF3CHR: + * case NF3BLK: + * devicedata3 device; + * case NF3SOCK: + * case NF3FIFO: + * sattr3 pipe_attributes; + * default: + * void; + * }; + * + * struct MKNOD3args { + * diropargs3 where; + * mknoddata3 what; + * }; + */ +static void encode_devicedata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_specdata3(xdr, args->rdev); +} + +static void encode_mknoddata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: + encode_devicedata3(xdr, args); + break; + case NF3SOCK: + case NF3FIFO: + encode_sattr3(xdr, args->sattr); + break; + case NF3REG: + case NF3DIR: + break; + default: + BUG(); + } +} + +static int nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_mknodargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_diropargs3(&xdr, args->fh, args->name, args->len); + encode_mknoddata3(&xdr, args); + return 0; +} + +/* + * 3.3.12 REMOVE3args + * + * struct REMOVE3args { + * diropargs3 object; + * }; + */ +static int nfs3_xdr_enc_remove3args(struct rpc_rqst *req, __be32 *p, + const struct nfs_removeargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_diropargs3(&xdr, args->fh, args->name.name, args->name.len); + return 0; +} + /* * Encode RENAME arguments */ @@ -502,6 +1138,27 @@ nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args return 0; } +/* + * 3.3.14 RENAME3args + * + * struct RENAME3args { + * diropargs3 from; + * diropargs3 to; + * }; + */ +static int nfs3_xdr_enc_rename3args(struct rpc_rqst *req, __be32 *p, + const struct nfs_renameargs *args) 
+{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_diropargs3(&xdr, args->old_dir, old->name, old->len); + encode_diropargs3(&xdr, args->new_dir, new->name, new->len); + return 0; +} + /* * Encode LINK arguments */ @@ -515,6 +1172,25 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) return 0; } +/* + * 3.3.15 LINK3args + * + * struct LINK3args { + * nfs_fh3 file; + * diropargs3 link; + * }; + */ +static int nfs3_xdr_enc_link3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_linkargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_nfs_fh3(&xdr, args->fromfh); + encode_diropargs3(&xdr, args->tofh, args->toname, args->tolen); + return 0; +} + /* * Encode arguments to readdir call */ @@ -543,6 +1219,84 @@ nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *a return 0; } +/* + * 3.3.16 READDIR3args + * + * struct READDIR3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 count; + * }; + */ +static void encode_readdir3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + *p = cpu_to_be32(args->count); +} + +static int nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_readdirargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_readdir3args(&xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); + return 0; +} + +/* + * 3.3.17 READDIRPLUS3args + * + * struct READDIRPLUS3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 dircount; + * count3 maxcount; + * }; + */ +static void encode_readdirplus3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + + /* + * readdirplus: need dircount + buffer size. + * We just make sure we make dircount big enough + */ + *p++ = cpu_to_be32(args->count >> 3); + + *p = cpu_to_be32(args->count); +} + +static int nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_readdirargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_readdirplus3args(&xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); + return 0; +} + /* * Decode the result of a readdir call. * We just check for syntactical correctness. 
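As a quick back-of-the-envelope check on encode_readdir3args()/encode_readdirplus3args() above (the count value below is hypothetical, not taken from the patch): the fixed trailer after the file handle is cookie3 (8 bytes) plus cookieverf3 (8 bytes) plus one or two count3 words, and the READDIRPLUS dircount hint is simply the caller's count divided by 8.

/*
 * Illustrative arithmetic only; mirrors the sizes reserved by the
 * readdir/readdirplus encoders above with a made-up count value.
 */
#include <stdio.h>

#define NFS3_COOKIEVERFSIZE	8	/* as added to include/linux/nfs3.h */

int main(void)
{
	unsigned int count = 32768;	/* hypothetical args->count */

	printf("READDIR3 trailer:     %u bytes\n",
	       8 + NFS3_COOKIEVERFSIZE + 4);		/* 20 */
	printf("READDIRPLUS3 trailer: %u bytes\n",
	       8 + NFS3_COOKIEVERFSIZE + 4 + 4);	/* 24 */
	printf("dircount hint: %u, maxcount: %u\n",
	       count >> 3, count);			/* 4096, 32768 */
	return 0;
}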
@@ -674,6 +1428,37 @@ nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) return 0; } +/* + * 3.3.21 COMMIT3args + * + * struct COMMIT3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_commit3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static int nfs3_xdr_enc_commit3args(struct rpc_rqst *req, __be32 *p, + const struct nfs_writeargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_commit3args(&xdr, args); + return 0; +} + #ifdef CONFIG_NFS_V3_ACL /* * Encode GETACL arguments @@ -699,6 +1484,21 @@ nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, return 0; } +static int nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_getaclargs *args) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_nfs_fh3(&xdr, args->fh); + encode_uint32(&xdr, args->mask); + if (args->mask & (NFS_ACL | NFS_DFACL)) + prepare_reply_buffer(req, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT, + ACL3_getaclres_sz); + return 0; +} + /* * Encode SETACL arguments */ @@ -731,6 +1531,33 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, NFS_ACL_DEFAULT); return (err > 0) ? 0 : err; } + +static int nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, __be32 *p, + const struct nfs3_setaclargs *args) +{ + struct xdr_stream xdr; + unsigned int base; + int error; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_nfs_fh3(&xdr, NFS_FH(args->inode)); + encode_uint32(&xdr, args->mask); + if (args->npages != 0) + xdr_write_pages(&xdr, args->pages, 0, args->len); + + base = req->rq_slen; + error = nfsacl_encode(xdr.buf, base, args->inode, + (args->mask & NFS_ACL) ? + args->acl_access : NULL, 1, 0); + BUG_ON(error < 0); + error = nfsacl_encode(xdr.buf, base + error, args->inode, + (args->mask & NFS_DFACL) ? + args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + BUG_ON(error < 0); + return 0; +} + #endif /* CONFIG_NFS_V3_ACL */ /* diff --git a/include/linux/nfs3.h b/include/linux/nfs3.h index ac33806ec7f9..e2ee14fd0083 100644 --- a/include/linux/nfs3.h +++ b/include/linux/nfs3.h @@ -11,6 +11,8 @@ #define NFS3_MAXGROUPS 16 #define NFS3_FHSIZE 64 #define NFS3_COOKIESIZE 4 +#define NFS3_CREATEVERFSIZE 8 +#define NFS3_COOKIEVERFSIZE 8 #define NFS3_FIFO_DEV (-1) #define NFS3MODE_FMT 0170000 #define NFS3MODE_DIR 0040000 -- cgit v1.2.3-71-gd317 From e4f9323409369a3aeb01885c0c4409d2eeec794a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 14:56:30 +0000 Subject: NFS: Introduce new-style XDR decoding functions for NFSv2 We'd like to prevent local buffer overflows caused by malicious or broken servers. New xdr_stream style decoders can do that. For efficiency, we also eventually want to be able to pass xdr_streams from call_decode() to all XDR decoding functions, rather than building an xdr_stream in every XDR decoding function in the kernel. Static helper functions are left without the "inline" directive. This allows the compiler to choose automatically how to optimize these for size or speed. Signed-off-by: Chuck Lever Tested-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs3xdr.c | 1534 +++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/nfs3.h | 1 + 2 files changed, 1451 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 119844d0b4d5..0f07c6d55131 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -104,13 +104,6 @@ static const umode_t nfs_type2fmt[] = { [NF3FIFO] = S_IFIFO, }; -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("nfs: %s: prematurely hit end of receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - /* * While encoding arguments, set up the reply buffer in advance to * receive reply data directly into the page cache. @@ -126,6 +119,16 @@ static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); } +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + /* * Common NFS XDR functions as inlines @@ -284,6 +287,44 @@ static void encode_uint32(struct xdr_stream *xdr, u32 value) *p = cpu_to_be32(value); } +static int decode_uint32(struct xdr_stream *xdr, u32 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *value = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_uint64(struct xdr_stream *xdr, u64 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_hyper(p, value); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * fileid3 + * + * typedef uint64 fileid3; + */ +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ + return decode_uint64(xdr, fileid); +} + /* * filename3 * @@ -299,6 +340,33 @@ static void encode_filename3(struct xdr_stream *xdr, xdr_encode_opaque(p, name, length); } +static int decode_inline_filename3(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; + +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + /* * nfspath3 * @@ -312,6 +380,39 @@ static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, xdr_write_pages(xdr, pages, 0, length); } +static int decode_nfspath3(struct xdr_stream *xdr) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) + goto out_nametoolong; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, count); + xdr_terminate_string(xdr->buf, count); + return 0; + +out_nametoolong: + 
dprintk("NFS: returned pathname too long: %u\n", count); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "count %u > recvd %u\n", count, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + /* * cookie3 * @@ -322,6 +423,11 @@ static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) return xdr_encode_hyper(p, cookie); } +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie) +{ + return decode_uint64(xdr, cookie); +} + /* * cookieverf3 * @@ -333,6 +439,20 @@ static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); } +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_COOKIEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + /* * createverf3 * @@ -346,6 +466,54 @@ static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) memcpy(p, verifier, NFS3_CREATEVERFSIZE); } +static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_WRITEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * size3 + * + * typedef uint64 size3; + */ +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ + return xdr_decode_hyper(p, size); +} + +/* + * nfsstat3 + * + * enum nfsstat3 { + * NFS3_OK = 0, + * ... + * } + */ +#define NFS3_OK NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + /* * ftype3 * @@ -398,6 +566,36 @@ static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) xdr_encode_opaque(p, fh->data, fh->size); } +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > NFS3_FHSIZE)) + goto out_toobig; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = length; + memcpy(fh->data, p, length); + return 0; +out_toobig: + dprintk("NFS: file handle size (%u) too big\n", length); + return -E2BIG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(*fh)); +} + /* * nfstime3 * @@ -540,6 +738,153 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) *p = xdr_zero; } +/* + * fattr3 + * + * struct fattr3 { + * ftype3 type; + * mode3 mode; + * uint32 nlink; + * uid3 uid; + * gid3 gid; + * size3 size; + * size3 used; + * specdata3 rdev; + * uint64 fsid; + * fileid3 fileid; + * nfstime3 atime; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_fattr(p, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 
post_op_attr + * + * union post_op_attr switch (bool attributes_follow) { + * case TRUE: + * fattr3 attributes; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_fattr3(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * wcc_attr + * struct wcc_attr { + * size3 size; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_wcc_attr(p, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * pre_op_attr + * union pre_op_attr switch (bool attributes_follow) { + * case TRUE: + * wcc_attr attributes; + * case FALSE: + * void; + * }; + * + * wcc_data + * + * struct wcc_data { + * pre_op_attr before; + * post_op_attr after; + * }; + */ +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_wcc_attr(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + int error; + + error = decode_pre_op_attr(xdr, fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, fattr); +out: + return error; +} + +/* + * post_op_fh3 + * + * union post_op_fh3 switch (bool handle_follows) { + * case TRUE: + * nfs_fh3 handle; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_nfs_fh3(xdr, fh); + zero_nfs_fh3(fh); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + /* * diropargs3 * @@ -1108,78 +1453,6 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res return pglen; } -__be32 * -nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) -{ - __be32 *p; - struct nfs_entry old = *entry; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (!ntohl(*p++)) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); - entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); - } - - p = xdr_inline_decode(xdr, 12); - if (unlikely(!p)) - goto out_overflow; - p = xdr_decode_hyper(p, &entry->ino); - entry->len = ntohl(*p++); - - p = xdr_inline_decode(xdr, entry->len + 8); - if (unlikely(!p)) - goto out_overflow; - entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); - - entry->d_type = DT_UNKNOWN; - if (plus) { - entry->fattr->valid = 0; - p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); - if (IS_ERR(p)) - goto out_overflow_exit; - entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - /* In fact, a post_op_fh3: */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (*p++) { - p = xdr_decode_fhandle_stream(xdr, entry->fh); - if 
(IS_ERR(p)) - goto out_overflow_exit; - /* Ugh -- server reply was truncated */ - if (p == NULL) { - dprintk("NFS: FH truncated\n"); - *entry = old; - return ERR_PTR(-EAGAIN); - } - } else - memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); - } - - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - - return p; - -out_overflow: - print_overflow_msg(__func__, xdr); -out_overflow_exit: - return ERR_PTR(-EAGAIN); -} - /* * 3.3.21 COMMIT3args * @@ -1275,13 +1548,47 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) } /* - * Decode status+wcc_data reply - * SATTR, REMOVE, RMDIR + * 3.3.1 GETATTR3res + * + * struct GETATTR3resok { + * fattr3 obj_attributes; + * }; + * + * union GETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * GETATTR3resok resok; + * default: + * void; + * }; */ -static int -nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, __be32 *p, + struct nfs_fattr *result) { - int status; + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_fattr3(&xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * Decode status+wcc_data reply + * SATTR, REMOVE, RMDIR + */ +static int +nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +{ + int status; if ((status = ntohl(*p++))) status = nfs_stat_to_errno(status); @@ -1289,6 +1596,46 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) return status; } +/* + * 3.3.2 SETATTR3res + * + * struct SETATTR3resok { + * wcc_data obj_wcc; + * }; + * + * struct SETATTR3resfail { + * wcc_data obj_wcc; + * }; + * + * union SETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * SETATTR3resok resok; + * default: + * SETATTR3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, __be32 *p, + struct nfs_fattr *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(&xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + static int nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) { @@ -1314,6 +1661,55 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) return status; } +/* + * 3.3.3 LOOKUP3res + * + * struct LOOKUP3resok { + * nfs_fh3 object; + * post_op_attr obj_attributes; + * post_op_attr dir_attributes; + * }; + * + * struct LOOKUP3resfail { + * post_op_attr dir_attributes; + * }; + * + * union LOOKUP3res switch (nfsstat3 status) { + * case NFS3_OK: + * LOOKUP3resok resok; + * default: + * LOOKUP3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, __be32 *p, + struct nfs3_diropres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfs_fh3(&xdr, result->fh); + if (unlikely(error)) + goto out; + 
error = decode_post_op_attr(&xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result->dir_attr); +out: + return error; +out_default: + error = decode_post_op_attr(&xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + /* * Decode ACCESS reply */ @@ -1329,6 +1725,48 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) return 0; } +/* + * 3.3.4 ACCESS3res + * + * struct ACCESS3resok { + * post_op_attr obj_attributes; + * uint32 access; + * }; + * + * struct ACCESS3resfail { + * post_op_attr obj_attributes; + * }; + * + * union ACCESS3res switch (nfsstat3 status) { + * case NFS3_OK: + * ACCESS3resok resok; + * default: + * ACCESS3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, __be32 *p, + struct nfs3_accessres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_uint32(&xdr, &result->access); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + /* * Decode READLINK reply */ @@ -1375,6 +1813,48 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) return 0; } +/* + * 3.3.5 READLINK3res + * + * struct READLINK3resok { + * post_op_attr symlink_attributes; + * nfspath3 data; + * }; + * + * struct READLINK3resfail { + * post_op_attr symlink_attributes; + * }; + * + * union READLINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * READLINK3resok resok; + * default: + * READLINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, __be32 *p, + struct nfs_fattr *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfspath3(&xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + /* * Decode READ reply */ @@ -1428,6 +1908,90 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) return count; } +/* + * 3.3.6 READ3res + * + * struct READ3resok { + * post_op_attr file_attributes; + * count3 count; + * bool eof; + * opaque data<>; + * }; + * + * struct READ3resfail { + * post_op_attr file_attributes; + * }; + * + * union READ3res switch (nfsstat3 status) { + * case NFS3_OK: + * READ3resok resok; + * default: + * READ3resfail resfail; + * }; + */ +static int decode_read3resok(struct xdr_stream *xdr, + struct nfs_readres *result) +{ + u32 eof, count, ocount, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p++); + eof = be32_to_cpup(p++); + ocount = be32_to_cpup(p++); + if (unlikely(ocount != count)) + goto out_mismatch; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + +out: + xdr_read_pages(xdr, count); + result->eof = eof; + result->count = count; + return count; +out_mismatch: + dprintk("NFS: READ count doesn't match length of opaque: " + "count %u 
!= ocount %u\n", count, ocount); + return -EIO; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, __be32 *p, + struct nfs_readres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_read3resok(&xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + /* * Decode WRITE response */ @@ -1450,6 +2014,78 @@ nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) return res->count; } +/* + * 3.3.7 WRITE3res + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3resok { + * wcc_data file_wcc; + * count3 count; + * stable_how committed; + * writeverf3 verf; + * }; + * + * struct WRITE3resfail { + * wcc_data file_wcc; + * }; + * + * union WRITE3res switch (nfsstat3 status) { + * case NFS3_OK: + * WRITE3resok resok; + * default: + * WRITE3resfail resfail; + * }; + */ +static int decode_write3resok(struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + result->count = be32_to_cpup(p++); + result->verf->committed = be32_to_cpup(p++); + if (unlikely(result->verf->committed > NFS_FILE_SYNC)) + goto out_badvalue; + memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE); + return result->count; +out_badvalue: + dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, __be32 *p, + struct nfs_writeres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(&xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_write3resok(&xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + /* * Decode a CREATE response */ @@ -1477,6 +2113,111 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) return status; } +/* + * 3.3.8 CREATE3res + * + * struct CREATE3resok { + * post_op_fh3 obj; + * post_op_attr obj_attributes; + * wcc_data dir_wcc; + * }; + * + * struct CREATE3resfail { + * wcc_data dir_wcc; + * }; + * + * union CREATE3res switch (nfsstat3 status) { + * case NFS3_OK: + * CREATE3resok resok; + * default: + * CREATE3resfail resfail; + * }; + */ +static int decode_create3resok(struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + int error; + + error = decode_post_op_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + /* The server isn't required to return a file handle. + * If it didn't, force the client to perform a LOOKUP + * to determine the correct file handle and attribute + * values for the new object. 
*/ + if (result->fh->size == 0) + result->fattr->valid = 0; + error = decode_wcc_data(xdr, result->dir_attr); +out: + return error; +} + +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, __be32 *p, + struct nfs3_diropres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_create3resok(&xdr, result); +out: + return error; +out_default: + error = decode_wcc_data(&xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + +/* + * 3.3.12 REMOVE3res + * + * struct REMOVE3resok { + * wcc_data dir_wcc; + * }; + * + * struct REMOVE3resfail { + * wcc_data dir_wcc; + * }; + * + * union REMOVE3res switch (nfsstat3 status) { + * case NFS3_OK: + * REMOVE3resok resok; + * default: + * REMOVE3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, __be32 *p, + struct nfs_removeres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(&xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + /* * Decode RENAME reply */ @@ -1492,6 +2233,51 @@ nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res) return status; } +/* + * 3.3.14 RENAME3res + * + * struct RENAME3resok { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * struct RENAME3resfail { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * union RENAME3res switch (nfsstat3 status) { + * case NFS3_OK: + * RENAME3resok resok; + * default: + * RENAME3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, __be32 *p, + struct nfs_renameres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(&xdr, result->old_fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(&xdr, result->new_fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + /* * Decode LINK reply */ @@ -1507,6 +2293,249 @@ nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) return status; } +/* + * 3.3.15 LINK3res + * + * struct LINK3resok { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * struct LINK3resfail { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * union LINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * LINK3resok resok; + * default: + * LINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, __be32 *p, + struct nfs3_linkres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(&xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto 
out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + * the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @server: nfs_server data for this directory + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns the position of the next item in the buffer, or an ERR_PTR. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16 entry3 + * + * struct entry3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * fhandle3 filehandle; + * post_op_attr3 attributes; + * entry3 *nextentry; + * }; + * + * 3.3.17 entryplus3 + * struct entryplus3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * post_op_attr name_attributes; + * post_op_fh3 name_handle; + * entryplus3 *nextentry; + * }; + */ +__be32 *nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + struct nfs_server *server, int plus) +{ + struct nfs_entry old = *entry; + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) + return ERR_PTR(-EAGAIN); + entry->eof = 1; + return ERR_PTR(-EBADCOOKIE); + } + + error = decode_fileid3(xdr, &entry->ino); + if (unlikely(error)) + return ERR_PTR(error); + + error = decode_inline_filename3(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return ERR_PTR(error); + + entry->prev_cookie = entry->cookie; + error = decode_cookie3(xdr, &entry->cookie); + if (unlikely(error)) + return ERR_PTR(error); + + entry->d_type = DT_UNKNOWN; + + if (plus) { + entry->fattr->valid = 0; + error = decode_post_op_attr(xdr, entry->fattr); + if (unlikely(error)) + return ERR_PTR(error); + if (entry->fattr->valid & NFS_ATTR_FATTR_V3) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) { + error = decode_nfs_fh3(xdr, entry->fh); + if (unlikely(error)) { + if (error == -E2BIG) + goto out_truncated; + return ERR_PTR(error); + } + } else + zero_nfs_fh3(entry->fh); + } + + /* Peek at the next entry to see if we're at EOD */ + p = xdr_inline_peek(xdr, 4 + 4); + entry->eof = 0; + if (p != NULL) + entry->eof = (p[0] == xdr_zero) && (p[1] != xdr_zero); + return p; + +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EAGAIN); +out_truncated: + dprintk("NFS: directory entry contains invalid file handle\n"); + *entry = old; + return ERR_PTR(-EAGAIN); +} + +/* + * 3.3.16 READDIR3res + * + * struct dirlist3 { + * entry3 *entries; + * bool eof; + * }; + * + * struct READDIR3resok { + * post_op_attr dir_attributes; + * cookieverf3 cookieverf; + * dirlist3 reply; + * }; + * + * struct READDIR3resfail { + * post_op_attr dir_attributes; + * }; + * + * union READDIR3res switch (nfsstat3 status) { + * case NFS3_OK: + * READDIR3resok resok; + * default: + * READDIR3resfail resfail; + * }; + * + * Read the directory contents into the page cache, but otherwise + * don't touch them. The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls. 
+ */ +static int decode_dirlist3(struct xdr_stream *xdr) +{ + u32 recvd, pglen; + size_t hdrlen; + + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; +} + +static int decode_readdir3resok(struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + int error; + + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + /* XXX: do we need to check if result->verf != NULL ? */ + error = decode_cookieverf3(xdr, result->verf); + if (unlikely(error)) + goto out; + error = decode_dirlist3(xdr); +out: + return error; +} + +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, __be32 *p, + struct nfs3_readdirres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_readdir3resok(&xdr, result); +out: + return error; +out_default: + error = decode_post_op_attr(&xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + /* * Decode FSSTAT reply */ @@ -1532,6 +2561,75 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) return 0; } +/* + * 3.3.18 FSSTAT3res + * + * struct FSSTAT3resok { + * post_op_attr obj_attributes; + * size3 tbytes; + * size3 fbytes; + * size3 abytes; + * size3 tfiles; + * size3 ffiles; + * size3 afiles; + * uint32 invarsec; + * }; + * + * struct FSSTAT3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSSTAT3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSSTAT3resok resok; + * default: + * FSSTAT3resfail resfail; + * }; + */ +static int decode_fsstat3resok(struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8 * 6 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p = xdr_decode_size3(p, &result->tbytes); + p = xdr_decode_size3(p, &result->fbytes); + p = xdr_decode_size3(p, &result->abytes); + p = xdr_decode_size3(p, &result->tfiles); + p = xdr_decode_size3(p, &result->ffiles); + xdr_decode_size3(p, &result->afiles); + /* ignore invarsec */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, __be32 *p, + struct nfs_fsstat *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsstat3resok(&xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + /* * Decode FSINFO reply */ @@ -1561,6 +2659,83 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) return 0; } +/* + * 3.3.19 FSINFO3res + * + * struct FSINFO3resok { + * post_op_attr obj_attributes; + * uint32 rtmax; + * uint32 rtpref; + * uint32 rtmult; + * uint32 wtmax; + * uint32 wtpref; + * uint32 wtmult; + * uint32 dtpref; + * size3 maxfilesize; + * nfstime3 time_delta; + * uint32 properties; + * }; + * + * struct 
FSINFO3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSINFO3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSINFO3resok resok; + * default: + * FSINFO3resfail resfail; + * }; + */ +static int decode_fsinfo3resok(struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->rtmax = be32_to_cpup(p++); + result->rtpref = be32_to_cpup(p++); + result->rtmult = be32_to_cpup(p++); + result->wtmax = be32_to_cpup(p++); + result->wtpref = be32_to_cpup(p++); + result->wtmult = be32_to_cpup(p++); + result->dtpref = be32_to_cpup(p++); + p = xdr_decode_size3(p, &result->maxfilesize); + xdr_decode_time3(p, &result->time_delta); + + /* ignore properties */ + result->lease_time = 0; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, __be32 *p, + struct nfs_fsinfo *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsinfo3resok(&xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + /* * Decode PATHCONF reply */ @@ -1581,6 +2756,70 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) return 0; } +/* + * 3.3.20 PATHCONF3res + * + * struct PATHCONF3resok { + * post_op_attr obj_attributes; + * uint32 linkmax; + * uint32 name_max; + * bool no_trunc; + * bool chown_restricted; + * bool case_insensitive; + * bool case_preserving; + * }; + * + * struct PATHCONF3resfail { + * post_op_attr obj_attributes; + * }; + * + * union PATHCONF3res switch (nfsstat3 status) { + * case NFS3_OK: + * PATHCONF3resok resok; + * default: + * PATHCONF3resfail resfail; + * }; + */ +static int decode_pathconf3resok(struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 6); + if (unlikely(p == NULL)) + goto out_overflow; + result->max_link = be32_to_cpup(p++); + result->max_namelen = be32_to_cpup(p); + /* ignore remaining fields */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, __be32 *p, + struct nfs_pathconf *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(&xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_pathconf3resok(&xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + /* * Decode COMMIT reply */ @@ -1599,6 +2838,48 @@ nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) return 0; } +/* + * 3.3.21 COMMIT3res + * + * struct COMMIT3resok { + * wcc_data file_wcc; + * writeverf3 verf; + * }; + * + * struct COMMIT3resfail { + * wcc_data file_wcc; + * }; + * + * union COMMIT3res switch (nfsstat3 status) { + * case NFS3_OK: + * COMMIT3resok resok; + * default: + * COMMIT3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, __be32 *p, + struct nfs_writeres *result) +{ + struct 
xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(&xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_writeverf3(&xdr, result->verf->verifier); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + #ifdef CONFIG_NFS_V3_ACL /* * Decode GETACL reply @@ -1632,6 +2913,70 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, return (err > 0) ? 0 : err; } +static inline int decode_getacl3resok(struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + struct posix_acl **acl; + unsigned int *aclcnt; + size_t hdrlen; + int error; + + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_uint32(xdr, &result->mask); + if (unlikely(error)) + goto out; + error = -EINVAL; + if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + goto out; + + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + + acl = NULL; + if (result->mask & NFS_ACL) + acl = &result->acl_access; + aclcnt = NULL; + if (result->mask & NFS_ACLCNT) + aclcnt = &result->acl_access_count; + error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); + if (unlikely(error <= 0)) + goto out; + + acl = NULL; + if (result->mask & NFS_DFACL) + acl = &result->acl_default; + aclcnt = NULL; + if (result->mask & NFS_DFACLCNT) + aclcnt = &result->acl_default_count; + error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); + if (unlikely(error <= 0)) + return error; + error = 0; +out: + return error; +} + +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, __be32 *p, + struct nfs3_getaclres *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_getacl3resok(&xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + /* * Decode setacl reply. */ @@ -1645,6 +2990,27 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) xdr_decode_post_op_attr(p, fattr); return 0; } + +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, __be32 *p, + struct nfs_fattr *result) +{ + struct xdr_stream xdr; + enum nfs_stat status; + int error; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + error = decode_nfsstat3(&xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_post_op_attr(&xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + #endif /* CONFIG_NFS_V3_ACL */ #define PROC(proc, argtype, restype, timer) \ diff --git a/include/linux/nfs3.h b/include/linux/nfs3.h index e2ee14fd0083..6ccfe3b641e1 100644 --- a/include/linux/nfs3.h +++ b/include/linux/nfs3.h @@ -13,6 +13,7 @@ #define NFS3_COOKIESIZE 4 #define NFS3_CREATEVERFSIZE 8 #define NFS3_COOKIEVERFSIZE 8 +#define NFS3_WRITEVERFSIZE 8 #define NFS3_FIFO_DEV (-1) #define NFS3MODE_FMT 0170000 #define NFS3MODE_DIR 0040000 -- cgit v1.2.3-71-gd317 From d8367c504e39528a057a5d7a267b6724f7fdb4b8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 14:57:52 +0000 Subject: lockd: Move nlmdbg_cookie2a() to svclock.c Clean up. nlmdbg_cookie2a() is used only in svclock.c. Signed-off-by: Chuck Lever Tested-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 30 ++++++++++++++++++++++++++++++ fs/lockd/xdr.c | 29 ----------------------------- include/linux/lockd/debug.h | 10 ---------- 3 files changed, 30 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index ef5659b211e9..9266c4600208 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -46,6 +46,7 @@ static void nlmsvc_remove_block(struct nlm_block *block); static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); static void nlmsvc_freegrantargs(struct nlm_rqst *call); static const struct rpc_call_ops nlmsvc_grant_ops; +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie); /* * The list of blocked locks to retry @@ -934,3 +935,32 @@ nlmsvc_retry_blocked(void) return timeout; } + +#ifdef RPC_DEBUG +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + /* + * We can get away with a static buffer because we're only + * called with BKL held. + */ + static char buf[2*NLM_MAXCOOKIELEN+1]; + unsigned int i, len = sizeof(buf); + char *p = buf; + + len--; /* allow for trailing \0 */ + if (len < 3) + return "???"; + for (i = 0 ; i < cookie->len ; i++) { + if (len < 2) { + strcpy(p-3, "..."); + break; + } + sprintf(p, "%02x", cookie->data[i]); + p += 2; + len -= 2; + } + *p = '\0'; + + return buf; +} +#endif diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 0eb694dc497b..964666c68a86 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -341,32 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) { return xdr_ressize_check(rqstp, p); } - -#ifdef RPC_DEBUG -const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) -{ - /* - * We can get away with a static buffer because we're only - * called with BKL held. - */ - static char buf[2*NLM_MAXCOOKIELEN+1]; - unsigned int i, len = sizeof(buf); - char *p = buf; - - len--; /* allow for trailing \0 */ - if (len < 3) - return "???"; - for (i = 0 ; i < cookie->len ; i++) { - if (len < 2) { - strcpy(p-3, "..."); - break; - } - sprintf(p, "%02x", cookie->data[i]); - p += 2; - len -= 2; - } - *p = '\0'; - - return buf; -} -#endif diff --git a/include/linux/lockd/debug.h b/include/linux/lockd/debug.h index 34b2b7f33c3b..257d3779f2ab 100644 --- a/include/linux/lockd/debug.h +++ b/include/linux/lockd/debug.h @@ -44,14 +44,4 @@ #define NLMDBG_XDR 0x0100 #define NLMDBG_ALL 0x7fff - -/* - * Support for printing NLM cookies in dprintk() - */ -#ifdef RPC_DEBUG -struct nlm_cookie; -/* Call this function with the BKL held (it uses a static buffer) */ -extern const char *nlmdbg_cookie2a(const struct nlm_cookie *); -#endif - #endif /* LINUX_LOCKD_DEBUG_H */ -- cgit v1.2.3-71-gd317 From 573c4e1ef53a6b891b73cc2257e1604da754a2e4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 14:58:11 +0000 Subject: NFS: Simplify ->decode_dirent() calling sequence Clean up. The pointer returned by ->decode_dirent() is no longer used as a pointer. The only call site (xdr_decode() in fs/nfs/dir.c) simply extracts the errno value encoded in the pointer. Replace the returned pointer with a standard integer errno return value. Also, pass the "server" argument as part of the nfs_entry instead of as a separate parameter. It's faster to derive "server" in nfs_readdir_xdr_to_array() since we already have the directory's inode handy. "server" ought to be invariant for a set of entries in the same directory, right? 
The legacy versions of decode_dirent() don't use "server" anyway, so it's wasted work for them to derive and pass "server" for each entry. Signed-off-by: Chuck Lever Tested-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 15 ++++++++------- fs/nfs/internal.h | 9 ++++++--- fs/nfs/nfs2xdr.c | 18 +++++++++--------- fs/nfs/nfs3xdr.c | 28 ++++++++++++++-------------- fs/nfs/nfs4_fs.h | 1 - fs/nfs/nfs4xdr.c | 29 ++++++++++++++++++++++------- include/linux/nfs_xdr.h | 3 ++- 7 files changed, 61 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 996dd8989a91..3e2123fe79f5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -172,7 +172,7 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; @@ -378,14 +378,14 @@ error: return error; } -/* Fill in an entry based on the xdr code stored in desc->page */ -static -int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream) +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) { - __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); - if (IS_ERR(p)) - return PTR_ERR(p); + int error; + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; entry->fattr->time_start = desc->timestamp; entry->fattr->gencount = desc->gencount; return 0; @@ -566,6 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); if (entry.fh == NULL || entry.fattr == NULL) goto out; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6c6a9955bae9..435eae3666bd 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -187,15 +187,18 @@ extern void nfs_destroy_directcache(void); /* nfs2xdr.c */ extern int nfs_stat_to_errno(enum nfs_stat); extern struct rpc_procinfo nfs_procedures[]; -extern __be32 *nfs2_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs3xdr.c */ extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 0343175fe6c0..a9b848edbd2e 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -936,10 +936,10 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, __be32 *p, * the local page cache. * @xdr: XDR stream where entry resides * @entry: buffer to fill in with entry data - * @server: nfs_server data for this directory * @plus: boolean indicating whether this should be a readdirplus entry * - * Returns the position of the next item in the buffer, or an ERR_PTR. 
+ * Returns zero if successful, otherwise a negative errno value is + * returned. * * This function is not invoked during READDIR reply decoding, but * rather whenever an application invokes the getdents(2) system call @@ -954,8 +954,8 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, __be32 *p, * entry *nextentry; * }; */ -__be32 *nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, - struct nfs_server *server, int plus) +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) { __be32 *p; int error; @@ -968,9 +968,9 @@ __be32 *nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(p == NULL)) goto out_overflow; if (*p++ == xdr_zero) - return ERR_PTR(-EAGAIN); + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } p = xdr_inline_decode(xdr, 4); @@ -980,7 +980,7 @@ __be32 *nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return ERR_PTR(error); + return error; /* * The type (size and byte order) of nfscookie isn't defined in @@ -999,11 +999,11 @@ __be32 *nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->eof = 0; if (p != NULL) entry->eof = (p[0] == xdr_zero) && (p[1] != xdr_zero); - return p; + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index c97d00fe849a..15c93ccd90c5 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1970,10 +1970,10 @@ out_status: * the local page cache * @xdr: XDR stream where entry resides * @entry: buffer to fill in with entry data - * @server: nfs_server data for this directory * @plus: boolean indicating whether this should be a readdirplus entry * - * Returns the position of the next item in the buffer, or an ERR_PTR. + * Returns zero if successful, otherwise a negative errno value is + * returned. 
* * This function is not invoked during READDIR reply decoding, but * rather whenever an application invokes the getdents(2) system call @@ -2000,8 +2000,8 @@ out_status: * entryplus3 *nextentry; * }; */ -__be32 *nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, - struct nfs_server *server, int plus) +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) { struct nfs_entry old = *entry; __be32 *p; @@ -2015,23 +2015,23 @@ __be32 *nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(p == NULL)) goto out_overflow; if (*p == xdr_zero) - return ERR_PTR(-EAGAIN); + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } error = decode_fileid3(xdr, &entry->ino); if (unlikely(error)) - return ERR_PTR(error); + return error; error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return ERR_PTR(error); + return error; entry->prev_cookie = entry->cookie; error = decode_cookie3(xdr, &entry->cookie); if (unlikely(error)) - return ERR_PTR(error); + return error; entry->d_type = DT_UNKNOWN; @@ -2039,7 +2039,7 @@ __be32 *nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; error = decode_post_op_attr(xdr, entry->fattr); if (unlikely(error)) - return ERR_PTR(error); + return error; if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); @@ -2052,7 +2052,7 @@ __be32 *nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(error)) { if (error == -E2BIG) goto out_truncated; - return ERR_PTR(error); + return error; } } else zero_nfs_fh3(entry->fh); @@ -2063,15 +2063,15 @@ __be32 *nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->eof = 0; if (p != NULL) entry->eof = (p[0] == xdr_zero) && (p[1] != xdr_zero); - return p; + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EAGAIN); + return -EAGAIN; out_truncated: dprintk("NFS: directory entry contains invalid file handle\n"); *entry = old; - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9fa496387fdf..7a6eecffcaeb 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -331,7 +331,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); extern struct rpc_procinfo nfs4_procedures[]; struct nfs4_mount_data; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 868815c55450..be9f00ab0d18 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6159,8 +6159,22 @@ out: } #endif /* CONFIG_NFS_V4_1 */ -__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, - struct nfs_server *server, int plus) +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. 
+ */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) { uint32_t bitmap[2] = {0}; uint32_t len; @@ -6172,9 +6186,9 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(!p)) goto out_overflow; if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } p = xdr_inline_decode(xdr, 12); @@ -6203,7 +6217,8 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (decode_attr_length(xdr, &len, &p) < 0) goto out_overflow; - if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, + entry->server, 1) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; @@ -6221,11 +6236,11 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, else entry->eof = 0; - return p; + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 80f07198a31a..236e7e4b99a0 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -484,6 +484,7 @@ struct nfs_entry { struct nfs_fh * fh; struct nfs_fattr * fattr; unsigned char d_type; + struct nfs_server * server; }; /* @@ -1089,7 +1090,7 @@ struct nfs_rpc_ops { int (*pathconf) (struct nfs_server *, struct nfs_fh *, struct nfs_pathconf *); int (*set_capabilities)(struct nfs_server *, struct nfs_fh *); - __be32 *(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int plus); + int (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int); void (*read_setup) (struct nfs_read_data *, struct rpc_message *); int (*read_done) (struct rpc_task *, struct nfs_read_data *); void (*write_setup) (struct nfs_write_data *, struct rpc_message *); -- cgit v1.2.3-71-gd317 From 9f06c719f474be7003763284a990bed6377bb0d4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 14:59:18 +0000 Subject: SUNRPC: New xdr_streams XDR encoder API Now that all client-side XDR encoder routines use xdr_streams, there should be no need to support the legacy calling sequence [rpc_rqst *, __be32 *, RPC arg *] anywhere. We can construct an xdr_stream in the generic RPC code, instead of in each encoder function. Also, all the client-side encoder functions return 0 now, making a return value superfluous. Take this opportunity to convert them to return void instead. This is a refactoring change. It should not cause different behavior. Signed-off-by: Chuck Lever Tested-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/clnt4xdr.c | 92 +++--- fs/lockd/clntxdr.c | 92 +++--- fs/lockd/mon.c | 26 +- fs/nfs/mount_clnt.c | 18 +- fs/nfs/nfs2xdr.c | 147 ++++----- fs/nfs/nfs3xdr.c | 241 ++++++--------- fs/nfs/nfs4xdr.c | 663 ++++++++++++++++++----------------------- fs/nfsd/nfs4callback.c | 22 +- include/linux/sunrpc/auth.h | 4 +- include/linux/sunrpc/clnt.h | 2 +- include/linux/sunrpc/xdr.h | 9 +- net/sunrpc/auth.c | 14 +- net/sunrpc/auth_gss/auth_gss.c | 31 +- net/sunrpc/clnt.c | 5 +- net/sunrpc/rpcb_clnt.c | 47 ++- 15 files changed, 606 insertions(+), 807 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 1a1c3e21ed2c..974f1d9cd323 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -385,17 +385,15 @@ static void encode_nlm4_lock(struct xdr_stream *xdr, * struct nlm4_lock alock; * }; */ -static int nlm4_xdr_enc_testargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_bool(&xdr, lock->fl.fl_type == F_WRLCK); - encode_nlm4_lock(&xdr, lock); - return 0; + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); } /* @@ -408,20 +406,18 @@ static int nlm4_xdr_enc_testargs(struct rpc_rqst *req, __be32 *p, * int state; * }; */ -static int nlm4_xdr_enc_lockargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_bool(&xdr, args->block); - encode_bool(&xdr, lock->fl.fl_type == F_WRLCK); - encode_nlm4_lock(&xdr, lock); - encode_bool(&xdr, args->reclaim); - encode_int32(&xdr, args->state); - return 0; + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); } /* @@ -432,18 +428,16 @@ static int nlm4_xdr_enc_lockargs(struct rpc_rqst *req, __be32 *p, * struct nlm4_lock alock; * }; */ -static int nlm4_xdr_enc_cancargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_bool(&xdr, args->block); - encode_bool(&xdr, lock->fl.fl_type == F_WRLCK); - encode_nlm4_lock(&xdr, lock); - return 0; + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); } /* @@ -452,16 +446,14 @@ static int nlm4_xdr_enc_cancargs(struct rpc_rqst *req, __be32 *p, * struct nlm4_lock alock; * }; */ -static int nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; 
- xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_nlm4_lock(&xdr, lock); - return 0; + encode_cookie(xdr, &args->cookie); + encode_nlm4_lock(xdr, lock); } /* @@ -470,15 +462,12 @@ static int nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, __be32 *p, * nlm4_stat stat; * }; */ -static int nlm4_xdr_enc_res(struct rpc_rqst *req, __be32 *p, - const struct nlm_res *result) +static void nlm4_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &result->cookie); - encode_nlm4_stat(&xdr, result->status); - return 0; + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); } /* @@ -494,17 +483,14 @@ static int nlm4_xdr_enc_res(struct rpc_rqst *req, __be32 *p, * nlm4_testrply test_stat; * }; */ -static int nlm4_xdr_enc_testres(struct rpc_rqst *req, __be32 *p, - const struct nlm_res *result) +static void nlm4_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &result->cookie); - encode_nlm4_stat(&xdr, result->status); + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); if (result->status == nlm_lck_denied) - encode_nlm4_holder(&xdr, result); - return 0; + encode_nlm4_holder(xdr, result); } @@ -588,7 +574,7 @@ out: #define PROC(proc, argtype, restype) \ [NLMPROC_##proc] = { \ .p_proc = NLMPROC_##proc, \ - .p_encode = (kxdrproc_t)nlm4_xdr_enc_##argtype, \ + .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \ .p_decode = (kxdrproc_t)nlm4_xdr_dec_##restype, \ .p_arglen = NLM4_##argtype##_sz, \ .p_replen = NLM4_##restype##_sz, \ diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 0472f2aff509..c6fda8fb1c5b 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -378,17 +378,15 @@ static void encode_nlm_lock(struct xdr_stream *xdr, * struct nlm_lock alock; * }; */ -static int nlm_xdr_enc_testargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_bool(&xdr, lock->fl.fl_type == F_WRLCK); - encode_nlm_lock(&xdr, lock); - return 0; + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); } /* @@ -401,20 +399,18 @@ static int nlm_xdr_enc_testargs(struct rpc_rqst *req, __be32 *p, * int state; * }; */ -static int nlm_xdr_enc_lockargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_bool(&xdr, args->block); - encode_bool(&xdr, lock->fl.fl_type == F_WRLCK); - encode_nlm_lock(&xdr, lock); - encode_bool(&xdr, args->reclaim); - encode_int32(&xdr, args->state); - return 0; + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); } /* @@ -425,18 +421,16 @@ static 
int nlm_xdr_enc_lockargs(struct rpc_rqst *req, __be32 *p, * struct nlm_lock alock; * }; */ -static int nlm_xdr_enc_cancargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_bool(&xdr, args->block); - encode_bool(&xdr, lock->fl.fl_type == F_WRLCK); - encode_nlm_lock(&xdr, lock); - return 0; + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); } /* @@ -445,16 +439,14 @@ static int nlm_xdr_enc_cancargs(struct rpc_rqst *req, __be32 *p, * struct nlm_lock alock; * }; */ -static int nlm_xdr_enc_unlockargs(struct rpc_rqst *req, __be32 *p, - const struct nlm_args *args) +static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) { const struct nlm_lock *lock = &args->lock; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &args->cookie); - encode_nlm_lock(&xdr, lock); - return 0; + encode_cookie(xdr, &args->cookie); + encode_nlm_lock(xdr, lock); } /* @@ -463,15 +455,12 @@ static int nlm_xdr_enc_unlockargs(struct rpc_rqst *req, __be32 *p, * nlm_stat stat; * }; */ -static int nlm_xdr_enc_res(struct rpc_rqst *req, __be32 *p, - const struct nlm_res *result) +static void nlm_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &result->cookie); - encode_nlm_stat(&xdr, result->status); - return 0; + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); } /* @@ -494,16 +483,13 @@ static void encode_nlm_testrply(struct xdr_stream *xdr, encode_nlm_holder(xdr, result); } -static int nlm_xdr_enc_testres(struct rpc_rqst *req, __be32 *p, - const struct nlm_res *result) +static void nlm_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cookie(&xdr, &result->cookie); - encode_nlm_stat(&xdr, result->status); - encode_nlm_testrply(&xdr, result); - return 0; + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); + encode_nlm_testrply(xdr, result); } @@ -586,7 +572,7 @@ out: #define PROC(proc, argtype, restype) \ [NLMPROC_##proc] = { \ .p_proc = NLMPROC_##proc, \ - .p_encode = (kxdrproc_t)nlm_xdr_enc_##argtype, \ + .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \ .p_decode = (kxdrproc_t)nlm_xdr_dec_##restype, \ .p_arglen = NLM_##argtype##_sz, \ .p_replen = NLM_##restype##_sz, \ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index d812818d0258..baa77bc9d825 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -459,25 +459,17 @@ static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); } -static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, - const struct nsm_args *argp) +static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_mon_id(&xdr, argp); - encode_priv(&xdr, argp); - return 0; + encode_mon_id(xdr, argp); + 
encode_priv(xdr, argp); } -static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, - const struct nsm_args *argp) +static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_mon_id(&xdr, argp); - return 0; + encode_mon_id(xdr, argp); } static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, @@ -524,7 +516,7 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, static struct rpc_procinfo nsm_procedures[] = { [NSMPROC_MON] = { .p_proc = NSMPROC_MON, - .p_encode = (kxdrproc_t)xdr_enc_mon, + .p_encode = (kxdreproc_t)nsm_xdr_enc_mon, .p_decode = (kxdrproc_t)xdr_dec_stat_res, .p_arglen = SM_mon_sz, .p_replen = SM_monres_sz, @@ -533,7 +525,7 @@ static struct rpc_procinfo nsm_procedures[] = { }, [NSMPROC_UNMON] = { .p_proc = NSMPROC_UNMON, - .p_encode = (kxdrproc_t)xdr_enc_unmon, + .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon, .p_decode = (kxdrproc_t)xdr_dec_stat, .p_arglen = SM_mon_id_sz, .p_replen = SM_unmonres_sz, diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 97c3ec793305..979ebd7af3cb 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -288,14 +288,10 @@ static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) xdr_encode_opaque(p, pathname, pathname_len); } -static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p, - const char *dirpath) +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_mntdirpath(&xdr, dirpath); - return 0; + encode_mntdirpath(xdr, dirpath); } /* @@ -460,7 +456,7 @@ static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, static struct rpc_procinfo mnt_procedures[] = { [MOUNTPROC_MNT] = { .p_proc = MOUNTPROC_MNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_decode = (kxdrproc_t)mnt_dec_mountres, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres_sz, @@ -469,7 +465,7 @@ static struct rpc_procinfo mnt_procedures[] = { }, [MOUNTPROC_UMNT] = { .p_proc = MOUNTPROC_UMNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_arglen = MNT_enc_dirpath_sz, .p_statidx = MOUNTPROC_UMNT, .p_name = "UMOUNT", @@ -479,7 +475,7 @@ static struct rpc_procinfo mnt_procedures[] = { static struct rpc_procinfo mnt3_procedures[] = { [MOUNTPROC3_MNT] = { .p_proc = MOUNTPROC3_MNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_decode = (kxdrproc_t)mnt_dec_mountres3, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres3_sz, @@ -488,7 +484,7 @@ static struct rpc_procinfo mnt3_procedures[] = { }, [MOUNTPROC3_UMNT] = { .p_proc = MOUNTPROC3_UMNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_arglen = MNT_enc_dirpath_sz, .p_statidx = MOUNTPROC3_UMNT, .p_name = "UMOUNT", diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index a9b848edbd2e..8f3acbec761f 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -558,14 +558,11 @@ out_default: * "NFS: Network File System Protocol Specification". 
*/ -static int nfs2_xdr_enc_fhandle(struct rpc_rqst *req, __be32 *p, - const struct nfs_fh *fh) +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_fhandle(&xdr, fh); - return 0; + encode_fhandle(xdr, fh); } /* @@ -576,37 +573,28 @@ static int nfs2_xdr_enc_fhandle(struct rpc_rqst *req, __be32 *p, * sattr attributes; * }; */ -static int nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_sattrargs *args) +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_fhandle(&xdr, args->fh); - encode_sattr(&xdr, args->sattr); - return 0; + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); } -static int nfs2_xdr_enc_diropargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_diropargs *args) +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs(&xdr, args->fh, args->name, args->len); - return 0; + encode_diropargs(xdr, args->fh, args->name, args->len); } -static int nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_readlinkargs *args) +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_fhandle(&xdr, args->fh); + encode_fhandle(xdr, args->fh); prepare_reply_buffer(req, args->pages, args->pgbase, args->pglen, NFS_readlinkres_sz); - return 0; } /* @@ -634,17 +622,14 @@ static void encode_readargs(struct xdr_stream *xdr, *p = cpu_to_be32(count); } -static int nfs2_xdr_enc_readargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_readargs *args) +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_readargs(&xdr, args); + encode_readargs(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, args->count, NFS_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; - return 0; } /* @@ -677,15 +662,12 @@ static void encode_writeargs(struct xdr_stream *xdr, xdr_write_pages(xdr, args->pages, args->pgbase, count); } -static int nfs2_xdr_enc_writeargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_writeargs *args) +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_writeargs(&xdr, args); - xdr.buf->flags |= XDRBUF_WRITE; - return 0; + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* @@ -696,25 +678,19 @@ static int nfs2_xdr_enc_writeargs(struct rpc_rqst *req, __be32 *p, * sattr attributes; * }; */ -static int nfs2_xdr_enc_createargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_createargs *args) +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs(&xdr, args->fh, args->name, args->len); - encode_sattr(&xdr, args->sattr); - return 0; + encode_diropargs(xdr, 
args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); } -static int nfs2_xdr_enc_removeargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_removeargs *args) +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs(&xdr, args->fh, args->name.name, args->name.len); - return 0; + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); } /* @@ -725,17 +701,15 @@ static int nfs2_xdr_enc_removeargs(struct rpc_rqst *req, __be32 *p, * diropargs to; * }; */ -static int nfs2_xdr_enc_renameargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_renameargs *args) +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) { const struct qstr *old = args->old_name; const struct qstr *new = args->new_name; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs(&xdr, args->old_dir, old->name, old->len); - encode_diropargs(&xdr, args->new_dir, new->name, new->len); - return 0; + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); } /* @@ -746,15 +720,12 @@ static int nfs2_xdr_enc_renameargs(struct rpc_rqst *req, __be32 *p, * diropargs to; * }; */ -static int nfs2_xdr_enc_linkargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_linkargs *args) +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_fhandle(&xdr, args->fromfh); - encode_diropargs(&xdr, args->tofh, args->toname, args->tolen); - return 0; + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); } /* @@ -766,16 +737,13 @@ static int nfs2_xdr_enc_linkargs(struct rpc_rqst *req, __be32 *p, * sattr attributes; * }; */ -static int nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_symlinkargs *args) +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs(&xdr, args->fromfh, args->fromname, args->fromlen); - encode_path(&xdr, args->pages, args->pathlen); - encode_sattr(&xdr, args->sattr); - return 0; + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); } /* @@ -799,16 +767,13 @@ static void encode_readdirargs(struct xdr_stream *xdr, *p = cpu_to_be32(args->count); } -static int nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, __be32 *p, - const struct nfs_readdirargs *args) +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_readdirargs(&xdr, args); + encode_readdirargs(xdr, args); prepare_reply_buffer(req, args->pages, 0, args->count, NFS_readdirres_sz); - return 0; } /* @@ -1184,7 +1149,7 @@ int nfs_stat_to_errno(enum nfs_stat status) #define PROC(proc, argtype, restype, timer) \ [NFSPROC_##proc] = { \ .p_proc = NFSPROC_##proc, \ - .p_encode = (kxdrproc_t)nfs2_xdr_enc_##argtype, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ .p_decode = 
(kxdrproc_t)nfs2_xdr_dec_##restype, \ .p_arglen = NFS_##argtype##_sz, \ .p_replen = NFS_##restype##_sz, \ diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 15c93ccd90c5..ae1b1a43f05e 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -835,14 +835,11 @@ static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, * nfs_fh3 object; * }; */ -static int nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, __be32 *p, - const struct nfs_fh *fh) +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_nfs_fh3(&xdr, fh); - return 0; + encode_nfs_fh3(xdr, fh); } /* @@ -876,16 +873,13 @@ static void encode_sattrguard3(struct xdr_stream *xdr, } } -static int nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_sattrargs *args) +static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_nfs_fh3(&xdr, args->fh); - encode_sattr3(&xdr, args->sattr); - encode_sattrguard3(&xdr, args); - return 0; + encode_nfs_fh3(xdr, args->fh); + encode_sattr3(xdr, args->sattr); + encode_sattrguard3(xdr, args); } /* @@ -895,14 +889,11 @@ static int nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, __be32 *p, * diropargs3 what; * }; */ -static int nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_diropargs *args) +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_diropargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs3(&xdr, args->fh, args->name, args->len); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); } /* @@ -920,14 +911,11 @@ static void encode_access3args(struct xdr_stream *xdr, encode_uint32(xdr, args->access); } -static int nfs3_xdr_enc_access3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_accessargs *args) +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_accessargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_access3args(&xdr, args); - return 0; + encode_access3args(xdr, args); } /* @@ -937,16 +925,13 @@ static int nfs3_xdr_enc_access3args(struct rpc_rqst *req, __be32 *p, * nfs_fh3 symlink; * }; */ -static int nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_readlinkargs *args) +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readlinkargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_nfs_fh3(&xdr, args->fh); + encode_nfs_fh3(xdr, args->fh); prepare_reply_buffer(req, args->pages, args->pgbase, args->pglen, NFS3_readlinkres_sz); - return 0; } /* @@ -970,17 +955,14 @@ static void encode_read3args(struct xdr_stream *xdr, *p = cpu_to_be32(args->count); } -static int nfs3_xdr_enc_read3args(struct rpc_rqst *req, __be32 *p, - const struct nfs_readargs *args) +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_read3args(&xdr, args); + encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, args->count, 
NFS3_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; - return 0; } /* @@ -1015,15 +997,12 @@ static void encode_write3args(struct xdr_stream *xdr, xdr_write_pages(xdr, args->pages, args->pgbase, args->count); } -static int nfs3_xdr_enc_write3args(struct rpc_rqst *req, __be32 *p, - const struct nfs_writeargs *args) +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_write3args(&xdr, args); - xdr.buf->flags |= XDRBUF_WRITE; - return 0; + encode_write3args(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* @@ -1065,15 +1044,12 @@ static void encode_createhow3(struct xdr_stream *xdr, } } -static int nfs3_xdr_enc_create3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_createargs *args) +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_createargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs3(&xdr, args->fh, args->name, args->len); - encode_createhow3(&xdr, args); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_createhow3(xdr, args); } /* @@ -1084,15 +1060,12 @@ static int nfs3_xdr_enc_create3args(struct rpc_rqst *req, __be32 *p, * sattr3 attributes; * }; */ -static int nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_mkdirargs *args) +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mkdirargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs3(&xdr, args->fh, args->name, args->len); - encode_sattr3(&xdr, args->sattr); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_sattr3(xdr, args->sattr); } /* @@ -1115,15 +1088,12 @@ static void encode_symlinkdata3(struct xdr_stream *xdr, encode_nfspath3(xdr, args->pages, args->pathlen); } -static int nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_symlinkargs *args) +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs3(&xdr, args->fromfh, args->fromname, args->fromlen); - encode_symlinkdata3(&xdr, args); - return 0; + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(xdr, args); } /* @@ -1178,15 +1148,12 @@ static void encode_mknoddata3(struct xdr_stream *xdr, } } -static int nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_mknodargs *args) +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs3(&xdr, args->fh, args->name, args->len); - encode_mknoddata3(&xdr, args); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_mknoddata3(xdr, args); } /* @@ -1196,14 +1163,11 @@ static int nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, __be32 *p, * diropargs3 object; * }; */ -static int nfs3_xdr_enc_remove3args(struct rpc_rqst *req, __be32 *p, - const struct nfs_removeargs *args) +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, 
&req->rq_snd_buf, p); - encode_diropargs3(&xdr, args->fh, args->name.name, args->name.len); - return 0; + encode_diropargs3(xdr, args->fh, args->name.name, args->name.len); } /* @@ -1214,17 +1178,15 @@ static int nfs3_xdr_enc_remove3args(struct rpc_rqst *req, __be32 *p, * diropargs3 to; * }; */ -static int nfs3_xdr_enc_rename3args(struct rpc_rqst *req, __be32 *p, - const struct nfs_renameargs *args) +static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) { const struct qstr *old = args->old_name; const struct qstr *new = args->new_name; - struct xdr_stream xdr; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_diropargs3(&xdr, args->old_dir, old->name, old->len); - encode_diropargs3(&xdr, args->new_dir, new->name, new->len); - return 0; + encode_diropargs3(xdr, args->old_dir, old->name, old->len); + encode_diropargs3(xdr, args->new_dir, new->name, new->len); } /* @@ -1235,15 +1197,12 @@ static int nfs3_xdr_enc_rename3args(struct rpc_rqst *req, __be32 *p, * diropargs3 link; * }; */ -static int nfs3_xdr_enc_link3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_linkargs *args) +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_linkargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_nfs_fh3(&xdr, args->fromfh); - encode_diropargs3(&xdr, args->tofh, args->toname, args->tolen); - return 0; + encode_nfs_fh3(xdr, args->fromfh); + encode_diropargs3(xdr, args->tofh, args->toname, args->tolen); } /* @@ -1269,16 +1228,13 @@ static void encode_readdir3args(struct xdr_stream *xdr, *p = cpu_to_be32(args->count); } -static int nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_readdirargs *args) +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_readdir3args(&xdr, args); + encode_readdir3args(xdr, args); prepare_reply_buffer(req, args->pages, 0, args->count, NFS3_readdirres_sz); - return 0; } /* @@ -1312,16 +1268,13 @@ static void encode_readdirplus3args(struct xdr_stream *xdr, *p = cpu_to_be32(args->count); } -static int nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_readdirargs *args) +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_readdirplus3args(&xdr, args); + encode_readdirplus3args(xdr, args); prepare_reply_buffer(req, args->pages, 0, args->count, NFS3_readdirres_sz); - return 0; } /* @@ -1345,57 +1298,49 @@ static void encode_commit3args(struct xdr_stream *xdr, *p = cpu_to_be32(args->count); } -static int nfs3_xdr_enc_commit3args(struct rpc_rqst *req, __be32 *p, - const struct nfs_writeargs *args) +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_commit3args(&xdr, args); - return 0; + encode_commit3args(xdr, args); } #ifdef CONFIG_NFS_V3_ACL -static int nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_getaclargs *args) +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_getaclargs *args) { - struct 
xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_nfs_fh3(&xdr, args->fh); - encode_uint32(&xdr, args->mask); + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->mask); if (args->mask & (NFS_ACL | NFS_DFACL)) prepare_reply_buffer(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, ACL3_getaclres_sz); - return 0; } -static int nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, __be32 *p, - const struct nfs3_setaclargs *args) +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_setaclargs *args) { - struct xdr_stream xdr; unsigned int base; int error; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_nfs_fh3(&xdr, NFS_FH(args->inode)); - encode_uint32(&xdr, args->mask); + encode_nfs_fh3(xdr, NFS_FH(args->inode)); + encode_uint32(xdr, args->mask); if (args->npages != 0) - xdr_write_pages(&xdr, args->pages, 0, args->len); + xdr_write_pages(xdr, args->pages, 0, args->len); base = req->rq_slen; - error = nfsacl_encode(xdr.buf, base, args->inode, + error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? args->acl_access : NULL, 1, 0); BUG_ON(error < 0); - error = nfsacl_encode(xdr.buf, base + error, args->inode, + error = nfsacl_encode(xdr->buf, base + error, args->inode, (args->mask & NFS_DFACL) ? args->acl_default : NULL, 1, NFS_ACL_DEFAULT); BUG_ON(error < 0); - return 0; } #endif /* CONFIG_NFS_V3_ACL */ @@ -2506,7 +2451,7 @@ out_default: #define PROC(proc, argtype, restype, timer) \ [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ - .p_encode = (kxdrproc_t)nfs3_xdr_enc_##argtype##3args, \ + .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ .p_decode = (kxdrproc_t)nfs3_xdr_dec_##restype##3res, \ .p_arglen = NFS3_##argtype##args_sz, \ .p_replen = NFS3_##restype##res_sz, \ @@ -2549,7 +2494,7 @@ struct rpc_version nfs_version3 = { static struct rpc_procinfo nfs3_acl_procedures[] = { [ACLPROC3_GETACL] = { .p_proc = ACLPROC3_GETACL, - .p_encode = (kxdrproc_t)nfs3_xdr_enc_getacl3args, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, .p_decode = (kxdrproc_t)nfs3_xdr_dec_getacl3res, .p_arglen = ACL3_getaclargs_sz, .p_replen = ACL3_getaclres_sz, @@ -2558,7 +2503,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { }, [ACLPROC3_SETACL] = { .p_proc = ACLPROC3_SETACL, - .p_encode = (kxdrproc_t)nfs3_xdr_enc_setacl3args, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, .p_decode = (kxdrproc_t)nfs3_xdr_dec_setacl3res, .p_arglen = ACL3_setaclargs_sz, .p_replen = ACL3_setaclres_sz, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a15fe99fea86..6ec38b3e4a3d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1510,7 +1510,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_restorefh_maxsz; } -static int +static void encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) { __be32 *p; @@ -1521,14 +1521,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); - if (arg->acl_len % 4) - return -EINVAL; + BUG_ON(arg->acl_len % 4); p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; - return 0; } static void @@ -1833,393 +1831,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) /* * Encode an ACCESS request */ -static int 
nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_accessargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_access(&xdr, args->access, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_access(xdr, args->access, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LOOKUP request */ -static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_lookup_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_lookup(&xdr, args->name, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LOOKUP_ROOT request */ -static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_lookup_root_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putrootfh(&xdr, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode REMOVE request */ -static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_removeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_remove(&xdr, &args->name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_remove(xdr, &args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode RENAME request */ -static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 
*p, const struct nfs_renameargs *args) +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->old_dir, &hdr); - encode_savefh(&xdr, &hdr); - encode_putfh(&xdr, args->new_dir, &hdr); - encode_rename(&xdr, args->old_name, args->new_name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->old_dir, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->new_dir, &hdr); + encode_rename(xdr, args->old_name, args->new_name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LINK request */ -static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_link_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_link(&xdr, args->name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_link(xdr, args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode CREATE request */ -static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_create(&xdr, args, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_create(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode SYMLINK request */ -static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_symlink(struct 
rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - return nfs4_xdr_enc_create(req, p, args); + nfs4_xdr_enc_create(req, xdr, args); } /* * Encode GETATTR request */ -static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_getattr_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a CLOSE request */ -static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_closeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_close(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_close(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN request */ -static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_openargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_open(&xdr, args, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_open(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN_CONFIRM request */ -static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args) +static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_open_confirmargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_open_confirm(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_confirm(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN request 
with no attributes. */ -static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_openargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_open(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN_DOWNGRADE request */ -static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_closeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_open_downgrade(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_downgrade(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a LOCK request */ -static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args) +static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lock_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_lock(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lock(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a LOCKT request */ -static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args) +static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lockt_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_lockt(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lockt(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a LOCKU request */ -static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args) +static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_locku_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - 
xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_locku(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_locku(xdr, args, &hdr); encode_nops(&hdr); - return 0; } -static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) +static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_release_lockowner_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_release_lockowner(&xdr, &args->lock_owner, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_release_lockowner(xdr, &args->lock_owner, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a READLINK request */ -static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args) +static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readlink *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_readlink(&xdr, args, req, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readlink(xdr, args, req, &hdr); xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->pglen); encode_nops(&hdr); - return 0; } /* * Encode a READDIR request */ -static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args) +static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readdir_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_readdir(&xdr, args, req, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readdir(xdr, args, req, &hdr); xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count); @@ -2227,428 +2194,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf __func__, hdr.replen << 2, args->pages, args->pgbase, args->count); encode_nops(&hdr); - return 0; } /* * Encode a READ request */ -static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_read(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + 
encode_putfh(xdr, args->fh, &hdr); + encode_read(xdr, args, &hdr); xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); - return 0; } /* * Encode an SETATTR request */ -static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) +static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setattrargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_setattr(&xdr, args, args->server, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setattr(xdr, args, args->server, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a GETACL request */ -static int -nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, - struct nfs_getaclargs *args) +static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_getaclargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; uint32_t replen; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; - encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); + encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); encode_nops(&hdr); - return 0; } /* * Encode a WRITE request */ -static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_write(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_write(xdr, args, &hdr); req->rq_snd_buf.flags |= XDRBUF_WRITE; - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * a COMMIT request */ -static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_commit(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + 
encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_commit(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * FSINFO request */ -static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args) +static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_fsinfo(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_fsinfo(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * a PATHCONF request */ -static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args) +static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_pathconf_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], &hdr); encode_nops(&hdr); - return 0; } /* * a STATFS request */ -static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args) +static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_statfs_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); encode_nops(&hdr); - return 0; } /* * GETATTR_BITMAP request */ -static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, - struct nfs4_server_caps_arg *args) +static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fhandle, &hdr); - encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| FATTR4_WORD0_LINK_SUPPORT| FATTR4_WORD0_SYMLINK_SUPPORT| 
FATTR4_WORD0_ACLSUPPORT, &hdr); encode_nops(&hdr); - return 0; } /* * a RENEW request */ -static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) +static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_client *clp) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_renew(&xdr, clp, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_renew(xdr, clp, &hdr); encode_nops(&hdr); - return 0; } /* * a SETCLIENTID request */ -static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc) +static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid *sc) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_setclientid(&xdr, sc, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid(xdr, sc, &hdr); encode_nops(&hdr); - return 0; } /* * a SETCLIENTID_CONFIRM request */ -static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) +static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *arg) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_setclientid_confirm(&xdr, arg, &hdr); - encode_putrootfh(&xdr, &hdr); - encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); encode_nops(&hdr); - return 0; } /* * DELEGRETURN request */ -static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args) +static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_delegreturnargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fhandle, &hdr); - encode_delegreturn(&xdr, args->stateid, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_delegreturn(xdr, args->stateid, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode FS_LOCATIONS request */ -static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args) +static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; uint32_t replen; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_lookup(&xdr, args->name, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + 
encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); replen = hdr.replen; /* get the attribute into args->page */ - encode_fs_locations(&xdr, args->bitmask, &hdr); + encode_fs_locations(xdr, args->bitmask, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 0, PAGE_SIZE); encode_nops(&hdr); - return 0; } #if defined(CONFIG_NFS_V4_1) /* * EXCHANGE_ID request */ -static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, - struct nfs41_exchange_id_args *args) +static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = args->client->cl_mvops->minor_version, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_exchange_id(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_exchange_id(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * a CREATE_SESSION request */ -static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, - struct nfs41_create_session_args *args) +static void nfs4_xdr_enc_create_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_create_session_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = args->client->cl_mvops->minor_version, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_create_session(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_create_session(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * a DESTROY_SESSION request */ -static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, - struct nfs4_session *session) +static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_session *session) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = session->clp->cl_mvops->minor_version, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_destroy_session(&xdr, session, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_destroy_session(xdr, session, &hdr); encode_nops(&hdr); - return 0; } /* * a SEQUENCE request */ -static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p, - struct nfs4_sequence_args *args) +static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_sequence_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * a GET_LEASE_TIME request */ -static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, - struct nfs4_get_lease_time_args *args) +static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), }; const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->la_seq_args, &hdr); - encode_putrootfh(&xdr, &hdr); - encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_compound_hdr(xdr, req, &hdr); + 
encode_sequence(xdr, &args->la_seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); encode_nops(&hdr); - return 0; } /* * a RECLAIM_COMPLETE request */ -static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, - struct nfs41_reclaim_complete_args *args) +static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args) }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_reclaim_complete(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_reclaim_complete(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode GETDEVICEINFO request */ -static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, - struct nfs4_getdeviceinfo_args *args) +static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_getdeviceinfo(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(xdr, args, &hdr); /* set up reply kvec. Subtract notification bitmap max size (2) * so that notification bitmap is put in xdr_buf tail */ @@ -2657,27 +2583,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, args->pdev->pglen); encode_nops(&hdr); - return 0; } /* * Encode LAYOUTGET request */ -static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, - struct nfs4_layoutget_args *args) +static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutget_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, NFS_FH(args->inode), &hdr); - encode_layoutget(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(xdr, args, &hdr); encode_nops(&hdr); - return 0; } #endif /* CONFIG_NFS_V4_1 */ @@ -5368,22 +5291,18 @@ out: /* * Encode an SETACL request */ -static int -nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) +static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setaclargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - status = encode_setacl(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setacl(xdr, args, &hdr); encode_nops(&hdr); - return status; } /* @@ -6316,7 +6235,7 @@ nfs4_stat_to_errno(int stat) 
#define PROC(proc, argtype, restype) \ [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_COMPOUND, \ - .p_encode = (kxdrproc_t)nfs4_xdr_##argtype, \ + .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ .p_decode = (kxdrproc_t)nfs4_xdr_##restype, \ .p_arglen = NFS4_##argtype##_sz, \ .p_replen = NFS4_##restype##_sz, \ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6529534d7aae..c363efda8ecf 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -499,34 +499,28 @@ out_default: /* * NB: Without this zero space reservation, callbacks over krb5p fail */ -static int nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p, void *__unused) +static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) { - struct xdr_stream xdrs, *xdr = &xdrs; - - xdr_init_encode(&xdrs, &req->rq_snd_buf, p); xdr_reserve_space(xdr, 0); - return 0; } /* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ -static int nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, - const struct nfsd4_callback *cb) +static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfsd4_callback *cb) { - struct xdr_stream xdr; const struct nfs4_delegation *args = cb->cb_op; struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, .minorversion = cb->cb_minorversion, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cb_compound4args(&xdr, &hdr); - encode_cb_sequence4args(&xdr, cb, &hdr); - encode_cb_recall4args(&xdr, args, &hdr); + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recall4args(xdr, args, &hdr); encode_cb_nops(&hdr); - return 0; } @@ -583,7 +577,7 @@ out_default: #define PROC(proc, call, argtype, restype) \ [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_CB_##call, \ - .p_encode = (kxdrproc_t)nfs4_xdr_enc_##argtype, \ + .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \ .p_decode = (kxdrproc_t)nfs4_xdr_dec_##restype, \ .p_arglen = NFS4_enc_##argtype##_sz, \ .p_replen = NFS4_dec_##restype##_sz, \ diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index b2024757edd5..d88cffbaa6df 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -110,7 +110,7 @@ struct rpc_credops { __be32 * (*crmarshal)(struct rpc_task *, __be32 *); int (*crrefresh)(struct rpc_task *); __be32 * (*crvalidate)(struct rpc_task *, __be32 *); - int (*crwrap_req)(struct rpc_task *, kxdrproc_t, + int (*crwrap_req)(struct rpc_task *, kxdreproc_t, void *, __be32 *, void *); int (*crunwrap_resp)(struct rpc_task *, kxdrproc_t, void *, __be32 *, void *); @@ -139,7 +139,7 @@ struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred * void put_rpccred(struct rpc_cred *); __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); __be32 * rpcauth_checkverf(struct rpc_task *, __be32 *); -int rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, __be32 *data, void *obj); +int rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, __be32 *data, void *obj); int rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, __be32 *data, void *obj); int rpcauth_refreshcred(struct rpc_task *); void rpcauth_invalcred(struct rpc_task *); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index a5a55f284b7d..7b19c4e1ce53 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -89,7 +89,7 @@ struct rpc_version { */ struct rpc_procinfo { u32 p_proc; /* RPC procedure 
number */ - kxdrproc_t p_encode; /* XDR encode function */ + kxdreproc_t p_encode; /* XDR encode function */ kxdrproc_t p_decode; /* XDR decode function */ unsigned int p_arglen; /* argument hdr length (u32) */ unsigned int p_replen; /* reply hdr length (u32) */ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 498ab93a81e4..a21cf5378c1d 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -33,8 +33,8 @@ struct xdr_netobj { }; /* - * This is the generic XDR function. rqstp is either a rpc_rqst (client - * side) or svc_rqst pointer (server side). + * This is the legacy generic XDR function. rqstp is either a rpc_rqst + * (client side) or svc_rqst pointer (server side). * Encode functions always assume there's enough room in the buffer. */ typedef int (*kxdrproc_t)(void *rqstp, __be32 *data, void *obj); @@ -203,6 +203,11 @@ struct xdr_stream { struct kvec *iov; /* pointer to the current kvec */ }; +/* + * This is the xdr_stream style generic XDR function. + */ +typedef void (*kxdreproc_t)(void *rqstp, struct xdr_stream *xdr, void *obj); + extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index afe67849269f..651c9da703cb 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -563,8 +563,17 @@ rpcauth_checkverf(struct rpc_task *task, __be32 *p) return cred->cr_ops->crvalidate(task, p); } +static void rpcauth_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *data, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &rqstp->rq_snd_buf, data); + encode(rqstp, &xdr, obj); +} + int -rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, +rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, __be32 *data, void *obj) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; @@ -574,7 +583,8 @@ rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, if (cred->cr_ops->crwrap_req) return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj); /* By default, we encode the arguments normally. 
*/ - return encode(rqstp, data, obj); + rpcauth_wrap_req_encode(encode, rqstp, data, obj); + return 0; } int diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 3835ce35e224..42b46f9a670a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1231,9 +1231,19 @@ out_bad: return NULL; } +static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p); + encode(rqstp, &xdr, obj); +} + static inline int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - kxdrproc_t encode, struct rpc_rqst *rqstp, __be32 *p, void *obj) + kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) { struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; struct xdr_buf integ_buf; @@ -1249,9 +1259,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; *p++ = htonl(rqstp->rq_seqno); - status = encode(rqstp, p, obj); - if (status) - return status; + gss_wrap_req_encode(encode, rqstp, p, obj); if (xdr_buf_subsegment(snd_buf, &integ_buf, offset, snd_buf->len - offset)) @@ -1325,7 +1333,8 @@ out: static inline int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - kxdrproc_t encode, struct rpc_rqst *rqstp, __be32 *p, void *obj) + kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) { struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; u32 offset; @@ -1342,9 +1351,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; *p++ = htonl(rqstp->rq_seqno); - status = encode(rqstp, p, obj); - if (status) - return status; + gss_wrap_req_encode(encode, rqstp, p, obj); status = alloc_enc_pages(rqstp); if (status) @@ -1394,7 +1401,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, static int gss_wrap_req(struct rpc_task *task, - kxdrproc_t encode, void *rqstp, __be32 *p, void *obj) + kxdreproc_t encode, void *rqstp, __be32 *p, void *obj) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, @@ -1407,12 +1414,14 @@ gss_wrap_req(struct rpc_task *task, /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. 
*/ - status = encode(rqstp, p, obj); + gss_wrap_req_encode(encode, rqstp, p, obj); + status = 0; goto out; } switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: - status = encode(rqstp, p, obj); + gss_wrap_req_encode(encode, rqstp, p, obj); + status = 0; break; case RPC_GSS_SVC_INTEGRITY: status = gss_wrap_req_integ(cred, ctx, encode, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 92ce94f5146b..d446a32be667 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1095,7 +1095,7 @@ static void rpc_xdr_encode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - kxdrproc_t encode; + kxdreproc_t encode; __be32 *p; dprint_status(task); @@ -1776,9 +1776,8 @@ out_overflow: goto out_garbage; } -static int rpcproc_encode_null(void *rqstp, __be32 *data, void *obj) +static void rpcproc_encode_null(void *rqstp, struct xdr_stream *xdr, void *obj) { - return 0; } static int rpcproc_decode_null(void *rqstp, __be32 *data, void *obj) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 43838c72b778..63912a1a2983 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -689,25 +689,21 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) * XDR functions for rpcbind */ -static int rpcb_enc_mapping(struct rpc_rqst *req, __be32 *p, - const struct rpcbind_args *rpcb) +static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct rpcbind_args *rpcb) { struct rpc_task *task = req->rq_task; - struct xdr_stream xdr; + __be32 *p; dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n", task->tk_pid, task->tk_msg.rpc_proc->p_name, rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - - p = xdr_reserve_space(&xdr, RPCB_mappingargs_sz << 2); + p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2); *p++ = cpu_to_be32(rpcb->r_prog); *p++ = cpu_to_be32(rpcb->r_vers); *p++ = cpu_to_be32(rpcb->r_prot); *p = cpu_to_be32(rpcb->r_port); - - return 0; } static int rpcb_dec_getport(struct rpc_rqst *req, __be32 *p, @@ -769,27 +765,24 @@ static void encode_rpcb_string(struct xdr_stream *xdr, const char *string, xdr_encode_opaque(p, string, len); } -static int rpcb_enc_getaddr(struct rpc_rqst *req, __be32 *p, - const struct rpcbind_args *rpcb) +static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct rpcbind_args *rpcb) { struct rpc_task *task = req->rq_task; - struct xdr_stream xdr; + __be32 *p; dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n", task->tk_pid, task->tk_msg.rpc_proc->p_name, rpcb->r_prog, rpcb->r_vers, rpcb->r_netid, rpcb->r_addr); - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - - p = xdr_reserve_space(&xdr, (RPCB_program_sz + RPCB_version_sz) << 2); + p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2); *p++ = cpu_to_be32(rpcb->r_prog); *p = cpu_to_be32(rpcb->r_vers); - encode_rpcb_string(&xdr, rpcb->r_netid, RPCBIND_MAXNETIDLEN); - encode_rpcb_string(&xdr, rpcb->r_addr, RPCBIND_MAXUADDRLEN); - encode_rpcb_string(&xdr, rpcb->r_owner, RPCB_MAXOWNERLEN); - return 0; + encode_rpcb_string(xdr, rpcb->r_netid, RPCBIND_MAXNETIDLEN); + encode_rpcb_string(xdr, rpcb->r_addr, RPCBIND_MAXUADDRLEN); + encode_rpcb_string(xdr, rpcb->r_owner, RPCB_MAXOWNERLEN); } static int rpcb_dec_getaddr(struct rpc_rqst *req, __be32 *p, @@ -849,7 +842,7 @@ out_fail: static struct rpc_procinfo rpcb_procedures2[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, - .p_encode = (kxdrproc_t)rpcb_enc_mapping, + .p_encode = 
(kxdreproc_t)rpcb_enc_mapping, .p_decode = (kxdrproc_t)rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, @@ -859,7 +852,7 @@ static struct rpc_procinfo rpcb_procedures2[] = { }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, - .p_encode = (kxdrproc_t)rpcb_enc_mapping, + .p_encode = (kxdreproc_t)rpcb_enc_mapping, .p_decode = (kxdrproc_t)rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, @@ -869,7 +862,7 @@ static struct rpc_procinfo rpcb_procedures2[] = { }, [RPCBPROC_GETPORT] = { .p_proc = RPCBPROC_GETPORT, - .p_encode = (kxdrproc_t)rpcb_enc_mapping, + .p_encode = (kxdreproc_t)rpcb_enc_mapping, .p_decode = (kxdrproc_t)rpcb_dec_getport, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_getportres_sz, @@ -882,7 +875,7 @@ static struct rpc_procinfo rpcb_procedures2[] = { static struct rpc_procinfo rpcb_procedures3[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, - .p_encode = (kxdrproc_t)rpcb_enc_getaddr, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, .p_decode = (kxdrproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, @@ -892,7 +885,7 @@ static struct rpc_procinfo rpcb_procedures3[] = { }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, - .p_encode = (kxdrproc_t)rpcb_enc_getaddr, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, .p_decode = (kxdrproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, @@ -902,7 +895,7 @@ static struct rpc_procinfo rpcb_procedures3[] = { }, [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, - .p_encode = (kxdrproc_t)rpcb_enc_getaddr, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, .p_decode = (kxdrproc_t)rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, @@ -915,7 +908,7 @@ static struct rpc_procinfo rpcb_procedures3[] = { static struct rpc_procinfo rpcb_procedures4[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, - .p_encode = (kxdrproc_t)rpcb_enc_getaddr, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, .p_decode = (kxdrproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, @@ -925,7 +918,7 @@ static struct rpc_procinfo rpcb_procedures4[] = { }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, - .p_encode = (kxdrproc_t)rpcb_enc_getaddr, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, .p_decode = (kxdrproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, @@ -935,7 +928,7 @@ static struct rpc_procinfo rpcb_procedures4[] = { }, [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, - .p_encode = (kxdrproc_t)rpcb_enc_getaddr, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, .p_decode = (kxdrproc_t)rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, -- cgit v1.2.3-71-gd317 From bf2695516db982e90a22fc94f93491b481796bb1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 14:59:29 +0000 Subject: SUNRPC: New xdr_streams XDR decoder API Now that all client-side XDR decoder routines use xdr_streams, there should be no need to support the legacy calling sequence [rpc_rqst *, __be32 *, RPC res *] anywhere. We can construct an xdr_stream in the generic RPC code, instead of in each decoder function. This is a refactoring change. It should not cause different behavior. Signed-off-by: Chuck Lever Tested-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/clnt4xdr.c | 20 +- fs/lockd/clntxdr.c | 20 +- fs/lockd/mon.c | 30 +- fs/nfs/mount_clnt.c | 30 +- fs/nfs/nfs2xdr.c | 68 ++--- fs/nfs/nfs3xdr.c | 195 ++++++------- fs/nfs/nfs4xdr.c | 619 ++++++++++++++++++++--------------------- fs/nfsd/nfs4callback.c | 16 +- include/linux/sunrpc/auth.h | 4 +- include/linux/sunrpc/clnt.h | 2 +- include/linux/sunrpc/xdr.h | 3 +- net/sunrpc/auth.c | 14 +- net/sunrpc/auth_gss/auth_gss.c | 13 +- net/sunrpc/clnt.c | 4 +- net/sunrpc/rpcb_clnt.c | 46 ++- 15 files changed, 518 insertions(+), 566 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 974f1d9cd323..f848b52c67b1 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -529,17 +529,16 @@ out: return error; } -static int nlm4_xdr_dec_testres(struct rpc_rqst *req, __be32 *p, +static int nlm4_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nlm_res *result) { - struct xdr_stream xdr; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_cookie(&xdr, &result->cookie); + error = decode_cookie(xdr, &result->cookie); if (unlikely(error)) goto out; - error = decode_nlm4_testrply(&xdr, result); + error = decode_nlm4_testrply(xdr, result); out: return error; } @@ -550,17 +549,16 @@ out: * nlm4_stat stat; * }; */ -static int nlm4_xdr_dec_res(struct rpc_rqst *req, __be32 *p, +static int nlm4_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nlm_res *result) { - struct xdr_stream xdr; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_cookie(&xdr, &result->cookie); + error = decode_cookie(xdr, &result->cookie); if (unlikely(error)) goto out; - error = decode_nlm4_stat(&xdr, &result->status); + error = decode_nlm4_stat(xdr, &result->status); out: return error; } @@ -575,7 +573,7 @@ out: [NLMPROC_##proc] = { \ .p_proc = NLMPROC_##proc, \ .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \ - .p_decode = (kxdrproc_t)nlm4_xdr_dec_##restype, \ + .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \ .p_arglen = NLM4_##argtype##_sz, \ .p_replen = NLM4_##restype##_sz, \ .p_statidx = NLMPROC_##proc, \ diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index c6fda8fb1c5b..180ac34feb9a 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -527,17 +527,16 @@ out: return error; } -static int nlm_xdr_dec_testres(struct rpc_rqst *req, __be32 *p, +static int nlm_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nlm_res *result) { - struct xdr_stream xdr; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_cookie(&xdr, &result->cookie); + error = decode_cookie(xdr, &result->cookie); if (unlikely(error)) goto out; - error = decode_nlm_testrply(&xdr, result); + error = decode_nlm_testrply(xdr, result); out: return error; } @@ -548,17 +547,16 @@ out: * nlm_stat stat; * }; */ -static int nlm_xdr_dec_res(struct rpc_rqst *req, __be32 *p, +static int nlm_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nlm_res *result) { - struct xdr_stream xdr; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_cookie(&xdr, &result->cookie); + error = decode_cookie(xdr, &result->cookie); if (unlikely(error)) goto out; - error = decode_nlm_stat(&xdr, &result->status); + error = decode_nlm_stat(xdr, &result->status); out: return error; } @@ -573,7 +571,7 @@ out: [NLMPROC_##proc] = { \ .p_proc = NLMPROC_##proc, \ .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \ - .p_decode = 
(kxdrproc_t)nlm_xdr_dec_##restype, \ + .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \ .p_arglen = NLM_##argtype##_sz, \ .p_replen = NLM_##restype##_sz, \ .p_statidx = NLMPROC_##proc, \ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index baa77bc9d825..23d7451b2938 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -472,35 +472,35 @@ static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr, encode_mon_id(xdr, argp); } -static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, - struct nsm_res *resp) +static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) { - struct xdr_stream xdr; + __be32 *p; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - p = xdr_inline_decode(&xdr, 4 + 4); + p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(p == NULL)) return -EIO; resp->status = be32_to_cpup(p++); resp->state = be32_to_cpup(p); - dprintk("lockd: xdr_dec_stat_res status %d state %d\n", - resp->status, resp->state); + dprintk("lockd: %s status %d state %d\n", + __func__, resp->status, resp->state); return 0; } -static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, - struct nsm_res *resp) +static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) { - struct xdr_stream xdr; + __be32 *p; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - p = xdr_inline_decode(&xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; resp->state = be32_to_cpup(p); - dprintk("lockd: xdr_dec_stat state %d\n", resp->state); + dprintk("lockd: %s state %d\n", __func__, resp->state); return 0; } @@ -517,7 +517,7 @@ static struct rpc_procinfo nsm_procedures[] = { [NSMPROC_MON] = { .p_proc = NSMPROC_MON, .p_encode = (kxdreproc_t)nsm_xdr_enc_mon, - .p_decode = (kxdrproc_t)xdr_dec_stat_res, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res, .p_arglen = SM_mon_sz, .p_replen = SM_monres_sz, .p_statidx = NSMPROC_MON, @@ -526,7 +526,7 @@ static struct rpc_procinfo nsm_procedures[] = { [NSMPROC_UNMON] = { .p_proc = NSMPROC_UNMON, .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon, - .p_decode = (kxdrproc_t)xdr_dec_stat, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat, .p_arglen = SM_mon_id_sz, .p_replen = SM_unmonres_sz, .p_statidx = NSMPROC_UNMON, diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 979ebd7af3cb..697e07235f30 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -340,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) return 0; } -static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p, - struct mountres *res) +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) { - struct xdr_stream xdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - - status = decode_status(&xdr, res); + status = decode_status(xdr, res); if (unlikely(status != 0 || res->errno != 0)) return status; - return decode_fhandle(&xdr, res); + return decode_fhandle(xdr, res); } static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) @@ -434,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) return 0; } -static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, - struct mountres *res) +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) { - struct xdr_stream xdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - - status = decode_fhs_status(&xdr, res); + status = decode_fhs_status(xdr, res); 
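/*
 * Illustrative sketch (not part of the patch series): the general shape
 * of the conversion these patches perform.  With the legacy kxdrproc_t
 * form every decoder built its own xdr_stream from the raw reply
 * pointer; with the new kxdrdproc_t form (and kxdreproc_t on the encode
 * side) the generic RPC code calls xdr_init_decode() once and hands the
 * routine a ready-to-use stream, exactly as in the nsm/mnt decoders in
 * this hunk.  The "example" names and struct example_res are
 * hypothetical, for illustration only.
 */
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xdr.h>

struct example_res {
	u32	status;
};

/* Legacy style: decoder receives the raw pointer and sets up the stream. */
static int example_dec_res_legacy(struct rpc_rqst *req, __be32 *p,
				  struct example_res *res)
{
	struct xdr_stream xdr;

	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
	p = xdr_inline_decode(&xdr, 4);
	if (unlikely(p == NULL))
		return -EIO;
	res->status = be32_to_cpup(p);
	return 0;
}

/* xdr_stream style: the caller supplies an already-initialized stream. */
static int example_dec_res(struct rpc_rqst *req, struct xdr_stream *xdr,
			   struct example_res *res)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, 4);
	if (unlikely(p == NULL))
		return -EIO;
	res->status = be32_to_cpup(p);
	return 0;
}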
if (unlikely(status != 0 || res->errno != 0)) return status; - status = decode_fhandle3(&xdr, res); + status = decode_fhandle3(xdr, res); if (unlikely(status != 0)) { res->errno = -EBADHANDLE; return 0; } - return decode_auth_flavors(&xdr, res); + return decode_auth_flavors(xdr, res); } static struct rpc_procinfo mnt_procedures[] = { [MOUNTPROC_MNT] = { .p_proc = MOUNTPROC_MNT, .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, - .p_decode = (kxdrproc_t)mnt_dec_mountres, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres_sz, .p_statidx = MOUNTPROC_MNT, @@ -476,7 +472,7 @@ static struct rpc_procinfo mnt3_procedures[] = { [MOUNTPROC3_MNT] = { .p_proc = MOUNTPROC3_MNT, .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, - .p_decode = (kxdrproc_t)mnt_dec_mountres3, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres3_sz, .p_statidx = MOUNTPROC3_MNT, diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 8f3acbec761f..51f1cfa04d27 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -783,15 +783,13 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, * "NFS: Network File System Protocol Specification". */ -static int nfs2_xdr_dec_stat(struct rpc_rqst *req, __be32 *p, +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, void *__unused) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_stat(&xdr, &status); + error = decode_stat(xdr, &status); if (unlikely(error)) goto out; if (status != NFS_OK) @@ -802,22 +800,16 @@ out_default: return nfs_stat_to_errno(status); } -static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, __be32 *p, +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs_fattr *result) { - struct xdr_stream xdr; - - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - return decode_attrstat(&xdr, result); + return decode_attrstat(xdr, result); } -static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, __be32 *p, +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs_diropok *result) { - struct xdr_stream xdr; - - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - return decode_diropres(&xdr, result); + return decode_diropres(xdr, result); } /* @@ -830,20 +822,18 @@ static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, __be32 *p, * void; * }; */ -static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, __be32 *p, - void *__unused) +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_stat(&xdr, &status); + error = decode_stat(xdr, &status); if (unlikely(error)) goto out; if (status != NFS_OK) goto out_default; - error = decode_path(&xdr); + error = decode_path(xdr); out: return error; out_default: @@ -861,39 +851,33 @@ out_default: * void; * }; */ -static int nfs2_xdr_dec_readres(struct rpc_rqst *req, __be32 *p, +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs_readres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_stat(&xdr, &status); + error = decode_stat(xdr, &status); if (unlikely(error)) goto out; if (status != NFS_OK) goto out_default; - error = decode_fattr(&xdr, result->fattr); + error = decode_fattr(xdr, 
result->fattr); if (unlikely(error)) goto out; - error = decode_nfsdata(&xdr, result); + error = decode_nfsdata(xdr, result); out: return error; out_default: return nfs_stat_to_errno(status); } -static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, __be32 *p, +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs_writeres *result) { - struct xdr_stream xdr; - /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; - - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - return decode_attrstat(&xdr, result->fattr); + return decode_attrstat(xdr, result->fattr); } /** @@ -1008,20 +992,18 @@ out_cheating: goto out; } -static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, __be32 *p, - void *__unused) +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_stat(&xdr, &status); + error = decode_stat(xdr, &status); if (unlikely(error)) goto out; if (status != NFS_OK) goto out_default; - error = decode_readdirok(&xdr); + error = decode_readdirok(xdr); out: return error; out_default: @@ -1062,20 +1044,18 @@ out_overflow: return -EIO; } -static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, __be32 *p, +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs2_fsstat *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_stat(&xdr, &status); + error = decode_stat(xdr, &status); if (unlikely(error)) goto out; if (status != NFS_OK) goto out_default; - error = decode_info(&xdr, result); + error = decode_info(xdr, result); out: return error; out_default: @@ -1150,7 +1130,7 @@ int nfs_stat_to_errno(enum nfs_stat status) [NFSPROC_##proc] = { \ .p_proc = NFSPROC_##proc, \ .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ - .p_decode = (kxdrproc_t)nfs2_xdr_dec_##restype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ .p_arglen = NFS_##argtype##_sz, \ .p_replen = NFS_##restype##_sz, \ .p_timer = timer, \ diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index ae1b1a43f05e..df30a26cc4fa 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1366,20 +1366,19 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, * void; * }; */ -static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_fattr *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_fattr3(&xdr, result); + error = decode_fattr3(xdr, result); out: return error; out_default: @@ -1404,18 +1403,17 @@ out_default: * SETATTR3resfail resfail; * }; */ -static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_fattr *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(&xdr, result); + error = decode_wcc_data(xdr, result); if (unlikely(error)) goto out; if (status != 
NFS3_OK) @@ -1446,30 +1444,29 @@ out_status: * LOOKUP3resfail resfail; * }; */ -static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs3_diropres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_nfs_fh3(&xdr, result->fh); + error = decode_nfs_fh3(xdr, result->fh); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr); out: return error; out_default: - error = decode_post_op_attr(&xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr); if (unlikely(error)) goto out; return nfs_stat_to_errno(status); @@ -1494,23 +1491,22 @@ out_default: * ACCESS3resfail resfail; * }; */ -static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs3_accessres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_uint32(&xdr, &result->access); + error = decode_uint32(xdr, &result->access); out: return error; out_default: @@ -1536,23 +1532,22 @@ out_default: * READLINK3resfail resfail; * }; */ -static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_fattr *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result); + error = decode_post_op_attr(xdr, result); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_nfspath3(&xdr); + error = decode_nfspath3(xdr); out: return error; out_default: @@ -1620,23 +1615,21 @@ out_overflow: return -EIO; } -static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs_readres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_status; - error = decode_read3resok(&xdr, result); + error = decode_read3resok(xdr, result); out: return error; out_status: @@ -1692,23 +1685,21 @@ out_overflow: return -EIO; } -static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, struct 
nfs_writeres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(&xdr, result->fattr); + error = decode_wcc_data(xdr, result->fattr); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_status; - error = decode_write3resok(&xdr, result); + error = decode_write3resok(xdr, result); out: return error; out_status: @@ -1757,24 +1748,23 @@ out: return error; } -static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs3_diropres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_create3resok(&xdr, result); + error = decode_create3resok(xdr, result); out: return error; out_default: - error = decode_wcc_data(&xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr); if (unlikely(error)) goto out; return nfs_stat_to_errno(status); @@ -1798,18 +1788,17 @@ out_default: * REMOVE3resfail resfail; * }; */ -static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_removeres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(&xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1840,21 +1829,20 @@ out_status: * RENAME3resfail resfail; * }; */ -static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_renameres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(&xdr, result->old_fattr); + error = decode_wcc_data(xdr, result->old_fattr); if (unlikely(error)) goto out; - error = decode_wcc_data(&xdr, result->new_fattr); + error = decode_wcc_data(xdr, result->new_fattr); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1885,21 +1873,19 @@ out_status: * LINK3resfail resfail; * }; */ -static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs3_linkres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; - error = decode_wcc_data(&xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -2085,24 +2071,23 @@ out: return error; } -static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, 
__be32 *p, +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs3_readdirres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_readdir3resok(&xdr, result); + error = decode_readdir3resok(xdr, result); out: return error; out_default: - error = decode_post_op_attr(&xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr); if (unlikely(error)) goto out; return nfs_stat_to_errno(status); @@ -2154,23 +2139,22 @@ out_overflow: return -EIO; } -static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_fsstat *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_status; - error = decode_fsstat3resok(&xdr, result); + error = decode_fsstat3resok(xdr, result); out: return error; out_status: @@ -2231,23 +2215,22 @@ out_overflow: return -EIO; } -static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_fsinfo *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_status; - error = decode_fsinfo3resok(&xdr, result); + error = decode_fsinfo3resok(xdr, result); out: return error; out_status: @@ -2295,23 +2278,22 @@ out_overflow: return -EIO; } -static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_pathconf *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(&xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_status; - error = decode_pathconf3resok(&xdr, result); + error = decode_pathconf3resok(xdr, result); out: return error; out_status: @@ -2337,23 +2319,22 @@ out_status: * COMMIT3resfail resfail; * }; */ -static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_writeres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(&xdr, result->fattr); + error = decode_wcc_data(xdr, result->fattr); if (unlikely(error)) goto out; if (status != 
NFS3_OK) goto out_status; - error = decode_writeverf3(&xdr, result->verf->verifier); + error = decode_writeverf3(xdr, result->verf->verifier); out: return error; out_status: @@ -2406,40 +2387,38 @@ out: return error; } -static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs3_getaclres *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_getacl3resok(&xdr, result); + error = decode_getacl3resok(xdr, result); out: return error; out_default: return nfs_stat_to_errno(status); } -static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, __be32 *p, +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs_fattr *result) { - struct xdr_stream xdr; enum nfs_stat status; int error; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - error = decode_nfsstat3(&xdr, &status); + error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; if (status != NFS3_OK) goto out_default; - error = decode_post_op_attr(&xdr, result); + error = decode_post_op_attr(xdr, result); out: return error; out_default: @@ -2452,7 +2431,7 @@ out_default: [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ - .p_decode = (kxdrproc_t)nfs3_xdr_dec_##restype##3res, \ + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \ .p_arglen = NFS3_##argtype##args_sz, \ .p_replen = NFS3_##restype##res_sz, \ .p_timer = timer, \ @@ -2495,7 +2474,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { [ACLPROC3_GETACL] = { .p_proc = ACLPROC3_GETACL, .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, - .p_decode = (kxdrproc_t)nfs3_xdr_dec_getacl3res, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res, .p_arglen = ACL3_getaclargs_sz, .p_replen = ACL3_getaclres_sz, .p_timer = 1, @@ -2504,7 +2483,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { [ACLPROC3_SETACL] = { .p_proc = ACLPROC3_SETACL, .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, - .p_decode = (kxdrproc_t)nfs3_xdr_dec_setacl3res, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res, .p_arglen = ACL3_setaclargs_sz, .p_replen = ACL3_setaclres_sz, .p_timer = 0, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 6ec38b3e4a3d..f3f99156bfcb 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5013,26 +5013,26 @@ out_overflow: /* * Decode OPEN_DOWNGRADE response */ -static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) +static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_closeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_open_downgrade(&xdr, res); + status = decode_open_downgrade(xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, 
!RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5041,26 +5041,25 @@ out: /* * Decode ACCESS response */ -static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) +static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_accessres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_access(&xdr, res); + status = decode_access(xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5069,26 +5068,28 @@ out: /* * Decode LOOKUP response */ -static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_lookup_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_lookup(&xdr)) != 0) + status = decode_lookup(xdr); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) != 0) + status = decode_getfh(xdr, res->fh); + if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server + status = decode_getfattr(xdr, res->fattr, res->server ,!RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5097,23 +5098,25 @@ out: /* * Decode LOOKUP_ROOT response */ -static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_lookup_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putrootfh(&xdr)) != 0) + status = decode_putrootfh(xdr); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) == 0) - status = decode_getfattr(&xdr, res->fattr, res->server, + status = decode_getfh(xdr, res->fh); + if (status == 0) + status = decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5122,24 +5125,25 @@ out: /* * Decode REMOVE response */ -static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res) +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_removeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = 
decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_remove(&xdr, &res->cinfo)) != 0) + status = decode_remove(xdr, &res->cinfo); + if (status) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server, + decode_getfattr(xdr, res->dir_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5148,34 +5152,38 @@ out: /* * Decode RENAME response */ -static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res) +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_renameres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_savefh(&xdr)) != 0) + status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) + status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo); + if (status) goto out; /* Current FH is target directory */ - if (decode_getfattr(&xdr, res->new_fattr, res->server, + if (decode_getfattr(xdr, res->new_fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_restorefh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->old_fattr, res->server, + decode_getfattr(xdr, res->old_fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5184,37 +5192,41 @@ out: /* * Decode LINK response */ -static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res) +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_link_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_savefh(&xdr)) != 0) + status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_link(&xdr, &res->cinfo)) != 0) + status = decode_link(xdr, &res->cinfo); + if (status) goto out; /* * Note order: OP_LINK leaves the directory as the current * filehandle. 
*/ - if (decode_getfattr(&xdr, res->dir_attr, res->server, + if (decode_getfattr(xdr, res->dir_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_restorefh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5223,33 +5235,37 @@ out: /* * Decode CREATE response */ -static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_savefh(&xdr)) != 0) + status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) + status = decode_create(xdr, &res->dir_cinfo); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) != 0) + status = decode_getfh(xdr, res->fh); + if (status) goto out; - if (decode_getfattr(&xdr, res->fattr, res->server, + if (decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_restorefh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->dir_fattr, res->server, + decode_getfattr(xdr, res->dir_fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5258,31 +5274,31 @@ out: /* * Decode SYMLINK response */ -static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) { - return nfs4_xdr_dec_create(rqstp, p, res); + return nfs4_xdr_dec_create(rqstp, xdr, res); } /* * Decode GETATTR response */ -static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res) +static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_getattr_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server, + status = decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5309,24 +5325,22 @@ static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, * Decode SETACL response */ static int -nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct nfs_setaclres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + 
status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_setattr(&xdr); + status = decode_setattr(xdr); out: return status; } @@ -5335,24 +5349,22 @@ out: * Decode GETACL response */ static int -nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct nfs_getaclres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(&xdr, rqstp, &res->acl_len); + status = decode_getacl(xdr, rqstp, &res->acl_len); out: return status; @@ -5361,23 +5373,22 @@ out: /* * Decode CLOSE response */ -static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) +static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_closeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_close(&xdr, res); + status = decode_close(xdr, res); if (status != 0) goto out; /* @@ -5386,7 +5397,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. 
*/ - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5395,36 +5406,35 @@ out: /* * Decode OPEN response */ -static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) +static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_openres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_savefh(&xdr); + status = decode_savefh(xdr); if (status) goto out; - status = decode_open(&xdr, res); + status = decode_open(xdr, res); if (status) goto out; - if (decode_getfh(&xdr, &res->fh) != 0) + if (decode_getfh(xdr, &res->fh) != 0) goto out; - if (decode_getfattr(&xdr, res->f_attr, res->server, + if (decode_getfattr(xdr, res->f_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if (decode_restorefh(&xdr) != 0) + if (decode_restorefh(xdr) != 0) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server, + decode_getfattr(xdr, res->dir_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5433,20 +5443,20 @@ out: /* * Decode OPEN_CONFIRM response */ -static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) +static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_open_confirmres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_open_confirm(&xdr, res); + status = decode_open_confirm(xdr, res); out: return status; } @@ -5454,26 +5464,26 @@ out: /* * Decode OPEN response */ -static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) +static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_openres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_open(&xdr, res); + status = decode_open(xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->f_attr, res->server, + decode_getfattr(xdr, res->f_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5482,26 +5492,26 @@ out: /* * Decode SETATTR response */ -static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) +static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_setattrres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = 
decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_setattr(&xdr); + status = decode_setattr(xdr); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5510,23 +5520,22 @@ out: /* * Decode LOCK response */ -static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res) +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lock_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_lock(&xdr, res); + status = decode_lock(xdr, res); out: return status; } @@ -5534,23 +5543,22 @@ out: /* * Decode LOCKT response */ -static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res) +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lockt_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_lockt(&xdr, res); + status = decode_lockt(xdr, res); out: return status; } @@ -5558,61 +5566,58 @@ out: /* * Decode LOCKU response */ -static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res) +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_locku_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_locku(&xdr, res); + status = decode_locku(xdr, res); out: return status; } -static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *dummy) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_release_lockowner(&xdr); + status = decode_release_lockowner(xdr); return status; } /* * Decode READLINK response */ -static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, +static int 
nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_readlink_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_readlink(&xdr, rqstp); + status = decode_readlink(xdr, rqstp); out: return status; } @@ -5620,23 +5625,22 @@ out: /* * Decode READDIR response */ -static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) +static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_readdir_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_readdir(&xdr, rqstp, res); + status = decode_readdir(xdr, rqstp, res); out: return status; } @@ -5644,23 +5648,22 @@ out: /* * Decode Read response */ -static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) +static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_readres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_read(&xdr, rqstp, res); + status = decode_read(xdr, rqstp, res); if (!status) status = res->count; out: @@ -5670,26 +5673,25 @@ out: /* * Decode WRITE response */ -static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_write(&xdr, res); + status = decode_write(xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; @@ -5700,26 +5702,25 @@ out: /* * Decode COMMIT response */ -static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - 
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_commit(&xdr, res); + status = decode_commit(xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5728,85 +5729,80 @@ out: /* * Decode FSINFO response */ -static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_fsinfo_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_fsinfo(&xdr, res->fsinfo); + status = decode_fsinfo(xdr, res->fsinfo); return status; } /* * Decode PATHCONF response */ -static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_pathconf_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_pathconf(&xdr, res->pathconf); + status = decode_pathconf(xdr, res->pathconf); return status; } /* * Decode STATFS response */ -static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_statfs_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_statfs(&xdr, res->fsstat); + status = decode_statfs(xdr, res->fsstat); return status; } /* * Decode GETATTR_BITMAP response */ -static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) +static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - status = decode_server_caps(&xdr, res); + 
status = decode_server_caps(xdr, res); out: return status; } @@ -5814,79 +5810,77 @@ out: /* * Decode RENEW response */ -static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *__unused) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_renew(&xdr); + status = decode_renew(xdr); return status; } /* * Decode SETCLIENTID response */ -static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, - struct nfs4_setclientid_res *res) +static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_setclientid(&xdr, res); + status = decode_setclientid(xdr, res); return status; } /* * Decode SETCLIENTID_CONFIRM response */ -static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) +static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_setclientid_confirm(&xdr); + status = decode_setclientid_confirm(xdr); if (!status) - status = decode_putrootfh(&xdr); + status = decode_putrootfh(xdr); if (!status) - status = decode_fsinfo(&xdr, fsinfo); + status = decode_fsinfo(xdr, fsinfo); return status; } /* * Decode DELEGRETURN response */ -static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_delegreturnres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_delegreturn(&xdr); + status = decode_delegreturn(xdr); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5895,26 +5889,27 @@ out: /* * Decode FS_LOCATIONS response */ -static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs4_fs_locations_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + 
if (status) goto out; - if ((status = decode_lookup(&xdr)) != 0) + status = decode_lookup(xdr); + if (status) goto out; - xdr_enter_page(&xdr, PAGE_SIZE); - status = decode_getfattr(&xdr, &res->fs_locations->fattr, + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr(xdr, &res->fs_locations->fattr, res->fs_locations->server, !RPC_IS_ASYNC(req->rq_task)); out: @@ -5925,129 +5920,122 @@ out: /* * Decode EXCHANGE_ID response */ -static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_exchange_id(&xdr, res); + status = decode_exchange_id(xdr, res); return status; } /* * Decode CREATE_SESSION response */ -static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs41_create_session_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_create_session(&xdr, res); + status = decode_create_session(xdr, res); return status; } /* * Decode DESTROY_SESSION response */ -static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, - void *dummy) +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_destroy_session(&xdr, dummy); + status = decode_destroy_session(xdr, res); return status; } /* * Decode SEQUENCE response */ -static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_sequence_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, res, rqstp); + status = decode_sequence(xdr, res, rqstp); return status; } /* * Decode GET_LEASE_TIME response */ -static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_get_lease_time_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); + status = decode_sequence(xdr, &res->lr_seq_res, rqstp); if (!status) - status = decode_putrootfh(&xdr); + status = decode_putrootfh(xdr); if (!status) - status = decode_fsinfo(&xdr, res->lr_fsinfo); + status = decode_fsinfo(xdr, res->lr_fsinfo); return status; } /* * Decode RECLAIM_COMPLETE response */ -static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, + struct 
xdr_stream *xdr, struct nfs41_reclaim_complete_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (!status) - status = decode_reclaim_complete(&xdr, (void *)NULL); + status = decode_reclaim_complete(xdr, (void *)NULL); return status; } /* * Decode GETDEVINFO response */ -static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status != 0) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(&xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res->pdev); out: return status; } @@ -6055,24 +6043,23 @@ out: /* * Decode LAYOUTGET response */ -static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_layoutget_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_layoutget(&xdr, rqstp, res); + status = decode_layoutget(xdr, rqstp, res); out: return status; } @@ -6236,7 +6223,7 @@ nfs4_stat_to_errno(int stat) [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_COMPOUND, \ .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ - .p_decode = (kxdrproc_t)nfs4_xdr_##restype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \ .p_arglen = NFS4_##argtype##_sz, \ .p_replen = NFS4_##restype##_sz, \ .p_statidx = NFSPROC4_CLNT_##proc, \ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c363efda8ecf..21a63da305ff 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -533,7 +533,8 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, * Protocol". */ -static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p, void *__unused) +static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) { return 0; } @@ -541,26 +542,25 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p, void *__unused) /* * 20.2. 
Operation 4: CB_RECALL - Recall a Delegation */ -static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, +static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfsd4_callback *cb) { - struct xdr_stream xdr; struct nfs4_cb_compound_hdr hdr; enum nfsstat4 nfserr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_cb_compound4res(&xdr, &hdr); + status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) goto out; if (cb != NULL) { - status = decode_cb_sequence4res(&xdr, cb); + status = decode_cb_sequence4res(xdr, cb); if (unlikely(status)) goto out; } - status = decode_cb_op_status(&xdr, OP_CB_RECALL, &nfserr); + status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); if (unlikely(status)) goto out; if (unlikely(nfserr != NFS4_OK)) @@ -578,7 +578,7 @@ out_default: [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_CB_##call, \ .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \ - .p_decode = (kxdrproc_t)nfs4_xdr_dec_##restype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \ .p_arglen = NFS4_enc_##argtype##_sz, \ .p_replen = NFS4_dec_##restype##_sz, \ .p_statidx = NFSPROC4_CB_##call, \ diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index d88cffbaa6df..8521067ed4f7 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -112,7 +112,7 @@ struct rpc_credops { __be32 * (*crvalidate)(struct rpc_task *, __be32 *); int (*crwrap_req)(struct rpc_task *, kxdreproc_t, void *, __be32 *, void *); - int (*crunwrap_resp)(struct rpc_task *, kxdrproc_t, + int (*crunwrap_resp)(struct rpc_task *, kxdrdproc_t, void *, __be32 *, void *); }; @@ -140,7 +140,7 @@ void put_rpccred(struct rpc_cred *); __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); __be32 * rpcauth_checkverf(struct rpc_task *, __be32 *); int rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, __be32 *data, void *obj); -int rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, __be32 *data, void *obj); +int rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *data, void *obj); int rpcauth_refreshcred(struct rpc_task *); void rpcauth_invalcred(struct rpc_task *); int rpcauth_uptodatecred(struct rpc_task *); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 7b19c4e1ce53..ef9476a36ff7 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -90,7 +90,7 @@ struct rpc_version { struct rpc_procinfo { u32 p_proc; /* RPC procedure number */ kxdreproc_t p_encode; /* XDR encode function */ - kxdrproc_t p_decode; /* XDR decode function */ + kxdrdproc_t p_decode; /* XDR decode function */ unsigned int p_arglen; /* argument hdr length (u32) */ unsigned int p_replen; /* reply hdr length (u32) */ unsigned int p_count; /* call count */ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index a21cf5378c1d..9a21e8102c42 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -204,9 +204,10 @@ struct xdr_stream { }; /* - * This is the xdr_stream style generic XDR function. + * These are the xdr_stream style generic XDR encode and decode functions. 
*/ typedef void (*kxdreproc_t)(void *rqstp, struct xdr_stream *xdr, void *obj); +typedef int (*kxdrdproc_t)(void *rqstp, struct xdr_stream *xdr, void *obj); extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 651c9da703cb..67e31276682a 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -587,8 +587,18 @@ rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, return 0; } +static int +rpcauth_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, + __be32 *data, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, data); + return decode(rqstp, &xdr, obj); +} + int -rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, +rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *data, void *obj) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; @@ -599,7 +609,7 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, return cred->cr_ops->crunwrap_resp(task, decode, rqstp, data, obj); /* By default, we decode the arguments normally. */ - return decode(rqstp, data, obj); + return rpcauth_unwrap_req_decode(decode, rqstp, data, obj); } int diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 42b46f9a670a..45dbf1521b9a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1503,10 +1503,19 @@ gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, return 0; } +static int +gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + return decode(rqstp, &xdr, obj); +} static int gss_unwrap_resp(struct rpc_task *task, - kxdrproc_t decode, void *rqstp, __be32 *p, void *obj) + kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, @@ -1537,7 +1546,7 @@ gss_unwrap_resp(struct rpc_task *task, cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp) + (savedlen - head->iov_len); out_decode: - status = decode(rqstp, p, obj); + status = gss_unwrap_req_decode(decode, rqstp, p, obj); out: gss_put_ctx(ctx); dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d446a32be667..4e8210dcda72 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1535,7 +1535,7 @@ call_decode(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; - kxdrproc_t decode = task->tk_msg.rpc_proc->p_decode; + kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode; __be32 *p; dprintk("RPC: %5u call_decode (status %d)\n", @@ -1780,7 +1780,7 @@ static void rpcproc_encode_null(void *rqstp, struct xdr_stream *xdr, void *obj) { } -static int rpcproc_decode_null(void *rqstp, __be32 *data, void *obj) +static int rpcproc_decode_null(void *rqstp, struct xdr_stream *xdr, void *obj) { return 0; } diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 63912a1a2983..c652e4cc9fe9 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -706,18 +706,16 @@ static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr, *p = cpu_to_be32(rpcb->r_port); } -static int rpcb_dec_getport(struct rpc_rqst *req, __be32 *p, 
+static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr, struct rpcbind_args *rpcb) { struct rpc_task *task = req->rq_task; - struct xdr_stream xdr; unsigned long port; - - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + __be32 *p; rpcb->r_port = 0; - p = xdr_inline_decode(&xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; @@ -731,20 +729,18 @@ static int rpcb_dec_getport(struct rpc_rqst *req, __be32 *p, return 0; } -static int rpcb_dec_set(struct rpc_rqst *req, __be32 *p, +static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr, unsigned int *boolp) { struct rpc_task *task = req->rq_task; - struct xdr_stream xdr; - - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + __be32 *p; - p = xdr_inline_decode(&xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; *boolp = 0; - if (*p) + if (*p != xdr_zero) *boolp = 1; dprintk("RPC: %5u RPCB_%s call %s\n", @@ -785,20 +781,18 @@ static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, encode_rpcb_string(xdr, rpcb->r_owner, RPCB_MAXOWNERLEN); } -static int rpcb_dec_getaddr(struct rpc_rqst *req, __be32 *p, +static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, struct rpcbind_args *rpcb) { struct sockaddr_storage address; struct sockaddr *sap = (struct sockaddr *)&address; struct rpc_task *task = req->rq_task; - struct xdr_stream xdr; + __be32 *p; u32 len; rpcb->r_port = 0; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - - p = xdr_inline_decode(&xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) goto out_fail; len = be32_to_cpup(p); @@ -816,7 +810,7 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, __be32 *p, if (unlikely(len > RPCBIND_MAXUADDRLEN)) goto out_fail; - p = xdr_inline_decode(&xdr, len); + p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) goto out_fail; dprintk("RPC: %5u RPCB_%s reply: %s\n", task->tk_pid, @@ -843,7 +837,7 @@ static struct rpc_procinfo rpcb_procedures2[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, .p_encode = (kxdreproc_t)rpcb_enc_mapping, - .p_decode = (kxdrproc_t)rpcb_dec_set, + .p_decode = (kxdrdproc_t)rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, @@ -853,7 +847,7 @@ static struct rpc_procinfo rpcb_procedures2[] = { [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, .p_encode = (kxdreproc_t)rpcb_enc_mapping, - .p_decode = (kxdrproc_t)rpcb_dec_set, + .p_decode = (kxdrdproc_t)rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, @@ -863,7 +857,7 @@ static struct rpc_procinfo rpcb_procedures2[] = { [RPCBPROC_GETPORT] = { .p_proc = RPCBPROC_GETPORT, .p_encode = (kxdreproc_t)rpcb_enc_mapping, - .p_decode = (kxdrproc_t)rpcb_dec_getport, + .p_decode = (kxdrdproc_t)rpcb_dec_getport, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_getportres_sz, .p_statidx = RPCBPROC_GETPORT, @@ -876,7 +870,7 @@ static struct rpc_procinfo rpcb_procedures3[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrproc_t)rpcb_dec_set, + .p_decode = (kxdrdproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, @@ -886,7 +880,7 @@ static struct rpc_procinfo rpcb_procedures3[] = { [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrproc_t)rpcb_dec_set, + .p_decode = (kxdrdproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = 
RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, @@ -896,7 +890,7 @@ static struct rpc_procinfo rpcb_procedures3[] = { [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrproc_t)rpcb_dec_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, .p_statidx = RPCBPROC_GETADDR, @@ -909,7 +903,7 @@ static struct rpc_procinfo rpcb_procedures4[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrproc_t)rpcb_dec_set, + .p_decode = (kxdrdproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, @@ -919,7 +913,7 @@ static struct rpc_procinfo rpcb_procedures4[] = { [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrproc_t)rpcb_dec_set, + .p_decode = (kxdrdproc_t)rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, @@ -929,7 +923,7 @@ static struct rpc_procinfo rpcb_procedures4[] = { [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrproc_t)rpcb_dec_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, .p_statidx = RPCBPROC_GETADDR, -- cgit v1.2.3-71-gd317 From 7db836d4a427c3c64406b00b6d8d745d6335d72a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 15:05:42 +0000 Subject: lockd: Split nlm_release_call() The nlm_release_call() function is invoked from both the server and the client side. We're about to introduce a distinct server- and client-side nlm_release_host(), so nlm_release_call() must first be split into a client-side and a server-side version. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 12 ++++++------ fs/lockd/svc4proc.c | 4 ++-- fs/lockd/svclock.c | 4 ++-- fs/lockd/svcproc.c | 12 ++++++++++-- include/linux/lockd/lockd.h | 3 ++- 5 files changed, 22 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 332c54cf75e0..fbc6617f76c4 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -211,7 +211,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) return NULL; } -void nlm_release_call(struct nlm_rqst *call) +void nlmclnt_release_call(struct nlm_rqst *call) { if (!atomic_dec_and_test(&call->a_count)) return; @@ -222,7 +222,7 @@ void nlm_release_call(struct nlm_rqst *call) static void nlmclnt_rpc_release(void *data) { - nlm_release_call(data); + nlmclnt_release_call(data); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -436,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) status = nlm_stat_to_errno(req->a_res.status); } out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; } @@ -593,7 +593,7 @@ again: out_unblock: nlmclnt_finish_block(block); out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; out_unlock: /* Fatal error: ensure that we remove the lock altogether */ @@ -694,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) /* What to do now? I'm out of my depth... 
*/ status = -ENOLCK; out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; } @@ -755,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) status = -ENOLCK; - nlm_release_call(req); + nlmclnt_release_call(req); return status; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 38d261192453..c187422026d8 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -229,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) static void nlm4svc_callback_release(void *data) { - nlm_release_call(data); + nlmsvc_release_call(data); } static const struct rpc_call_ops nlm4svc_callback_ops = { @@ -261,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args stat = func(rqstp, argp, &call->a_res); if (stat != 0) { - nlm_release_call(call); + nlmsvc_release_call(call); return stat; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9266c4600208..6e31695d046f 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -234,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, failed_free: kfree(block); failed: - nlm_release_call(call); + nlmsvc_release_call(call); return NULL; } @@ -267,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref) mutex_unlock(&file->f_mutex); nlmsvc_freegrantargs(block->b_call); - nlm_release_call(block->b_call); + nlmsvc_release_call(block->b_call); nlm_release_file(block->b_file); kfree(block->b_fl); kfree(block); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0caea5310ac3..0df65ec29e43 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -257,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) -task->tk_status); } +void nlmsvc_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlm_release_host(call->a_host); + kfree(call); +} + static void nlmsvc_callback_release(void *data) { - nlm_release_call(data); + nlmsvc_release_call(data); } static const struct rpc_call_ops nlmsvc_callback_ops = { @@ -291,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args stat = func(rqstp, argp, &call->a_res); if (stat != 0) { - nlm_release_call(call); + nlmsvc_release_call(call); return stat; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 2dee05e5119a..a32ba62455af 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -202,9 +202,9 @@ extern u32 nsm_local_state; * Lockd client functions */ struct nlm_rqst * nlm_alloc_call(struct nlm_host *host); -void nlm_release_call(struct nlm_rqst *); int nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); +void nlmclnt_release_call(struct nlm_rqst *); struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl); void nlmclnt_finish_block(struct nlm_wait *block); int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout); @@ -267,6 +267,7 @@ unsigned long nlmsvc_retry_blocked(void); void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, nlm_host_match_fn_t match); void nlmsvc_grant_reply(struct nlm_cookie *, __be32); +void nlmsvc_release_call(struct nlm_rqst *); /* * File handling for the server personality -- cgit v1.2.3-71-gd317 From 
8ea6ecc8b0759756a766c05dc7c98c51ec90de37 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 15:05:52 +0000 Subject: lockd: Create client-side nlm_host cache NFS clients don't need the garbage collection processing that is performed on nlm_host structures. The client picks up an nlm_host at mount time and holds a reference to it until the file system is unmounted. Servers, on the other hand, don't have a precise way to tell when an nlm_host is no longer being used, so zero refcount nlm_host entries are left to expire in the cache after a time. Basically there's nothing holding a reference to an nlm_host between individual server-side NLM requests, but we can't afford the expense of recreating them for every new NLM request from a client. The nlm_host cache adds some lifetime hysteresis to entries in the cache so the next time a particular nlm_host is needed, it's likely to be discovered by a lookup rather than created from whole cloth. With the new implementation, client nlm_host cache items are no longer garbage collected, and are destroyed directly by a new release function specialized for client entries, nlmclnt_release_host(). They are cached in their own data structure, and have their own lookup logic, simplified and specialized for client nlm_host entries. However, the client nlm_host cache still shares reboot recovery logic with the server nlm_host cache. The NSM "peer rebooted" downcall for clients and servers still come through the same RPC call. This is a legacy formal API that would be difficult to alter, and besides, the user space NSM implementation can't tell the difference between peers that are clients or servers. For this reason, the client cache continues to share the nlm_host_mutex (and reboot recovery logic) with the server cache. 
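For orientation, a minimal usage sketch of the client-side cache lifetime described above (illustrative only, not taken from the patch): the mount/umount helpers are invented, and the nlmclnt_lookup_host() argument list is recalled from the surrounding lockd code rather than quoted from this diff, so treat it as an assumption; only the lookup/release pairing reflects this series.

/*
 * Hedged sketch: client-side nlm_host lifetime.  example_nlm_mount() and
 * example_nlm_umount() are hypothetical callers; the nlmclnt_lookup_host()
 * argument list is assumed.
 */
static struct nlm_host *example_nlm_mount(const struct sockaddr *sap,
					  size_t salen, unsigned short proto,
					  u32 version, const char *hostname,
					  int noresvport)
{
	struct nlm_host *host;

	/* Mount time: find or create the cached client entry.  The returned
	 * host carries a reference owned by this mount; client entries are
	 * never garbage collected. */
	host = nlmclnt_lookup_host(sap, salen, proto, version,
				   hostname, noresvport);
	return host;	/* NULL on allocation failure */
}

static void example_nlm_umount(struct nlm_host *host)
{
	/* Unmount time: drop the mount's reference.  When the count reaches
	 * zero the entry is destroyed immediately under nlm_host_mutex,
	 * rather than lingering for a GC pass as server entries do. */
	nlmclnt_release_host(host);
}
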
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 4 +-- fs/lockd/clntproc.c | 6 ++-- fs/lockd/host.c | 81 ++++++++++++++++++++++++++++++++++++++++----- include/linux/lockd/lockd.h | 1 + 4 files changed, 78 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 25509eb28fd7..8d4ea8351e3d 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init); */ void nlmclnt_done(struct nlm_host *host) { - nlm_release_host(host); + nlmclnt_release_host(host); lockd_down(); } EXPORT_SYMBOL_GPL(nlmclnt_done); @@ -273,7 +273,7 @@ restart: spin_unlock(&nlm_blocked_lock); /* Release host handle after use */ - nlm_release_host(host); + nlmclnt_release_host(host); lockd_down(); return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index fbc6617f76c4..adb45ec9038c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -58,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner) return; list_del(&lockowner->list); spin_unlock(&lockowner->host->h_lock); - nlm_release_host(lockowner->host); + nlmclnt_release_host(lockowner->host); kfree(lockowner); } @@ -207,7 +207,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) printk("nlm_alloc_call: failed, waiting for memory\n"); schedule_timeout_interruptible(5*HZ); } - nlm_release_host(host); + nlmclnt_release_host(host); return NULL; } @@ -215,7 +215,7 @@ void nlmclnt_release_call(struct nlm_rqst *call) { if (!atomic_dec_and_test(&call->a_count)) return; - nlm_release_host(call->a_host); + nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index e58e1426d161..c6942fb4bd0d 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -26,6 +26,7 @@ #define NLM_HOST_COLLECT (120 * HZ) static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; #define for_each_host(host, pos, chain, table) \ for ((chain) = (table); \ @@ -288,12 +289,76 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .hostname_len = strlen(hostname), .noresvport = noresvport, }; + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + struct nsm_handle *nsm = NULL; dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, (hostname ? hostname : ""), version, (protocol == IPPROTO_UDP ? "udp" : "tcp")); - return nlm_lookup_host(&ni); + mutex_lock(&nlm_host_mutex); + + chain = &nlm_client_hosts[nlm_hash_address(sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), sap)) + continue; + + /* Same address. 
Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != protocol) + continue; + if (host->h_version != version) + continue; + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmclnt_release_host - release client nlm_host + * @host: nlm_host to release + * + */ +void nlmclnt_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release client host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(host->h_server); + + if (atomic_dec_and_test(&host->h_count)) { + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(!list_empty(&host->h_granted)); + BUG_ON(!list_empty(&host->h_reclaim)); + + mutex_lock(&nlm_host_mutex); + nlm_destroy_host_locked(host); + mutex_unlock(&nlm_host_mutex); + } } /** @@ -515,16 +580,14 @@ void nlm_host_rebooted(const struct nlm_reboot *info) * To avoid processing a host several times, we match the nsmstate. */ while ((host = next_host_state(nlm_hosts, nsm, info)) != NULL) { - if (host->h_server) { - /* We're server for this guy, just ditch - * all the locks he held. */ - nlmsvc_free_host_resources(host); - } else { - /* He's the server, initiate lock recovery. */ - nlmclnt_recovery(host); - } + nlmsvc_free_host_resources(host); nlm_release_host(host); } + while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) { + nlmclnt_recovery(host); + nlmclnt_release_host(host); + } + nsm_release(nsm); } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index a32ba62455af..6c2a0e2f298e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -223,6 +223,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const u32 version, const char *hostname, int noresvport); +void nlmclnt_release_host(struct nlm_host *); struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, const size_t hostname_len); -- cgit v1.2.3-71-gd317 From 67216b94d498f5880d8bba2a6b841880739dd524 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 Dec 2010 15:06:12 +0000 Subject: lockd: Clean up nlmsvc_lookup_host() Clean up. Change nlmsvc_lookup_host() to be purpose-built for server-side nlm_host management. This replaces the generic nlm_lookup_host() helper function, just like on the client side. The lookup logic is specialized for server host lookups. The server side cache also gets its own specialized equivalent of the nlm_release_host() function. 
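For orientation, a rough sketch of the server-side call pattern these helpers serve (illustrative only; the procedure body and the argp->lock accessors are assumptions based on the existing svcproc.c callers shown in the diffs below, and only nlmsvc_lookup_host()/nlmsvc_release_host() come from this series):

/*
 * Hedged sketch of a server-side NLM procedure.  example_nlm_proc() is
 * hypothetical; argp->lock.caller/len are assumed field names.
 */
static __be32 example_nlm_proc(struct svc_rqst *rqstp, struct nlm_args *argp)
{
	struct nlm_host *host;

	/* Per-request lookup: hits the server host cache, takes a reference
	 * and moves the entry to the head of its hash chain. */
	host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len);
	if (host == NULL)
		return nlm_lck_denied_nolocks;

	/* ... perform the test/lock/unlock work against 'host' ... */

	/* Drop only the per-request reference; the cache entry stays put and
	 * is reclaimed later by nlm_gc_hosts(). */
	nlmsvc_release_host(host);
	return rpc_success;
}
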
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 91 +++++++++++++++++++++++++++++++++++---------- fs/lockd/svc4proc.c | 16 ++++---- fs/lockd/svcproc.c | 18 ++++----- include/linux/lockd/lockd.h | 2 +- 4 files changed, 90 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index c6942fb4bd0d..0250b0e4f5e9 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -383,6 +383,10 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, const size_t hostname_len) { + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host = NULL; + struct nsm_handle *nsm = NULL; struct sockaddr_in sin = { .sin_family = AF_INET, }; @@ -404,6 +408,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, (int)hostname_len, hostname, rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + mutex_lock(&nlm_host_mutex); + switch (ni.sap->sa_family) { case AF_INET: sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; @@ -414,10 +420,73 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, ni.src_sap = (struct sockaddr *)&sin6; break; default: - return NULL; + dprintk("lockd: %s failed; unrecognized address family\n", + __func__); + goto out; } - return nlm_lookup_host(&ni); + if (time_after_eq(jiffies, next_gc)) + nlm_gc_hosts(); + + chain = &nlm_hosts[nlm_hash_address(ni.sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != ni.protocol) + continue; + if (host->h_version != ni.version) + continue; + if (!rpc_cmp_addr(nlm_srcaddr(host), ni.src_sap)) + continue; + + /* Move to head of hash chain. */ + hlist_del(&host->h_hash); + hlist_add_head(&host->h_hash, chain); + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + memcpy(nlm_srcaddr(host), ni.src_sap, ni.src_len); + host->h_srcaddrlen = ni.src_len; + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmsvc_release_host - release server nlm_host + * @host: nlm_host to release + * + * Host is destroyed later in nlm_gc_host(). 
+ */ +void nlmsvc_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release server host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(!host->h_server); + atomic_dec(&host->h_count); } /* @@ -517,22 +586,6 @@ struct nlm_host * nlm_get_host(struct nlm_host *host) return host; } -/* - * Release NLM host after use - */ -void nlm_release_host(struct nlm_host *host) -{ - if (host != NULL) { - dprintk("lockd: release host %s\n", host->h_name); - BUG_ON(atomic_read(&host->h_count) < 0); - if (atomic_dec_and_test(&host->h_count)) { - BUG_ON(!list_empty(&host->h_lockowners)); - BUG_ON(!list_empty(&host->h_granted)); - BUG_ON(!list_empty(&host->h_reclaim)); - } - } -} - static struct nlm_host *next_host_state(struct hlist_head *cache, struct nsm_handle *nsm, const struct nlm_reboot *info) @@ -581,7 +634,7 @@ void nlm_host_rebooted(const struct nlm_reboot *info) */ while ((host = next_host_state(nlm_hosts, nsm, info)) != NULL) { nlmsvc_free_host_resources(host); - nlm_release_host(host); + nlmsvc_release_host(host); } while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) { nlmclnt_recovery(host); diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index c187422026d8..9a41fdc19511 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -51,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - nlm_release_host(host); + nlmsvc_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -92,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -134,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -164,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_cancel_blocked(file, &argp->lock); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -197,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_unlock(file, &argp->lock); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -334,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_share_file(host, file, argp); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -367,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_unshare_file(host, file, argp); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -399,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, return rpc_success; nlmsvc_free_host_resources(host); - nlm_release_host(host); + nlmsvc_release_host(host); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0df65ec29e43..d27aab11f324 100644 --- 
a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -80,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - nlm_release_host(host); + nlmsvc_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -122,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST status %d vers %d\n", ntohl(resp->status), rqstp->rq_vers); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -164,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -194,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -227,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -261,7 +261,7 @@ void nlmsvc_release_call(struct nlm_rqst *call) { if (!atomic_dec_and_test(&call->a_count)) return; - nlm_release_host(call->a_host); + nlmsvc_release_host(call->a_host); kfree(call); } @@ -374,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_share_file(host, file, argp)); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -407,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -439,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, return rpc_success; nlmsvc_free_host_resources(host); - nlm_release_host(host); + nlmsvc_release_host(host); return rpc_success; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 6c2a0e2f298e..ff9abff55aa0 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -227,10 +227,10 @@ void nlmclnt_release_host(struct nlm_host *); struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, const size_t hostname_len); +void nlmsvc_release_host(struct nlm_host *); struct rpc_clnt * nlm_bind_host(struct nlm_host *); void nlm_rebind_host(struct nlm_host *); struct nlm_host * nlm_get_host(struct nlm_host *); -void nlm_release_host(struct nlm_host *); void nlm_shutdown_hosts(void); void nlm_host_rebooted(const struct nlm_reboot *); -- cgit v1.2.3-71-gd317 From 7c96aef75949a56ec427fc6a2522dace2af33605 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Nov 2010 11:27:01 +1100 Subject: sunrpc: remove xpt_pool The xpt_pool field is only used for reporting BUGs. And it isn't used correctly. 
In particular, when it is cleared in svc_xprt_received before XPT_BUSY is cleared, there is no guarantee that either the compiler or the CPU might not re-order to two assignments, just setting xpt_pool to NULL after XPT_BUSY is cleared. If a different cpu were running svc_xprt_enqueue at this moment, it might see XPT_BUSY clear and then xpt_pool non-NULL, and so BUG. This could be fixed by calling smp_mb__before_clear_bit() before the clear_bit. However as xpt_pool isn't really used, it seems safest to simply remove xpt_pool. Another alternate would be to change the clear_bit to clear_bit_unlock, and the test_and_set_bit to test_and_set_bit_lock. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 - net/sunrpc/svc_xprt.c | 5 ----- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index aea0d438e3c7..c8f81da15c7e 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -63,7 +63,6 @@ struct svc_xprt { #define XPT_LISTENER 11 /* listening endpoint */ #define XPT_CACHE_AUTH 12 /* cache auth info */ - struct svc_pool *xpt_pool; /* current pool iff queued */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ struct mutex xpt_mutex; /* to serialize sending data */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5a75d23645c8..5eae53b1e306 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -351,8 +351,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) dprintk("svc: transport %p busy, not enqueued\n", xprt); goto out_unlock; } - BUG_ON(xprt->xpt_pool != NULL); - xprt->xpt_pool = pool; if (!list_empty(&pool->sp_threads)) { rqstp = list_entry(pool->sp_threads.next, @@ -370,13 +368,11 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); pool->sp_stats.threads_woken++; - BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: transport %p put into queue\n", xprt); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); pool->sp_stats.sockets_queued++; - BUG_ON(xprt->xpt_pool != pool); } out_unlock: @@ -415,7 +411,6 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) void svc_xprt_received(struct svc_xprt *xprt) { BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); - xprt->xpt_pool = NULL; /* As soon as we clear busy, the xprt could be closed and * 'put', so we need a reference to call svc_xprt_enqueue with: */ -- cgit v1.2.3-71-gd317 From 04f4ad16b231abbfde34c762697ad035a3af0b5f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Dec 2010 09:51:13 -0500 Subject: nfsd4: implement secinfo_no_name Implementation of this operation is mandatory for NFSv4.1. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 27 +++++++++++++++++++++++++++ fs/nfsd/nfs4xdr.c | 15 +++++++++++++-- fs/nfsd/xdr4.h | 5 +++++ include/linux/nfs4.h | 3 +++ 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 095431a35722..f80c3997d24c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -779,6 +779,29 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return err; } +static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo_no_name *sin) +{ + __be32 err; + + switch (sin->sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + exp_get(cstate->current_fh.fh_export); + sin->sin_exp = cstate->current_fh.fh_export; + fh_put(&cstate->current_fh); + return nfs_ok; +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) @@ -1327,6 +1350,10 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_RECLAIM_COMPLETE", }, + [OP_SECINFO_NO_NAME] = { + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_name = "OP_SECINFO_NO_NAME", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 71d7d339e44a..b543b2410b54 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -846,6 +846,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, DECODE_TAIL; } +static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(sin->sin_style); + DECODE_TAIL; +} + static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { @@ -1358,7 +1369,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -3162,7 +3173,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 60fce3dc5cb5..799c30c3b495 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -311,6 +311,11 @@ struct nfsd4_secinfo { struct svc_export *si_exp; /* response */ }; +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + struct nfsd4_setattr { stateid_t sa_stateid; /* request */ u32 sa_bmval[3]; /* request */ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 4925b22219d2..26afa3021ed6 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -136,6 +136,9 @@ #define 
SEQ4_STATUS_CB_PATH_DOWN_SESSION 0x00000200 #define SEQ4_STATUS_BACKCHANNEL_FAULT 0x00000400 +#define NFS4_SECINFO_STYLE4_CURRENT_FH 0 +#define NFS4_SECINFO_STYLE4_PARENT 1 + #define NFS4_MAX_UINT64 (~(u64)0) /* An NFS4 sessions server must support at least NFS4_MAX_OPS operations. -- cgit v1.2.3-71-gd317 From c66ae9bb4dcaac78cc5e30d0ce7ff2bf3dcb48d9 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Mon, 13 Dec 2010 12:26:21 +0200 Subject: s3c_adc_battery: Add gpio_inverted field to pdata Add support for inverted gpio_charge_finished values. This change is necessary for H1940 support. Signed-off-by: Vasily Khoruzhick Signed-off-by: Anton Vorontsov --- drivers/power/s3c_adc_battery.c | 12 +++++++++--- include/linux/s3c_adc_battery.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/s3c_adc_battery.c b/drivers/power/s3c_adc_battery.c index fe16b482e912..7bc5bfe55ce9 100644 --- a/drivers/power/s3c_adc_battery.c +++ b/drivers/power/s3c_adc_battery.c @@ -112,6 +112,13 @@ static int calc_full_volt(int volt_val, int cur_val, int impedance) return volt_val + cur_val * impedance / 1000; } +static int charge_finished(struct s3c_adc_bat *bat) +{ + return bat->pdata->gpio_inverted ? + !gpio_get_value(bat->pdata->gpio_charge_finished) : + gpio_get_value(bat->pdata->gpio_charge_finished); +} + static int s3c_adc_bat_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) @@ -140,7 +147,7 @@ static int s3c_adc_bat_get_property(struct power_supply *psy, if (bat->cable_plugged && ((bat->pdata->gpio_charge_finished < 0) || - !gpio_get_value(bat->pdata->gpio_charge_finished))) { + !charge_finished(bat))) { lut = bat->pdata->lut_acin; lut_size = bat->pdata->lut_acin_cnt; } @@ -236,8 +243,7 @@ static void s3c_adc_bat_work(struct work_struct *work) } } else { if ((bat->pdata->gpio_charge_finished >= 0) && is_plugged) { - is_charged = gpio_get_value( - main_bat.pdata->gpio_charge_finished); + is_charged = charge_finished(&main_bat); if (is_charged) { if (bat->pdata->disable_charger) bat->pdata->disable_charger(); diff --git a/include/linux/s3c_adc_battery.h b/include/linux/s3c_adc_battery.h index dbce22faa660..fbe58b7e63eb 100644 --- a/include/linux/s3c_adc_battery.h +++ b/include/linux/s3c_adc_battery.h @@ -14,6 +14,7 @@ struct s3c_adc_bat_pdata { void (*disable_charger)(void); int gpio_charge_finished; + int gpio_inverted; const struct s3c_adc_bat_thresh *lut_noac; unsigned int lut_noac_cnt; -- cgit v1.2.3-71-gd317 From 00aaaef9a51a1a25c5d6d52ce510772f149a0eb0 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Nov 2010 15:46:54 +0800 Subject: PCI: MSI: Move MSI-X entry definition to pci_regs.h Then it can be used by others. 
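As a sketch of what "used by others" enables (illustrative only; the helper below is invented, while the offsets are exactly the PCI_MSIX_ENTRY_* constants this patch moves into pci_regs.h):

/*
 * Hedged sketch: compute the address of the vector-control register of
 * MSI-X table entry 'nr', given an ioremap()'d table base.
 * msix_entry_ctrl() is a hypothetical helper.
 */
static void __iomem *msix_entry_ctrl(void __iomem *table_base, unsigned int nr)
{
	return table_base + nr * PCI_MSIX_ENTRY_SIZE
			  + PCI_MSIX_ENTRY_VECTOR_CTRL;
}

/* e.g. u32 ctrl = readl(msix_entry_ctrl(base, 0)); masking the low bit of
 * that register is what the following patch names PCI_MSIX_ENTRY_CTRL_MASKBIT. */
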
Reviewed-by: Hidetoshi Seto Reviewed-by: Matthew Wilcox Signed-off-by: Sheng Yang Signed-off-by: Jesse Barnes --- drivers/pci/msi.h | 6 ------ include/linux/pci_regs.h | 7 +++++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index feff3bee6fe5..65c42f80f23e 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -6,12 +6,6 @@ #ifndef MSI_H #define MSI_H -#define PCI_MSIX_ENTRY_SIZE 16 -#define PCI_MSIX_ENTRY_LOWER_ADDR 0 -#define PCI_MSIX_ENTRY_UPPER_ADDR 4 -#define PCI_MSIX_ENTRY_DATA 8 -#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 - #define msi_control_reg(base) (base + PCI_MSI_FLAGS) #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index af83076c31a6..b21d33e4bc2a 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -309,6 +309,13 @@ #define PCI_MSIX_PBA 8 #define PCI_MSIX_FLAGS_BIRMASK (7 << 0) +/* MSI-X entry's format */ +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_LOWER_ADDR 0 +#define PCI_MSIX_ENTRY_UPPER_ADDR 4 +#define PCI_MSIX_ENTRY_DATA 8 +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 + /* CompactPCI Hotswap Register */ #define PCI_CHSWP_CSR 2 /* Control and Status Register */ -- cgit v1.2.3-71-gd317 From 8d805286968811223cca002134ba3d81244d5313 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Nov 2010 15:46:55 +0800 Subject: PCI: Add mask bit definition for MSI-X table Then we can use it instead of magic number 1. Reviewed-by: Hidetoshi Seto Cc: Matthew Wilcox Signed-off-by: Sheng Yang Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 5 +++-- include/linux/pci_regs.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 7c24dcef2989..44b0aeee83e5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -168,8 +168,9 @@ static u32 __msix_mask_irq(struct msi_desc *desc, u32 flag) u32 mask_bits = desc->masked; unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL; - mask_bits &= ~1; - mask_bits |= flag; + mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + if (flag) + mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; writel(mask_bits, desc->mask_base + offset); return mask_bits; diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index b21d33e4bc2a..d4f2c80a6c3e 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -315,6 +315,7 @@ #define PCI_MSIX_ENTRY_UPPER_ADDR 4 #define PCI_MSIX_ENTRY_DATA 8 #define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 /* CompactPCI Hotswap Register */ -- cgit v1.2.3-71-gd317 From 2f671e2dbff6eb5ef4e2600adbec550c13b8fe72 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 6 Dec 2010 14:00:56 -0500 Subject: PCI: Disable ASPM if BIOS asks us to We currently refuse to touch the ASPM registers if the BIOS tells us that ASPM isn't supported. This can cause problems if the BIOS has (for any reason) enabled ASPM on some devices anyway. Change the code such that we explicitly clear ASPM if the FADT indicates that ASPM isn't supported, and make sure we tidy up appropriately on device removal in order to deal with the hotplug case. If ASPM is disabled because the BIOS doesn't hand over control then we won't touch the registers. 
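Condensed for illustration only (this is not literal kernel code; it paraphrases the hunks below, with the flag semantics taken from the statics in drivers/pci/pcie/aspm.c), the resulting flow when the firmware opts out of ASPM looks roughly like this:

    /* In acpi_pci_init(), when the FADT declares no ASPM support: */
    if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) {
            pcie_clear_aspm();      /* arm clearing of BIOS-enabled ASPM,
                                     * unless "pcie_aspm=force" was given */
            pcie_no_aspm();         /* and keep ASPM disabled from then on */
    }

The changes to pcie_aspm_init_link_state() and pcie_aspm_exit_link_state() below then allow that clearing to proceed even though aspm_disabled is set.
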
Signed-off-by: Matthew Garrett Signed-off-by: Jesse Barnes --- drivers/pci/pci-acpi.c | 1 + drivers/pci/pcie/aspm.c | 21 +++++++++++++++++---- include/linux/pci-aspm.h | 5 ++++- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 24e19c594e57..d7ea699b8ddc 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -399,6 +399,7 @@ static int __init acpi_pci_init(void) if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) { printk(KERN_INFO"ACPI FADT declares the system doesn't support PCIe ASPM, so disable it\n"); + pcie_clear_aspm(); pcie_no_aspm(); } diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 71222814c1ec..3188cd96b338 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -68,7 +68,7 @@ struct pcie_link_state { struct aspm_latency acceptable[8]; }; -static int aspm_disabled, aspm_force; +static int aspm_disabled, aspm_force, aspm_clear_state; static DEFINE_MUTEX(aspm_lock); static LIST_HEAD(link_list); @@ -139,7 +139,7 @@ static void pcie_set_clkpm(struct pcie_link_state *link, int enable) { /* Don't enable Clock PM if the link is not Clock PM capable */ if (!link->clkpm_capable && enable) - return; + enable = 0; /* Need nothing if the specified equals to current state */ if (link->clkpm_enabled == enable) return; @@ -498,6 +498,10 @@ static int pcie_aspm_sanity_check(struct pci_dev *pdev) struct pci_dev *child; int pos; u32 reg32; + + if (aspm_clear_state) + return -EINVAL; + /* * Some functions in a slot might not all be PCIe functions, * very strange. Disable ASPM for the whole slot @@ -563,12 +567,15 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev) struct pcie_link_state *link; int blacklist = !!pcie_aspm_sanity_check(pdev); - if (aspm_disabled || !pci_is_pcie(pdev) || pdev->link_state) + if (!pci_is_pcie(pdev) || pdev->link_state) return; if (pdev->pcie_type != PCI_EXP_TYPE_ROOT_PORT && pdev->pcie_type != PCI_EXP_TYPE_DOWNSTREAM) return; + if (aspm_disabled && !aspm_clear_state) + return; + /* VIA has a strange chipset, root port is under a bridge */ if (pdev->pcie_type == PCI_EXP_TYPE_ROOT_PORT && pdev->bus->self) @@ -641,7 +648,7 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev) struct pci_dev *parent = pdev->bus->self; struct pcie_link_state *link, *root, *parent_link; - if (aspm_disabled || !pci_is_pcie(pdev) || + if ((aspm_disabled && !aspm_clear_state) || !pci_is_pcie(pdev) || !parent || !parent->link_state) return; if ((parent->pcie_type != PCI_EXP_TYPE_ROOT_PORT) && @@ -899,6 +906,12 @@ static int __init pcie_aspm_disable(char *str) __setup("pcie_aspm=", pcie_aspm_disable); +void pcie_clear_aspm(void) +{ + if (!aspm_force) + aspm_clear_state = 1; +} + void pcie_no_aspm(void) { if (!aspm_force) diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h index 91ba0b338b47..ce6810512c66 100644 --- a/include/linux/pci-aspm.h +++ b/include/linux/pci-aspm.h @@ -27,6 +27,7 @@ extern void pcie_aspm_init_link_state(struct pci_dev *pdev); extern void pcie_aspm_exit_link_state(struct pci_dev *pdev); extern void pcie_aspm_pm_state_change(struct pci_dev *pdev); extern void pci_disable_link_state(struct pci_dev *pdev, int state); +extern void pcie_clear_aspm(void); extern void pcie_no_aspm(void); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) @@ -41,7 +42,9 @@ static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) static inline void pci_disable_link_state(struct pci_dev *pdev, int state) { } - 
+static inline void pcie_clear_aspm(void) +{ +} static inline void pcie_no_aspm(void) { } -- cgit v1.2.3-71-gd317 From 1d3c16a818e992c199844954d95c17fd7ce6cbba Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Tue, 30 Nov 2010 17:43:26 -0600 Subject: PCI: make pci_restore_state return void pci_restore_state only ever returns 0, thus there is no benefit in having it return any value. Also, a large majority of the callers do not check the return code of pci_restore_state. Make the pci_restore_state a void return and avoid the overhead. Acked-by: Mauro Carvalho Chehab Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- drivers/media/video/cafe_ccic.c | 4 +--- drivers/net/myri10ge/myri10ge.c | 4 +--- drivers/net/sfc/falcon.c | 25 +++++-------------------- drivers/net/skge.c | 4 +--- drivers/net/sky2.c | 5 +---- drivers/net/wireless/rt2x00/rt2x00pci.c | 4 ++-- drivers/pci/pci-driver.c | 3 ++- drivers/pci/pci.c | 7 ++----- drivers/scsi/ipr.c | 8 +------- drivers/scsi/pmcraid.c | 7 +------ drivers/staging/sm7xx/smtcfb.c | 2 +- include/linux/pci.h | 8 +++----- sound/pci/cs5535audio/cs5535audio_pm.c | 7 +------ 13 files changed, 22 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/cafe_ccic.c b/drivers/media/video/cafe_ccic.c index 0dfff50891e4..737bb877e962 100644 --- a/drivers/media/video/cafe_ccic.c +++ b/drivers/media/video/cafe_ccic.c @@ -2186,9 +2186,7 @@ static int cafe_pci_resume(struct pci_dev *pdev) struct cafe_camera *cam = to_cam(v4l2_dev); int ret = 0; - ret = pci_restore_state(pdev); - if (ret) - return ret; + pci_restore_state(pdev); ret = pci_enable_device(pdev); if (ret) { diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 8524cc40ec57..d3c4a374a92e 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -3403,9 +3403,7 @@ static int myri10ge_resume(struct pci_dev *pdev) return -EIO; } - status = pci_restore_state(pdev); - if (status) - return status; + pci_restore_state(pdev); status = pci_enable_device(pdev); if (status) { diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c index 267019bb2b15..1763b9a7fd8e 100644 --- a/drivers/net/sfc/falcon.c +++ b/drivers/net/sfc/falcon.c @@ -1066,22 +1066,9 @@ static int falcon_reset_hw(struct efx_nic *efx, enum reset_type method) /* Restore PCI configuration if needed */ if (method == RESET_TYPE_WORLD) { - if (efx_nic_is_dual_func(efx)) { - rc = pci_restore_state(nic_data->pci_dev2); - if (rc) { - netif_err(efx, drv, efx->net_dev, - "failed to restore PCI config for " - "the secondary function\n"); - goto fail3; - } - } - rc = pci_restore_state(efx->pci_dev); - if (rc) { - netif_err(efx, drv, efx->net_dev, - "failed to restore PCI config for the " - "primary function\n"); - goto fail4; - } + if (efx_nic_is_dual_func(efx)) + pci_restore_state(nic_data->pci_dev2); + pci_restore_state(efx->pci_dev); netif_dbg(efx, drv, efx->net_dev, "successfully restored PCI config\n"); } @@ -1092,7 +1079,7 @@ static int falcon_reset_hw(struct efx_nic *efx, enum reset_type method) rc = -ETIMEDOUT; netif_err(efx, hw, efx->net_dev, "timed out waiting for hardware reset\n"); - goto fail5; + goto fail3; } netif_dbg(efx, hw, efx->net_dev, "hardware reset complete\n"); @@ -1100,11 +1087,9 @@ static int falcon_reset_hw(struct efx_nic *efx, enum reset_type method) /* pci_save_state() and pci_restore_state() MUST be called in pairs */ fail2: -fail3: pci_restore_state(efx->pci_dev); fail1: -fail4: -fail5: +fail3: return rc; } diff --git 
a/drivers/net/skge.c b/drivers/net/skge.c index 220e0398f1d5..61553af38e65 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -4087,9 +4087,7 @@ static int skge_resume(struct pci_dev *pdev) if (err) goto out; - err = pci_restore_state(pdev); - if (err) - goto out; + pci_restore_state(pdev); err = skge_reset(hw); if (err) diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index d6577084ce70..be3aee782760 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -4969,10 +4969,7 @@ static int sky2_resume(struct pci_dev *pdev) if (err) goto out; - err = pci_restore_state(pdev); - if (err) - goto out; - + pci_restore_state(pdev); pci_enable_wake(pdev, PCI_D0, 0); /* Re-enable all clocks */ diff --git a/drivers/net/wireless/rt2x00/rt2x00pci.c b/drivers/net/wireless/rt2x00/rt2x00pci.c index 2449d785cf8d..4fd4c33de6ae 100644 --- a/drivers/net/wireless/rt2x00/rt2x00pci.c +++ b/drivers/net/wireless/rt2x00/rt2x00pci.c @@ -356,12 +356,12 @@ int rt2x00pci_resume(struct pci_dev *pci_dev) struct rt2x00_dev *rt2x00dev = hw->priv; if (pci_set_power_state(pci_dev, PCI_D0) || - pci_enable_device(pci_dev) || - pci_restore_state(pci_dev)) { + pci_enable_device(pci_dev)) { ERROR(rt2x00dev, "Failed to resume device.\n"); return -EIO; } + pci_restore_state(pci_dev); return rt2x00lib_resume(rt2x00dev); } EXPORT_SYMBOL_GPL(rt2x00pci_resume); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 8a6f797de8e5..80e551ee640b 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -449,7 +449,8 @@ static int pci_restore_standard_config(struct pci_dev *pci_dev) return error; } - return pci_restore_state(pci_dev); + pci_restore_state(pci_dev); + return 0; } static void pci_pm_default_resume_early(struct pci_dev *pci_dev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 710c8a29be0d..6762dcae90ab 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -937,14 +937,13 @@ pci_save_state(struct pci_dev *dev) * pci_restore_state - Restore the saved state of a PCI device * @dev: - PCI device that we're dealing with */ -int -pci_restore_state(struct pci_dev *dev) +void pci_restore_state(struct pci_dev *dev) { int i; u32 val; if (!dev->state_saved) - return 0; + return; /* PCI Express register must be restored first */ pci_restore_pcie_state(dev); @@ -968,8 +967,6 @@ pci_restore_state(struct pci_dev *dev) pci_restore_iov_state(dev); dev->state_saved = false; - - return 0; } static int do_pci_enable_device(struct pci_dev *dev, int bars) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 5bbaee597e88..524d586c3147 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -7487,16 +7487,10 @@ static int ipr_reset_restore_cfg_space(struct ipr_cmnd *ipr_cmd) { struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg; volatile u32 int_reg; - int rc; ENTER; ioa_cfg->pdev->state_saved = true; - rc = pci_restore_state(ioa_cfg->pdev); - - if (rc != PCIBIOS_SUCCESSFUL) { - ipr_cmd->s.ioasa.hdr.ioasc = cpu_to_be32(IPR_IOASC_PCI_ACCESS_ERROR); - return IPR_RC_JOB_CONTINUE; - } + pci_restore_state(ioa_cfg->pdev); if (ipr_set_pcix_cmd_reg(ioa_cfg)) { ipr_cmd->s.ioasa.hdr.ioasc = cpu_to_be32(IPR_IOASC_PCI_ACCESS_ERROR); diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 300d59f389da..321cf3ae8630 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -2228,12 +2228,7 @@ static void pmcraid_ioa_reset(struct pmcraid_cmd *cmd) /* Once either bist or pci reset is done, restore PCI config * space. 
If this fails, proceed with hard reset again */ - if (pci_restore_state(pinstance->pdev)) { - pmcraid_info("config-space error resetting again\n"); - pinstance->ioa_state = IOA_STATE_IN_RESET_ALERT; - pmcraid_reset_alert(cmd); - break; - } + pci_restore_state(pinstance->pdev); /* fail all pending commands */ pmcraid_fail_outstanding_cmds(pinstance); diff --git a/drivers/staging/sm7xx/smtcfb.c b/drivers/staging/sm7xx/smtcfb.c index 24f47d6388f4..7162dee3b0d8 100644 --- a/drivers/staging/sm7xx/smtcfb.c +++ b/drivers/staging/sm7xx/smtcfb.c @@ -1071,7 +1071,7 @@ static int __maybe_unused smtcfb_resume(struct pci_dev *pdev) /* when resuming, restore pci data and fb cursor */ if (pdev->dev.power.power_state.event != PM_EVENT_FREEZE) { retv = pci_set_power_state(pdev, PCI_D0); - retv = pci_restore_state(pdev); + pci_restore_state(pdev); if (pci_enable_device(pdev)) return -1; pci_set_master(pdev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 7454408c41b6..63cbadce337e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -806,7 +806,7 @@ size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size); /* Power management related routines */ int pci_save_state(struct pci_dev *dev); -int pci_restore_state(struct pci_dev *dev); +void pci_restore_state(struct pci_dev *dev); int __pci_complete_power_transition(struct pci_dev *dev, pci_power_t state); int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); @@ -1168,10 +1168,8 @@ static inline int pci_save_state(struct pci_dev *dev) return 0; } -static inline int pci_restore_state(struct pci_dev *dev) -{ - return 0; -} +static inline void pci_restore_state(struct pci_dev *dev) +{ } static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { diff --git a/sound/pci/cs5535audio/cs5535audio_pm.c b/sound/pci/cs5535audio/cs5535audio_pm.c index a3301cc4ab82..185b00088320 100644 --- a/sound/pci/cs5535audio/cs5535audio_pm.c +++ b/sound/pci/cs5535audio/cs5535audio_pm.c @@ -90,12 +90,7 @@ int snd_cs5535audio_resume(struct pci_dev *pci) int i; pci_set_power_state(pci, PCI_D0); - if (pci_restore_state(pci) < 0) { - printk(KERN_ERR "cs5535audio: pci_restore_state failed, " - "disabling device\n"); - snd_card_disconnect(card); - return -EIO; - } + pci_restore_state(pci); if (pci_enable_device(pci) < 0) { printk(KERN_ERR "cs5535audio: pci_enable_device failed, " "disabling device\n"); -- cgit v1.2.3-71-gd317 From 9b444b36fee16d2aaae9cc91ce594ecb15d922a9 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Wed, 17 Nov 2010 12:12:08 -0700 Subject: x86/PCI: irq and pci_ids patch for Intel Patsburg This patch adds an additional LPC Controller DeviceID for the Intel Patsburg PCH. 
Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 3 ++- include/linux/pci_ids.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9f9bfb705cf9..87e6c8323117 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index cb845c16ad7d..d830106827fd 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2468,7 +2468,8 @@ #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN 0x1c41 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX 0x1c5f #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS 0x1d22 -#define PCI_DEVICE_ID_INTEL_PATSBURG_LPC 0x1d40 +#define PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0 0x1d40 +#define PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1 0x1d41 #define PCI_DEVICE_ID_INTEL_82801AA_0 0x2410 #define PCI_DEVICE_ID_INTEL_82801AA_1 0x2411 #define PCI_DEVICE_ID_INTEL_82801AA_3 0x2413 -- cgit v1.2.3-71-gd317 From fe31e69740eddc7316071ed5165fed6703c8cd12 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 19 Dec 2010 15:57:16 +0100 Subject: PCI/PCIe: Clear Root PME Status bits early during system resume I noticed that PCI Express PMEs don't work on my Toshiba Portege R500 after the system has been woken up from a sleep state by a PME (through Wake-on-LAN). After some investigation it turned out that the BIOS didn't clear the Root PME Status bit in the root port that received the wakeup PME and since the Requester ID was also set in the port's Root Status register, any subsequent PMEs didn't trigger interrupts. This problem can be avoided by clearing the Root PME Status bits in all PCI Express root ports during early resume. For this purpose, add an early resume routine to the PCIe port driver and make this driver be always registered, even if pci_ports_disable is set (in which case the driver's only function is to provide the early resume callback). Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/pme.c | 27 ++++----------------------- drivers/pci/pcie/portdrv.h | 2 ++ drivers/pci/pcie/portdrv_core.c | 25 ++++++++++++++----------- drivers/pci/pcie/portdrv_pci.c | 37 +++++++++++++++++++++++++++++++++---- include/linux/pci_regs.h | 2 ++ 5 files changed, 55 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index 2f3c90407227..073f0308c6b2 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -26,9 +26,6 @@ #include "../pci.h" #include "portdrv.h" -#define PCI_EXP_RTSTA_PME 0x10000 /* PME status */ -#define PCI_EXP_RTSTA_PENDING 0x20000 /* PME pending */ - /* * If this switch is set, MSI will not be used for PCIe PME signaling. This * causes the PCIe port driver to use INTx interrupts only, but it turns out @@ -73,22 +70,6 @@ void pcie_pme_interrupt_enable(struct pci_dev *dev, bool enable) pci_write_config_word(dev, rtctl_pos, rtctl); } -/** - * pcie_pme_clear_status - Clear root port PME interrupt status. - * @dev: PCIe root port or event collector. 
- */ -static void pcie_pme_clear_status(struct pci_dev *dev) -{ - int rtsta_pos; - u32 rtsta; - - rtsta_pos = pci_pcie_cap(dev) + PCI_EXP_RTSTA; - - pci_read_config_dword(dev, rtsta_pos, &rtsta); - rtsta |= PCI_EXP_RTSTA_PME; - pci_write_config_dword(dev, rtsta_pos, rtsta); -} - /** * pcie_pme_walk_bus - Scan a PCI bus for devices asserting PME#. * @bus: PCI bus to scan. @@ -253,7 +234,7 @@ static void pcie_pme_work_fn(struct work_struct *work) * Clear PME status of the port. If there are other * pending PMEs, the status will be set again. */ - pcie_pme_clear_status(port); + pcie_clear_root_pme_status(port); spin_unlock_irq(&data->lock); pcie_pme_handle_request(port, rtsta & 0xffff); @@ -378,7 +359,7 @@ static int pcie_pme_probe(struct pcie_device *srv) port = srv->port; pcie_pme_interrupt_enable(port, false); - pcie_pme_clear_status(port); + pcie_clear_root_pme_status(port); ret = request_irq(srv->irq, pcie_pme_irq, IRQF_SHARED, "PCIe PME", srv); if (ret) { @@ -402,7 +383,7 @@ static int pcie_pme_suspend(struct pcie_device *srv) spin_lock_irq(&data->lock); pcie_pme_interrupt_enable(port, false); - pcie_pme_clear_status(port); + pcie_clear_root_pme_status(port); data->noirq = true; spin_unlock_irq(&data->lock); @@ -422,7 +403,7 @@ static int pcie_pme_resume(struct pcie_device *srv) spin_lock_irq(&data->lock); data->noirq = false; - pcie_pme_clear_status(port); + pcie_clear_root_pme_status(port); pcie_pme_interrupt_enable(port, true); spin_unlock_irq(&data->lock); diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 7b5aba0a3291..8fcc03598b29 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -35,6 +35,8 @@ extern void pcie_port_bus_unregister(void); struct pci_dev; +extern void pcie_clear_root_pme_status(struct pci_dev *dev); + #ifdef CONFIG_PCIE_PME extern bool pcie_pme_msi_disabled; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index a9c222d79ebc..5130d0d22390 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -241,17 +241,17 @@ static int get_port_device_capability(struct pci_dev *dev) int cap_mask; int err; + if (pcie_ports_disabled) + return 0; + err = pcie_port_platform_notify(dev, &cap_mask); - if (pcie_ports_auto) { - if (err) { - pcie_no_aspm(); - return 0; - } - } else { + if (!pcie_ports_auto) { cap_mask = PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP | PCIE_PORT_SERVICE_VC; if (pci_aer_available()) cap_mask |= PCIE_PORT_SERVICE_AER; + } else if (err) { + return 0; } pos = pci_pcie_cap(dev); @@ -349,15 +349,18 @@ int pcie_port_device_register(struct pci_dev *dev) int status, capabilities, i, nr_service; int irqs[PCIE_PORT_DEVICE_MAXSERVICES]; - /* Get and check PCI Express port services */ - capabilities = get_port_device_capability(dev); - if (!capabilities) - return -ENODEV; - /* Enable PCI Express port device */ status = pci_enable_device(dev); if (status) return status; + + /* Get and check PCI Express port services */ + capabilities = get_port_device_capability(dev); + if (!capabilities) { + pcie_no_aspm(); + return 0; + } + pci_set_master(dev); /* * Initialize service irqs. Don't use service devices that diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index f9033e190fb6..e0610bda1dea 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -57,6 +57,22 @@ __setup("pcie_ports=", pcie_port_setup); /* global data */ +/** + * pcie_clear_root_pme_status - Clear root port PME interrupt status. 
+ * @dev: PCIe root port or event collector. + */ +void pcie_clear_root_pme_status(struct pci_dev *dev) +{ + int rtsta_pos; + u32 rtsta; + + rtsta_pos = pci_pcie_cap(dev) + PCI_EXP_RTSTA; + + pci_read_config_dword(dev, rtsta_pos, &rtsta); + rtsta |= PCI_EXP_RTSTA_PME; + pci_write_config_dword(dev, rtsta_pos, rtsta); +} + static int pcie_portdrv_restore_config(struct pci_dev *dev) { int retval; @@ -69,6 +85,20 @@ static int pcie_portdrv_restore_config(struct pci_dev *dev) } #ifdef CONFIG_PM +static int pcie_port_resume_noirq(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + /* + * Some BIOSes forget to clear Root PME Status bits after system wakeup + * which breaks ACPI-based runtime wakeup on PCI Express, so clear those + * bits now just in case (shouldn't hurt). + */ + if(pdev->pcie_type == PCI_EXP_TYPE_ROOT_PORT) + pcie_clear_root_pme_status(pdev); + return 0; +} + static const struct dev_pm_ops pcie_portdrv_pm_ops = { .suspend = pcie_port_device_suspend, .resume = pcie_port_device_resume, @@ -76,6 +106,7 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = { .thaw = pcie_port_device_resume, .poweroff = pcie_port_device_suspend, .restore = pcie_port_device_resume, + .resume_noirq = pcie_port_resume_noirq, }; #define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops) @@ -327,10 +358,8 @@ static int __init pcie_portdrv_init(void) { int retval; - if (pcie_ports_disabled) { - pcie_no_aspm(); - return -EACCES; - } + if (pcie_ports_disabled) + return pci_register_driver(&pcie_portdriver); dmi_check_system(pcie_portdrv_dmi_table); diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index d4f2c80a6c3e..5b7e6b1ba54f 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -504,6 +504,8 @@ #define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ #define PCI_EXP_RTCAP 30 /* Root Capabilities */ #define PCI_EXP_RTSTA 32 /* Root Status */ +#define PCI_EXP_RTSTA_PME 0x10000 /* PME status */ +#define PCI_EXP_RTSTA_PENDING 0x20000 /* PME pending */ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ #define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ -- cgit v1.2.3-71-gd317 From 0131d8973c8b9bd9d40fee8fae24eab24821efdb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 1 Dec 2010 10:54:46 +0100 Subject: of/address: use proper endianess in get_flags This patch changes u32 to __be32 for all "ranges", "prop" and "addr" and such. Those variables are pointing to the device tree which contains integers in big endian format. Most functions are doing it right because of_read_number() is doing the right thing for them. of_bus_isa_get_flags(), of_bus_pci_get_flags() and of_bus_isa_map() were accessing the data directly and were doing it wrong. 
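For illustration only (a hypothetical helper, not part of the patch): device-tree cells are stored big-endian, so a consumer of the now-__be32 pointers converts on every read instead of dereferencing the raw data, e.g.:

    #include <linux/of_address.h>

    /* Hypothetical example: fetch the first address cell of a node's first
     * address entry and convert it to host endianness. */
    static u32 example_first_addr_cell(struct device_node *np)
    {
            u64 size;
            unsigned int flags;
            const __be32 *addr = of_get_address(np, 0, &size, &flags);

            if (!addr)
                    return 0;
            return be32_to_cpup(addr);  /* not "addr[0]" on the raw pointer */
    }
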
Signed-off-by: Sebastian Andrzej Siewior Acked-by: Benjamin Herrenschmidt Signed-off-by: Grant Likely --- arch/powerpc/include/asm/prom.h | 2 +- drivers/of/address.c | 54 +++++++++++++++++++++-------------------- include/linux/of_address.h | 6 ++--- 3 files changed, 32 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 98264bf0a433..d72757585595 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -42,7 +42,7 @@ extern void pci_create_OF_bus_map(void); /* Translate a DMA address from device space to CPU space */ extern u64 of_translate_dma_address(struct device_node *dev, - const u32 *in_addr); + const __be32 *in_addr); #ifdef CONFIG_PCI extern unsigned long pci_address_to_pio(phys_addr_t address); diff --git a/drivers/of/address.c b/drivers/of/address.c index 3a1c7e70b192..b4559c58c095 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -12,13 +12,13 @@ (ns) > 0) static struct of_bus *of_match_bus(struct device_node *np); -static int __of_address_to_resource(struct device_node *dev, const u32 *addrp, - u64 size, unsigned int flags, +static int __of_address_to_resource(struct device_node *dev, + const __be32 *addrp, u64 size, unsigned int flags, struct resource *r); /* Debug utility */ #ifdef DEBUG -static void of_dump_addr(const char *s, const u32 *addr, int na) +static void of_dump_addr(const char *s, const __be32 *addr, int na) { printk(KERN_DEBUG "%s", s); while (na--) @@ -26,7 +26,7 @@ static void of_dump_addr(const char *s, const u32 *addr, int na) printk("\n"); } #else -static void of_dump_addr(const char *s, const u32 *addr, int na) { } +static void of_dump_addr(const char *s, const __be32 *addr, int na) { } #endif /* Callbacks for bus specific translators */ @@ -36,10 +36,10 @@ struct of_bus { int (*match)(struct device_node *parent); void (*count_cells)(struct device_node *child, int *addrc, int *sizec); - u64 (*map)(u32 *addr, const u32 *range, + u64 (*map)(u32 *addr, const __be32 *range, int na, int ns, int pna); int (*translate)(u32 *addr, u64 offset, int na); - unsigned int (*get_flags)(const u32 *addr); + unsigned int (*get_flags)(const __be32 *addr); }; /* @@ -55,7 +55,7 @@ static void of_bus_default_count_cells(struct device_node *dev, *sizec = of_n_size_cells(dev); } -static u64 of_bus_default_map(u32 *addr, const u32 *range, +static u64 of_bus_default_map(u32 *addr, const __be32 *range, int na, int ns, int pna) { u64 cp, s, da; @@ -85,7 +85,7 @@ static int of_bus_default_translate(u32 *addr, u64 offset, int na) return 0; } -static unsigned int of_bus_default_get_flags(const u32 *addr) +static unsigned int of_bus_default_get_flags(const __be32 *addr) { return IORESOURCE_MEM; } @@ -110,10 +110,10 @@ static void of_bus_pci_count_cells(struct device_node *np, *sizec = 2; } -static unsigned int of_bus_pci_get_flags(const u32 *addr) +static unsigned int of_bus_pci_get_flags(const __be32 *addr) { unsigned int flags = 0; - u32 w = addr[0]; + u32 w = be32_to_cpup(addr); switch((w >> 24) & 0x03) { case 0x01: @@ -129,7 +129,8 @@ static unsigned int of_bus_pci_get_flags(const u32 *addr) return flags; } -static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna) +static u64 of_bus_pci_map(u32 *addr, const __be32 *range, int na, int ns, + int pna) { u64 cp, s, da; unsigned int af, rf; @@ -160,7 +161,7 @@ static int of_bus_pci_translate(u32 *addr, u64 offset, int na) return of_bus_default_translate(addr + 1, offset, na - 1); } 
-const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, +const __be32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, unsigned int *flags) { const __be32 *prop; @@ -207,7 +208,7 @@ EXPORT_SYMBOL(of_get_pci_address); int of_pci_address_to_resource(struct device_node *dev, int bar, struct resource *r) { - const u32 *addrp; + const __be32 *addrp; u64 size; unsigned int flags; @@ -237,12 +238,13 @@ static void of_bus_isa_count_cells(struct device_node *child, *sizec = 1; } -static u64 of_bus_isa_map(u32 *addr, const u32 *range, int na, int ns, int pna) +static u64 of_bus_isa_map(u32 *addr, const __be32 *range, int na, int ns, + int pna) { u64 cp, s, da; /* Check address type match */ - if ((addr[0] ^ range[0]) & 0x00000001) + if ((addr[0] ^ range[0]) & cpu_to_be32(1)) return OF_BAD_ADDR; /* Read address values, skipping high cell */ @@ -264,10 +266,10 @@ static int of_bus_isa_translate(u32 *addr, u64 offset, int na) return of_bus_default_translate(addr + 1, offset, na - 1); } -static unsigned int of_bus_isa_get_flags(const u32 *addr) +static unsigned int of_bus_isa_get_flags(const __be32 *addr) { unsigned int flags = 0; - u32 w = addr[0]; + u32 w = be32_to_cpup(addr); if (w & 1) flags |= IORESOURCE_IO; @@ -330,7 +332,7 @@ static int of_translate_one(struct device_node *parent, struct of_bus *bus, struct of_bus *pbus, u32 *addr, int na, int ns, int pna, const char *rprop) { - const u32 *ranges; + const __be32 *ranges; unsigned int rlen; int rone; u64 offset = OF_BAD_ADDR; @@ -398,7 +400,7 @@ static int of_translate_one(struct device_node *parent, struct of_bus *bus, * that can be mapped to a cpu physical address). This is not really specified * that way, but this is traditionally the way IBM at least do things */ -u64 __of_translate_address(struct device_node *dev, const u32 *in_addr, +u64 __of_translate_address(struct device_node *dev, const __be32 *in_addr, const char *rprop) { struct device_node *parent = NULL; @@ -475,22 +477,22 @@ u64 __of_translate_address(struct device_node *dev, const u32 *in_addr, return result; } -u64 of_translate_address(struct device_node *dev, const u32 *in_addr) +u64 of_translate_address(struct device_node *dev, const __be32 *in_addr) { return __of_translate_address(dev, in_addr, "ranges"); } EXPORT_SYMBOL(of_translate_address); -u64 of_translate_dma_address(struct device_node *dev, const u32 *in_addr) +u64 of_translate_dma_address(struct device_node *dev, const __be32 *in_addr) { return __of_translate_address(dev, in_addr, "dma-ranges"); } EXPORT_SYMBOL(of_translate_dma_address); -const u32 *of_get_address(struct device_node *dev, int index, u64 *size, +const __be32 *of_get_address(struct device_node *dev, int index, u64 *size, unsigned int *flags) { - const u32 *prop; + const __be32 *prop; unsigned int psize; struct device_node *parent; struct of_bus *bus; @@ -525,8 +527,8 @@ const u32 *of_get_address(struct device_node *dev, int index, u64 *size, } EXPORT_SYMBOL(of_get_address); -static int __of_address_to_resource(struct device_node *dev, const u32 *addrp, - u64 size, unsigned int flags, +static int __of_address_to_resource(struct device_node *dev, + const __be32 *addrp, u64 size, unsigned int flags, struct resource *r) { u64 taddr; @@ -564,7 +566,7 @@ static int __of_address_to_resource(struct device_node *dev, const u32 *addrp, int of_address_to_resource(struct device_node *dev, int index, struct resource *r) { - const u32 *addrp; + const __be32 *addrp; u64 size; unsigned int flags; diff --git 
a/include/linux/of_address.h b/include/linux/of_address.h index 8aea06f0564c..2feda6ee6140 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -3,7 +3,7 @@ #include #include -extern u64 of_translate_address(struct device_node *np, const u32 *addr); +extern u64 of_translate_address(struct device_node *np, const __be32 *addr); extern int of_address_to_resource(struct device_node *dev, int index, struct resource *r); extern void __iomem *of_iomap(struct device_node *device, int index); @@ -21,7 +21,7 @@ static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } #endif #ifdef CONFIG_PCI -extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no, +extern const __be32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, unsigned int *flags); extern int of_pci_address_to_resource(struct device_node *dev, int bar, struct resource *r); @@ -32,7 +32,7 @@ static inline int of_pci_address_to_resource(struct device_node *dev, int bar, return -ENOSYS; } -static inline const u32 *of_get_pci_address(struct device_node *dev, +static inline const __be32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, unsigned int *flags) { return NULL; -- cgit v1.2.3-71-gd317 From c7b61de5b7b17f0df34dc7d2f8b9576f8bd36fce Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 1 Dec 2010 00:14:42 +0100 Subject: PM / Runtime: Add synchronous runtime interface for interrupt handlers (v3) This patch (as1431c) makes the synchronous runtime-PM interface suitable for use in interrupt handlers. Subsystems can call the new pm_runtime_irq_safe() function to tell the PM core that a device's runtime_suspend and runtime_resume callbacks should be invoked with interrupts disabled and the spinlock held. This permits the pm_runtime_get_sync() and the new pm_runtime_put_sync_suspend() routines to be called from within interrupt handlers. When a device is declared irq-safe in this way, the PM core increments the parent's usage count, so the parent will never be runtime suspended. This prevents difficult situations in which an irq-safe device can't resume because it is forced to wait for its non-irq-safe parent. Signed-off-by: Alan Stern Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.txt | 31 +++++++++++++++++++++++++ drivers/base/power/runtime.c | 47 ++++++++++++++++++++++++++++++-------- include/linux/pm.h | 1 + include/linux/pm_runtime.h | 7 ++++++ 4 files changed, 77 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 41cc7b30d7dd..ffe55ffa540a 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -50,6 +50,15 @@ type's callbacks are not defined) of given device. The bus type, device type and device class callbacks are referred to as subsystem-level callbacks in what follows. +By default, the callbacks are always invoked in process context with interrupts +enabled. However, subsystems can use the pm_runtime_irq_safe() helper function +to tell the PM core that a device's ->runtime_suspend() and ->runtime_resume() +callbacks should be invoked in atomic context with interrupts disabled +(->runtime_idle() is still invoked the default way). This implies that these +callback routines must not block or sleep, but it also means that the +synchronous helper functions listed at the end of Section 4 can be used within +an interrupt handler or in an atomic context. 
+ The subsystem-level suspend callback is _entirely_ _responsible_ for handling the suspend of the device as appropriate, which may, but need not include executing the device driver's own ->runtime_suspend() callback (from the @@ -237,6 +246,10 @@ defined in include/linux/pm.h: Section 8); it may be modified only by the pm_runtime_no_callbacks() helper function + unsigned int irq_safe; + - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks + will be invoked with the spinlock held and interrupts disabled + unsigned int use_autosuspend; - indicates that the device's driver supports delayed autosuspend (see Section 9); it may be modified only by the @@ -344,6 +357,10 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: - decrement the device's usage counter; if the result is 0 then run pm_runtime_idle(dev) and return its result + int pm_runtime_put_sync_suspend(struct device *dev); + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_suspend(dev) and return its result + int pm_runtime_put_sync_autosuspend(struct device *dev); - decrement the device's usage counter; if the result is 0 then run pm_runtime_autosuspend(dev) and return its result @@ -397,6 +414,11 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: PM attributes from /sys/devices/.../power (or prevent them from being added when the device is registered) + void pm_runtime_irq_safe(struct device *dev); + - set the power.irq_safe flag for the device, causing the runtime-PM + suspend and resume callbacks (but not the idle callback) to be invoked + with interrupts disabled + void pm_runtime_mark_last_busy(struct device *dev); - set the power.last_busy field to the current time @@ -438,6 +460,15 @@ pm_runtime_suspended() pm_runtime_mark_last_busy() pm_runtime_autosuspend_expiration() +If pm_runtime_irq_safe() has been called for a device then the following helper +functions may also be used in interrupt context: + +pm_runtime_suspend() +pm_runtime_autosuspend() +pm_runtime_resume() +pm_runtime_get_sync() +pm_runtime_put_sync_suspend() + 5. Run-time PM Initialization, Device Probing and Removal Initially, the run-time PM is disabled for all devices, which means that the diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 02c652be83e7..656493a5e073 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -250,13 +250,16 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev) if (!cb) return -ENOSYS; - spin_unlock_irq(&dev->power.lock); + if (dev->power.irq_safe) { + retval = cb(dev); + } else { + spin_unlock_irq(&dev->power.lock); - retval = cb(dev); + retval = cb(dev); - spin_lock_irq(&dev->power.lock); + spin_lock_irq(&dev->power.lock); + } dev->power.runtime_error = retval; - return retval; } @@ -404,7 +407,7 @@ static int rpm_suspend(struct device *dev, int rpmflags) goto out; } - if (parent && !parent->power.ignore_children) { + if (parent && !parent->power.ignore_children && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_request_idle(parent); @@ -527,10 +530,13 @@ static int rpm_resume(struct device *dev, int rpmflags) if (!parent && dev->parent) { /* - * Increment the parent's resume counter and resume it if - * necessary. + * Increment the parent's usage counter and resume it if + * necessary. Not needed if dev is irq-safe; then the + * parent is permanently resumed. 
*/ parent = dev->parent; + if (dev->power.irq_safe) + goto skip_parent; spin_unlock(&dev->power.lock); pm_runtime_get_noresume(parent); @@ -553,6 +559,7 @@ static int rpm_resume(struct device *dev, int rpmflags) goto out; goto repeat; } + skip_parent: if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ @@ -584,7 +591,7 @@ static int rpm_resume(struct device *dev, int rpmflags) rpm_idle(dev, RPM_ASYNC); out: - if (parent) { + if (parent && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_runtime_put(parent); @@ -1065,7 +1072,6 @@ EXPORT_SYMBOL_GPL(pm_runtime_allow); * Set the power.no_callbacks flag, which tells the PM core that this * device is power-managed through its parent and has no run-time PM * callbacks of its own. The run-time sysfs attributes will be removed. - * */ void pm_runtime_no_callbacks(struct device *dev) { @@ -1077,6 +1083,27 @@ void pm_runtime_no_callbacks(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks); +/** + * pm_runtime_irq_safe - Leave interrupts disabled during callbacks. + * @dev: Device to handle + * + * Set the power.irq_safe flag, which tells the PM core that the + * ->runtime_suspend() and ->runtime_resume() callbacks for this device should + * always be invoked with the spinlock held and interrupts disabled. It also + * causes the parent's usage counter to be permanently incremented, preventing + * the parent from runtime suspending -- otherwise an irq-safe child might have + * to wait for a non-irq-safe parent. + */ +void pm_runtime_irq_safe(struct device *dev) +{ + if (dev->parent) + pm_runtime_get_sync(dev->parent); + spin_lock_irq(&dev->power.lock); + dev->power.irq_safe = 1; + spin_unlock_irq(&dev->power.lock); +} +EXPORT_SYMBOL_GPL(pm_runtime_irq_safe); + /** * update_autosuspend - Handle a change to a device's autosuspend settings. * @dev: Device to handle. @@ -1199,4 +1226,6 @@ void pm_runtime_remove(struct device *dev) /* Change the status back to 'suspended' to match the initial status. 
*/ if (dev->power.runtime_status == RPM_ACTIVE) pm_runtime_set_suspended(dev); + if (dev->power.irq_safe && dev->parent) + pm_runtime_put_sync(dev->parent); } diff --git a/include/linux/pm.h b/include/linux/pm.h index 40f3f45702ba..61f2066e6852 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -486,6 +486,7 @@ struct dev_pm_info { unsigned int run_wake:1; unsigned int runtime_auto:1; unsigned int no_callbacks:1; + unsigned int irq_safe:1; unsigned int use_autosuspend:1; unsigned int timer_autosuspends:1; enum rpm_request request; diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d19f1cca7f74..e9cc049ccb62 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -40,6 +40,7 @@ extern int pm_generic_runtime_idle(struct device *dev); extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); +extern void pm_runtime_irq_safe(struct device *dev); extern void __pm_runtime_use_autosuspend(struct device *dev, bool use); extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev); @@ -124,6 +125,7 @@ static inline int pm_generic_runtime_idle(struct device *dev) { return 0; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } static inline void pm_runtime_no_callbacks(struct device *dev) {} +static inline void pm_runtime_irq_safe(struct device *dev) {} static inline void pm_runtime_mark_last_busy(struct device *dev) {} static inline void __pm_runtime_use_autosuspend(struct device *dev, @@ -196,6 +198,11 @@ static inline int pm_runtime_put_sync(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT); } +static inline int pm_runtime_put_sync_suspend(struct device *dev) +{ + return __pm_runtime_suspend(dev, RPM_GET_PUT); +} + static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO); -- cgit v1.2.3-71-gd317 From a2867e08c8e3bdbc00caf56bc3bdde19ccc058e3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2010 22:58:31 +0100 Subject: PM / Wakeup: Replace pm_check_wakeup_events() with pm_wakeup_pending() To avoid confusion with the meaning and return value of pm_check_wakeup_events() replace it with pm_wakeup_pending() that will work the other way around (ie. return true when system-wide power transition should be aborted). Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/main.c | 2 +- drivers/base/power/wakeup.c | 20 ++++++++++---------- include/linux/suspend.h | 4 ++-- kernel/power/hibernate.c | 4 ++-- kernel/power/process.c | 2 +- kernel/power/suspend.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 4747a1e8b44a..8a5258339ca2 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1056,7 +1056,7 @@ static int dpm_prepare(pm_message_t state) if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) pm_wakeup_event(dev, 0); - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { pm_runtime_put_sync(dev); error = -EBUSY; } else { diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 71c5528e1c35..8ec406d8f548 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -542,26 +542,26 @@ static void pm_wakeup_update_hit_counts(void) } /** - * pm_check_wakeup_events - Check for new wakeup events. + * pm_wakeup_pending - Check if power transition in progress should be aborted. * * Compare the current number of registered wakeup events with its preserved - * value from the past to check if new wakeup events have been registered since - * the old value was stored. Check if the current number of wakeup events being - * processed is zero. + * value from the past and return true if new wakeup events have been registered + * since the old value was stored. Also return true if the current number of + * wakeup events being processed is different from zero. */ -bool pm_check_wakeup_events(void) +bool pm_wakeup_pending(void) { unsigned long flags; - bool ret = true; + bool ret = false; spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { - ret = ((unsigned int)atomic_read(&event_count) == saved_count) - && !atomic_read(&events_in_progress); - events_check_enabled = ret; + ret = ((unsigned int)atomic_read(&event_count) != saved_count) + || atomic_read(&events_in_progress); + events_check_enabled = !ret; } spin_unlock_irqrestore(&events_lock, flags); - if (!ret) + if (ret) pm_wakeup_update_hit_counts(); return ret; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 26697514c5ec..144b34be5c32 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -292,7 +292,7 @@ extern int unregister_pm_notifier(struct notifier_block *nb); /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; -extern bool pm_check_wakeup_events(void); +extern bool pm_wakeup_pending(void); extern bool pm_get_wakeup_count(unsigned int *count); extern bool pm_save_wakeup_count(unsigned int count); #else /* !CONFIG_PM_SLEEP */ @@ -309,7 +309,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) #define pm_notifier(fn, pri) do { (void)(fn); } while (0) -static inline bool pm_check_wakeup_events(void) { return true; } +static inline bool pm_wakeup_pending(void) { return false; } #endif /* !CONFIG_PM_SLEEP */ extern struct mutex pm_mutex; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9a98beffee4..870f72bc72ae 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -278,7 +278,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) goto Power_up; in_suspend = 1; @@ -516,7 +516,7 @@ int hibernation_platform_enter(void) local_irq_disable(); 
sysdev_suspend(PMSG_HIBERNATE); - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; } diff --git a/kernel/power/process.c b/kernel/power/process.c index eb2c88a9e562..d6d2a10320e0 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -85,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only) if (!todo || time_after(jiffies, end_time)) break; - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { wakeup = true; break; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ecf770509d0d..5e644e3a6bf3 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -163,7 +163,7 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { + if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } -- cgit v1.2.3-71-gd317 From b8c76f6aed0ab7df73a6410f3f82de2c831bb144 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 16 Dec 2010 00:51:21 +0100 Subject: PM: Replace the device power.status field with a bit field The device power.status field is too complicated for its purpose (storing the information about whether or not the device is in the "active" state from the PM core's point of view), so replace it with a bit field and modify all of its users accordingly. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 17 +++++------------ drivers/usb/core/driver.c | 7 +++---- include/linux/device.h | 4 ++-- include/linux/pm.h | 43 ++----------------------------------------- 4 files changed, 12 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index bb5c8cb64174..a90480baa850 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -63,7 +63,7 @@ static int async_error; */ void device_pm_init(struct device *dev) { - dev->power.status = DPM_ON; + dev->power.in_suspend = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; @@ -98,7 +98,7 @@ void device_pm_add(struct device *dev) kobject_name(&dev->kobj)); mutex_lock(&dpm_list_mtx); if (dev->parent) { - if (dev->parent->power.status >= DPM_SUSPENDING) + if (dev->parent->power.in_suspend) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); } else if (transition_started) { @@ -488,7 +488,6 @@ void dpm_resume_noirq(pm_message_t state) int error; get_device(dev); - dev->power.status = DPM_OFF; list_move_tail(&dev->power.entry, &dpm_suspended_list); mutex_unlock(&dpm_list_mtx); @@ -541,7 +540,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) dpm_wait(dev->parent, async); device_lock(dev); - dev->power.status = DPM_RESUMING; + dev->power.in_suspend = false; if (dev->bus) { if (dev->bus->pm) { @@ -690,7 +689,7 @@ static void dpm_complete(pm_message_t state) struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); - dev->power.status = DPM_ON; + dev->power.in_suspend = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); @@ -806,7 +805,6 @@ int dpm_suspend_noirq(pm_message_t state) put_device(dev); break; } - dev->power.status = DPM_OFF_IRQ; if (!list_empty(&dev->power.entry)) list_move(&dev->power.entry, &dpm_noirq_list); put_device(dev); @@ -894,9 +892,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) } } - if 
(!error) - dev->power.status = DPM_OFF; - End: device_unlock(dev); complete_all(&dev->power.completion); @@ -1030,7 +1025,6 @@ static int dpm_prepare(pm_message_t state) struct device *dev = to_device(dpm_list.next); get_device(dev); - dev->power.status = DPM_PREPARING; mutex_unlock(&dpm_list_mtx); pm_runtime_get_noresume(dev); @@ -1046,7 +1040,6 @@ static int dpm_prepare(pm_message_t state) mutex_lock(&dpm_list_mtx); if (error) { - dev->power.status = DPM_ON; if (error == -EAGAIN) { put_device(dev); error = 0; @@ -1058,7 +1051,7 @@ static int dpm_prepare(pm_message_t state) put_device(dev); break; } - dev->power.status = DPM_SUSPENDING; + dev->power.in_suspend = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); put_device(dev); diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index c0e60fbcb048..4ec50224ee86 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -376,7 +376,7 @@ static int usb_unbind_interface(struct device *dev) * Just re-enable it without affecting the endpoint toggles. */ usb_enable_interface(udev, intf, false); - } else if (!error && intf->dev.power.status == DPM_ON) { + } else if (!error && !intf->dev.power.in_suspend) { r = usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); if (r < 0) @@ -961,7 +961,7 @@ void usb_rebind_intf(struct usb_interface *intf) } /* Try to rebind the interface */ - if (intf->dev.power.status == DPM_ON) { + if (!intf->dev.power.in_suspend) { intf->needs_binding = 0; rc = device_attach(&intf->dev); if (rc < 0) @@ -1108,8 +1108,7 @@ static int usb_resume_interface(struct usb_device *udev, if (intf->condition == USB_INTERFACE_UNBOUND) { /* Carry out a deferred switch to altsetting 0 */ - if (intf->needs_altsetting0 && - intf->dev.power.status == DPM_ON) { + if (intf->needs_altsetting0 && !intf->dev.power.in_suspend) { usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); intf->needs_altsetting0 = 0; diff --git a/include/linux/device.h b/include/linux/device.h index dd4895313468..45bc8c1669d2 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -508,13 +508,13 @@ static inline int device_is_registered(struct device *dev) static inline void device_enable_async_suspend(struct device *dev) { - if (dev->power.status == DPM_ON) + if (!dev->power.in_suspend) dev->power.async_suspend = true; } static inline void device_disable_async_suspend(struct device *dev) { - if (dev->power.status == DPM_ON) + if (!dev->power.in_suspend) dev->power.async_suspend = false; } diff --git a/include/linux/pm.h b/include/linux/pm.h index 61f2066e6852..c1756dfeb8c5 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -366,45 +366,6 @@ extern struct dev_pm_ops generic_subsys_pm_ops; #define PMSG_AUTO_RESUME ((struct pm_message) \ { .event = PM_EVENT_AUTO_RESUME, }) -/** - * Device power management states - * - * These state labels are used internally by the PM core to indicate the current - * status of a device with respect to the PM core operations. - * - * DPM_ON Device is regarded as operational. Set this way - * initially and when ->complete() is about to be called. - * Also set when ->prepare() fails. - * - * DPM_PREPARING Device is going to be prepared for a PM transition. Set - * when ->prepare() is about to be called. - * - * DPM_RESUMING Device is going to be resumed. Set when ->resume(), - * ->thaw(), or ->restore() is about to be called. - * - * DPM_SUSPENDING Device has been prepared for a power transition. 
Set - * when ->prepare() has just succeeded. - * - * DPM_OFF Device is regarded as inactive. Set immediately after - * ->suspend(), ->freeze(), or ->poweroff() has succeeded. - * Also set when ->resume()_noirq, ->thaw_noirq(), or - * ->restore_noirq() is about to be called. - * - * DPM_OFF_IRQ Device is in a "deep sleep". Set immediately after - * ->suspend_noirq(), ->freeze_noirq(), or - * ->poweroff_noirq() has just succeeded. - */ - -enum dpm_state { - DPM_INVALID, - DPM_ON, - DPM_PREPARING, - DPM_RESUMING, - DPM_SUSPENDING, - DPM_OFF, - DPM_OFF_IRQ, -}; - /** * Device run-time power management status. * @@ -463,8 +424,8 @@ struct wakeup_source; struct dev_pm_info { pm_message_t power_state; unsigned int can_wakeup:1; - unsigned async_suspend:1; - enum dpm_state status; /* Owned by the PM core */ + unsigned int async_suspend:1; + unsigned int in_suspend:1; /* Owned by the PM core */ spinlock_t lock; #ifdef CONFIG_PM_SLEEP struct list_head entry; -- cgit v1.2.3-71-gd317 From 4b31db8a16fa0d4d6a0fa42d044e7a4f4dad3641 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 24 Dec 2010 15:04:06 +0100 Subject: PM / Runtime: Generic resume shouldn't set RPM_ACTIVE unconditionally The __pm_generic_resume() function changes the given device's runtime PM status to RPM_ACTIVE if its driver's callback returns 0, but it only should do that if the rumtime PM is enabled for the device. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/generic_ops.c | 2 +- include/linux/pm_runtime.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 3d2c3500069a..42f97f925629 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -185,7 +185,7 @@ static int __pm_generic_resume(struct device *dev, int event) return 0; ret = callback(dev); - if (!ret) { + if (!ret && pm_runtime_enabled(dev)) { pm_runtime_disable(dev); pm_runtime_set_active(dev); pm_runtime_enable(dev); diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index e9cc049ccb62..d34f067e2a7f 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -82,6 +82,11 @@ static inline bool pm_runtime_suspended(struct device *dev) && !dev->power.disable_depth; } +static inline bool pm_runtime_enabled(struct device *dev) +{ + return !dev->power.disable_depth; +} + static inline void pm_runtime_mark_last_busy(struct device *dev) { ACCESS_ONCE(dev->power.last_busy) = jiffies; @@ -120,6 +125,7 @@ static inline void pm_runtime_put_noidle(struct device *dev) {} static inline bool device_run_wake(struct device *dev) { return false; } static inline void device_set_run_wake(struct device *dev, bool enable) {} static inline bool pm_runtime_suspended(struct device *dev) { return false; } +static inline bool pm_runtime_enabled(struct device *dev) { return false; } static inline int pm_generic_runtime_idle(struct device *dev) { return 0; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } -- cgit v1.2.3-71-gd317 From 62bcb91573425975d6ad2389d7ab1d8feca88ab4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 24 Dec 2010 15:04:41 +0100 Subject: PM: Prototype the pm_generic_ operations The pm_generic_ operations are all exported but are not prototyped in any header file for direct use. Do so. [rjw: Added extern.] Signed-off-by: Mark Brown Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index c1756dfeb8c5..dd9c7ab38270 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -572,4 +572,11 @@ extern unsigned int pm_flags; #define PM_APM 1 #define PM_ACPI 2 +extern int pm_generic_suspend(struct device *dev); +extern int pm_generic_resume(struct device *dev); +extern int pm_generic_freeze(struct device *dev); +extern int pm_generic_thaw(struct device *dev); +extern int pm_generic_restore(struct device *dev); +extern int pm_generic_poweroff(struct device *dev); + #endif /* _LINUX_PM_H */ -- cgit v1.2.3-71-gd317 From 9706a36e35c4ce04f28a62cfe1205b4e3b0dd13c Mon Sep 17 00:00:00 2001 From: Stephen Neuendorffer Date: Thu, 18 Nov 2010 15:54:59 -0800 Subject: of/flattree: Add non-boottime device tree functions In preparation for providing run-time handling of device trees, factor out some of the basic functions so that they take an arbitrary blob, rather than relying on the single boot-time tree. V2: - functions have of_fdt_* names - removed find_flat_dt_string - blob argument is first Signed-off-by: Stephen Neuendorffer Signed-off-by: Grant Likely --- drivers/of/fdt.c | 133 +++++++++++++++++++++++++++++-------------------- include/linux/of_fdt.h | 11 ++++ 2 files changed, 90 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 2ebacf14e7de..10eab21076ea 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -22,6 +22,82 @@ #include +char *of_fdt_get_string(struct boot_param_header *blob, u32 offset) +{ + return ((char *)blob) + + be32_to_cpu(blob->off_dt_strings) + offset; +} + +/** + * of_fdt_get_property - Given a node in the given flat blob, return + * the property ptr + */ +void *of_fdt_get_property(struct boot_param_header *blob, + unsigned long node, const char *name, + unsigned long *size) +{ + unsigned long p = node; + + do { + u32 tag = be32_to_cpup((__be32 *)p); + u32 sz, noff; + const char *nstr; + + p += 4; + if (tag == OF_DT_NOP) + continue; + if (tag != OF_DT_PROP) + return NULL; + + sz = be32_to_cpup((__be32 *)p); + noff = be32_to_cpup((__be32 *)(p + 4)); + p += 8; + if (be32_to_cpu(blob->version) < 0x10) + p = ALIGN(p, sz >= 8 ? 8 : 4); + + nstr = of_fdt_get_string(blob, noff); + if (nstr == NULL) { + pr_warning("Can't find property index name !\n"); + return NULL; + } + if (strcmp(name, nstr) == 0) { + if (size) + *size = sz; + return (void *)p; + } + p += sz; + p = ALIGN(p, 4); + } while (1); +} + +/** + * of_fdt_is_compatible - Return true if given node from the given blob has + * compat in its compatible list + * @blob: A device tree blob + * @node: node to test + * @compat: compatible string to compare with compatible list. + */ +int of_fdt_is_compatible(struct boot_param_header *blob, + unsigned long node, const char *compat) +{ + const char *cp; + unsigned long cplen, l; + + cp = of_fdt_get_property(blob, node, "compatible", &cplen); + if (cp == NULL) + return 0; + while (cplen > 0) { + if (of_compat_cmp(cp, compat, strlen(compat)) == 0) + return 1; + l = strlen(cp) + 1; + cp += l; + cplen -= l; + } + + return 0; +} + +/* Everything below here references initial_boot_params directly. 
*/ int __initdata dt_root_addr_cells; int __initdata dt_root_size_cells; @@ -29,12 +105,6 @@ struct boot_param_header *initial_boot_params; #ifdef CONFIG_OF_EARLY_FLATTREE -char *find_flat_dt_string(u32 offset) -{ - return ((char *)initial_boot_params) + - be32_to_cpu(initial_boot_params->off_dt_strings) + offset; -} - /** * of_scan_flat_dt - scan flattened tree blob and call callback on each. * @it: callback function @@ -123,38 +193,7 @@ unsigned long __init of_get_flat_dt_root(void) void *__init of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size) { - unsigned long p = node; - - do { - u32 tag = be32_to_cpup((__be32 *)p); - u32 sz, noff; - const char *nstr; - - p += 4; - if (tag == OF_DT_NOP) - continue; - if (tag != OF_DT_PROP) - return NULL; - - sz = be32_to_cpup((__be32 *)p); - noff = be32_to_cpup((__be32 *)(p + 4)); - p += 8; - if (be32_to_cpu(initial_boot_params->version) < 0x10) - p = ALIGN(p, sz >= 8 ? 8 : 4); - - nstr = find_flat_dt_string(noff); - if (nstr == NULL) { - pr_warning("Can't find property index name !\n"); - return NULL; - } - if (strcmp(name, nstr) == 0) { - if (size) - *size = sz; - return (void *)p; - } - p += sz; - p = ALIGN(p, 4); - } while (1); + return of_fdt_get_property(initial_boot_params, node, name, size); } /** @@ -164,21 +203,7 @@ void *__init of_get_flat_dt_prop(unsigned long node, const char *name, */ int __init of_flat_dt_is_compatible(unsigned long node, const char *compat) { - const char *cp; - unsigned long cplen, l; - - cp = of_get_flat_dt_prop(node, "compatible", &cplen); - if (cp == NULL) - return 0; - while (cplen > 0) { - if (of_compat_cmp(cp, compat, strlen(compat)) == 0) - return 1; - l = strlen(cp) + 1; - cp += l; - cplen -= l; - } - - return 0; + return of_fdt_is_compatible(initial_boot_params, node, compat); } static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, @@ -303,7 +328,7 @@ unsigned long __init unflatten_dt_node(unsigned long mem, if (be32_to_cpu(initial_boot_params->version) < 0x10) *p = ALIGN(*p, sz >= 8 ? 8 : 4); - pname = find_flat_dt_string(noff); + pname = of_fdt_get_string(initial_boot_params, noff); if (pname == NULL) { pr_info("Can't find property name in list !\n"); break; diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 7bbf5b328438..70c5b736f0a3 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -58,6 +58,17 @@ struct boot_param_header { }; #if defined(CONFIG_OF_FLATTREE) + +/* For scanning an arbitrary device-tree at any time */ +extern char *of_fdt_get_string(struct boot_param_header *blob, u32 offset); +extern void *of_fdt_get_property(struct boot_param_header *blob, + unsigned long node, + const char *name, + unsigned long *size); +extern int of_fdt_is_compatible(struct boot_param_header *blob, + unsigned long node, + const char *compat); + /* TBD: Temporary export of fdt globals - remove when code fully merged */ extern int __initdata dt_root_addr_cells; extern int __initdata dt_root_size_cells; -- cgit v1.2.3-71-gd317 From fe14042358fac0673d4b6362a73796fd64379938 Mon Sep 17 00:00:00 2001 From: Stephen Neuendorffer Date: Thu, 18 Nov 2010 15:55:02 -0800 Subject: of/flattree: Refactor unflatten_device_tree and add fdt_unflatten_tree unflatten_device_tree has two dependencies on things that happen during boot time. Firstly, it references the initial device tree directly. Secondly, it allocates memory using the early boot allocator. 
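To picture the shape of the change described below, where the core unflattening logic is parameterized by an allocator callback so the same routine can be backed either by the early-boot allocator or by kzalloc(), here is a minimal userspace C sketch. It is illustrative only; every name in it (dt_alloc_fn, heap_alloc, unflatten) is invented for the sketch and is not the kernel interface.

/*
 * Toy model: the "unflatten" step never allocates directly; it asks a
 * caller-supplied allocator, so boot-time and run-time callers can reuse it.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef void *(*dt_alloc_fn)(size_t size, size_t align);

static void *heap_alloc(size_t size, size_t align)
{
        (void)align;                    /* alignment ignored in this toy */
        return calloc(1, size);
}

/* Stand-in for __unflatten_device_tree(): two passes, allocator injected. */
static char *unflatten(const char *blob, dt_alloc_fn dt_alloc)
{
        size_t size = strlen(blob) + 1; /* first pass: compute the size  */
        char *mem = dt_alloc(size, 4);  /* allocate through the callback */

        if (mem)
                memcpy(mem, blob, size); /* second pass: fill it in */
        return mem;
}

int main(void)
{
        char *tree = unflatten("model=toy-board", heap_alloc);

        printf("unflattened: %s\n", tree ? tree : "(allocation failed)");
        free(tree);
        return 0;
}

The real __unflatten_device_tree() in the diff below follows the same two-pass pattern (scan for size, then fill), and only the dt_alloc callback differs between the boot-time and run-time entry points.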
This patch factors out these dependencies and uses the new __unflatten_device_tree function to implement a driver-visible fdt_unflatten_tree function, which can be used to unflatten a blob after boot time. V2: - remove extra __va() call - make dt_alloc functions return void *. This doesn't fix the general strangeness in this code that constantly casts back and forth between unsigned long and __be32 * Signed-off-by: Stephen Neuendorffer Signed-off-by: Grant Likely --- drivers/of/fdt.c | 149 ++++++++++++++++++++++++++++++++----------------- include/linux/of_fdt.h | 2 + 2 files changed, 100 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 71edc9cecd62..8a90ee42071a 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -11,10 +11,12 @@ #include #include +#include #include #include #include #include +#include #ifdef CONFIG_PPC #include @@ -312,6 +314,94 @@ unsigned long unflatten_dt_node(struct boot_param_header *blob, return mem; } +/** + * __unflatten_device_tree - create tree of device_nodes from flat blob + * + * unflattens a device-tree, creating the + * tree of struct device_node. It also fills the "name" and "type" + * pointers of the nodes so the normal device-tree walking functions + * can be used. + * @blob: The blob to expand + * @mynodes: The device_node tree created by the call + * @dt_alloc: An allocator that provides a virtual address to memory + * for the resulting tree + */ +void __unflatten_device_tree(struct boot_param_header *blob, + struct device_node **mynodes, + void * (*dt_alloc)(u64 size, u64 align)) +{ + unsigned long start, mem, size; + struct device_node **allnextp = mynodes; + + pr_debug(" -> unflatten_device_tree()\n"); + + if (!blob) { + pr_debug("No device tree pointer\n"); + return; + } + + pr_debug("Unflattening device tree:\n"); + pr_debug("magic: %08x\n", be32_to_cpu(blob->magic)); + pr_debug("size: %08x\n", be32_to_cpu(blob->totalsize)); + pr_debug("version: %08x\n", be32_to_cpu(blob->version)); + + if (be32_to_cpu(blob->magic) != OF_DT_HEADER) { + pr_err("Invalid device tree blob header\n"); + return; + } + + /* First pass, scan for size */ + start = ((unsigned long)blob) + + be32_to_cpu(blob->off_dt_struct); + size = unflatten_dt_node(blob, 0, &start, NULL, NULL, 0); + size = (size | 3) + 1; + + pr_debug(" size is %lx, allocating...\n", size); + + /* Allocate memory for the expanded device tree */ + mem = (unsigned long) + dt_alloc(size + 4, __alignof__(struct device_node)); + + ((__be32 *)mem)[size / 4] = cpu_to_be32(0xdeadbeef); + + pr_debug(" unflattening %lx...\n", mem); + + /* Second pass, do actual unflattening */ + start = ((unsigned long)blob) + + be32_to_cpu(blob->off_dt_struct); + unflatten_dt_node(blob, mem, &start, NULL, &allnextp, 0); + if (be32_to_cpup((__be32 *)start) != OF_DT_END) + pr_warning("Weird tag at end of tree: %08x\n", *((u32 *)start)); + if (be32_to_cpu(((__be32 *)mem)[size / 4]) != 0xdeadbeef) + pr_warning("End of tree marker overwritten: %08x\n", + be32_to_cpu(((__be32 *)mem)[size / 4])); + *allnextp = NULL; + + pr_debug(" <- unflatten_device_tree()\n"); +} + +static void *kernel_tree_alloc(u64 size, u64 align) +{ + return kzalloc(size, GFP_KERNEL); +} + +/** + * of_fdt_unflatten_tree - create tree of device_nodes from flat blob + * + * unflattens the device-tree passed by the firmware, creating the + * tree of struct device_node. It also fills the "name" and "type" + * pointers of the nodes so the normal device-tree walking functions + * can be used. 
+ */ +void of_fdt_unflatten_tree(unsigned long *blob, + struct device_node **mynodes) +{ + struct boot_param_header *device_tree = + (struct boot_param_header *)blob; + __unflatten_device_tree(device_tree, mynodes, &kernel_tree_alloc); +} +EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree); + /* Everything below here references initial_boot_params directly. */ int __initdata dt_root_addr_cells; int __initdata dt_root_size_cells; @@ -569,6 +659,12 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, return 1; } +static void *__init early_device_tree_alloc(u64 size, u64 align) +{ + unsigned long mem = early_init_dt_alloc_memory_arch(size, align); + return __va(mem); +} + /** * unflatten_device_tree - create tree of device_nodes from flat blob * @@ -579,62 +675,13 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, */ void __init unflatten_device_tree(void) { - unsigned long start, mem, size; - struct device_node **allnextp = &allnodes; - - pr_debug(" -> unflatten_device_tree()\n"); - - if (!initial_boot_params) { - pr_debug("No device tree pointer\n"); - return; - } - - pr_debug("Unflattening device tree:\n"); - pr_debug("magic: %08x\n", be32_to_cpu(initial_boot_params->magic)); - pr_debug("size: %08x\n", be32_to_cpu(initial_boot_params->totalsize)); - pr_debug("version: %08x\n", be32_to_cpu(initial_boot_params->version)); - - if (be32_to_cpu(initial_boot_params->magic) != OF_DT_HEADER) { - pr_err("Invalid device tree blob header\n"); - return; - } - - /* First pass, scan for size */ - start = ((unsigned long)initial_boot_params) + - be32_to_cpu(initial_boot_params->off_dt_struct); - size = unflatten_dt_node(initial_boot_params, 0, &start, - NULL, NULL, 0); - size = (size | 3) + 1; - - pr_debug(" size is %lx, allocating...\n", size); - - /* Allocate memory for the expanded device tree */ - mem = early_init_dt_alloc_memory_arch(size + 4, - __alignof__(struct device_node)); - mem = (unsigned long) __va(mem); - - ((__be32 *)mem)[size / 4] = cpu_to_be32(0xdeadbeef); - - pr_debug(" unflattening %lx...\n", mem); - - /* Second pass, do actual unflattening */ - start = ((unsigned long)initial_boot_params) + - be32_to_cpu(initial_boot_params->off_dt_struct); - unflatten_dt_node(initial_boot_params, mem, &start, - NULL, &allnextp, 0); - if (be32_to_cpup((__be32 *)start) != OF_DT_END) - pr_warning("Weird tag at end of tree: %08x\n", *((u32 *)start)); - if (be32_to_cpu(((__be32 *)mem)[size / 4]) != 0xdeadbeef) - pr_warning("End of tree marker overwritten: %08x\n", - be32_to_cpu(((__be32 *)mem)[size / 4])); - *allnextp = NULL; + __unflatten_device_tree(initial_boot_params, &allnodes, + early_device_tree_alloc); /* Get pointer to OF "/chosen" node for use everywhere */ of_chosen = of_find_node_by_path("/chosen"); if (of_chosen == NULL) of_chosen = of_find_node_by_path("/chosen@0"); - - pr_debug(" <- unflatten_device_tree()\n"); } #endif /* CONFIG_OF_EARLY_FLATTREE */ diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 70c5b736f0a3..9ce5dfd2186a 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -68,6 +68,8 @@ extern void *of_fdt_get_property(struct boot_param_header *blob, extern int of_fdt_is_compatible(struct boot_param_header *blob, unsigned long node, const char *compat); +extern void of_fdt_unflatten_tree(unsigned long *blob, + struct device_node **mynodes); /* TBD: Temporary export of fdt globals - remove when code fully merged */ extern int __initdata dt_root_addr_cells; -- cgit v1.2.3-71-gd317 From 
a4f740cf33f7f6c164bbde3c0cdbcc77b0c4997c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 30 Oct 2010 11:49:09 -0400 Subject: of/flattree: Add of_flat_dt_match() helper function This patch adds of_flat_dt_match() which tests a node for compatibility with a list of values and converts the relevant powerpc platform code to use it. This approach simplifies the board support code a bit. Signed-off-by: Grant Likely Reviewed-by: Stephen Neuendorffer --- arch/powerpc/platforms/40x/ppc40x_simple.c | 13 +++------- arch/powerpc/platforms/512x/mpc5121_generic.c | 13 ++-------- arch/powerpc/platforms/52xx/lite5200.c | 16 +++++------- arch/powerpc/platforms/52xx/media5200.c | 13 ++-------- arch/powerpc/platforms/52xx/mpc5200_simple.c | 13 ++-------- arch/powerpc/platforms/83xx/mpc830x_rdb.c | 13 ++++++---- arch/powerpc/platforms/83xx/mpc831x_rdb.c | 11 +++++--- arch/powerpc/platforms/83xx/mpc837x_rdb.c | 15 ++++++----- arch/powerpc/platforms/85xx/tqm85xx.c | 20 +++++++-------- drivers/of/fdt.c | 37 +++++++++++++++++++++++++-- include/linux/of_fdt.h | 3 +++ 11 files changed, 89 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/40x/ppc40x_simple.c b/arch/powerpc/platforms/40x/ppc40x_simple.c index 546bbc229d19..2521d93ef136 100644 --- a/arch/powerpc/platforms/40x/ppc40x_simple.c +++ b/arch/powerpc/platforms/40x/ppc40x_simple.c @@ -50,7 +50,7 @@ machine_device_initcall(ppc40x_simple, ppc40x_device_probe); * Again, if your board needs to do things differently then create a * board.c file for it rather than adding it to this list. */ -static char *board[] __initdata = { +static const char *board[] __initdata = { "amcc,acadia", "amcc,haleakala", "amcc,kilauea", @@ -60,14 +60,9 @@ static char *board[] __initdata = { static int __init ppc40x_probe(void) { - unsigned long root = of_get_flat_dt_root(); - int i = 0; - - for (i = 0; i < ARRAY_SIZE(board); i++) { - if (of_flat_dt_is_compatible(root, board[i])) { - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); - return 1; - } + if (of_flat_dt_match(of_get_flat_dt_root(), board)) { + ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + return 1; } return 0; diff --git a/arch/powerpc/platforms/512x/mpc5121_generic.c b/arch/powerpc/platforms/512x/mpc5121_generic.c index e487eb06ec6b..926731f1ff01 100644 --- a/arch/powerpc/platforms/512x/mpc5121_generic.c +++ b/arch/powerpc/platforms/512x/mpc5121_generic.c @@ -26,7 +26,7 @@ /* * list of supported boards */ -static char *board[] __initdata = { +static const char *board[] __initdata = { "prt,prtlvt", NULL }; @@ -36,16 +36,7 @@ static char *board[] __initdata = { */ static int __init mpc5121_generic_probe(void) { - unsigned long node = of_get_flat_dt_root(); - int i = 0; - - while (board[i]) { - if (of_flat_dt_is_compatible(node, board[i])) - break; - i++; - } - - return board[i] != NULL; + return of_flat_dt_match(of_get_flat_dt_root(), board); } define_machine(mpc5121_generic) { diff --git a/arch/powerpc/platforms/52xx/lite5200.c b/arch/powerpc/platforms/52xx/lite5200.c index de55bc0584b5..01ffa64d2aa7 100644 --- a/arch/powerpc/platforms/52xx/lite5200.c +++ b/arch/powerpc/platforms/52xx/lite5200.c @@ -172,20 +172,18 @@ static void __init lite5200_setup_arch(void) mpc52xx_setup_pci(); } +static const char *board[] __initdata = { + "fsl,lite5200", + "fsl,lite5200b", + NULL, +}; + /* * Called very early, MMU is off, device-tree isn't unflattened */ static int __init lite5200_probe(void) { - unsigned long node = of_get_flat_dt_root(); - const char *model = 
of_get_flat_dt_prop(node, "model", NULL); - - if (!of_flat_dt_is_compatible(node, "fsl,lite5200") && - !of_flat_dt_is_compatible(node, "fsl,lite5200b")) - return 0; - pr_debug("%s board found\n", model ? model : "unknown"); - - return 1; + return of_flat_dt_match(of_get_flat_dt_root(), board); } define_machine(lite5200) { diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index 0bac3a3dbecf..2c7780cb68e5 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -239,7 +239,7 @@ static void __init media5200_setup_arch(void) } /* list of the supported boards */ -static char *board[] __initdata = { +static const char *board[] __initdata = { "fsl,media5200", NULL }; @@ -249,16 +249,7 @@ static char *board[] __initdata = { */ static int __init media5200_probe(void) { - unsigned long node = of_get_flat_dt_root(); - int i = 0; - - while (board[i]) { - if (of_flat_dt_is_compatible(node, board[i])) - break; - i++; - } - - return (board[i] != NULL); + return of_flat_dt_match(of_get_flat_dt_root(), board); } define_machine(media5200_platform) { diff --git a/arch/powerpc/platforms/52xx/mpc5200_simple.c b/arch/powerpc/platforms/52xx/mpc5200_simple.c index d45be5b5ad49..e36d6e232ae6 100644 --- a/arch/powerpc/platforms/52xx/mpc5200_simple.c +++ b/arch/powerpc/platforms/52xx/mpc5200_simple.c @@ -49,7 +49,7 @@ static void __init mpc5200_simple_setup_arch(void) } /* list of the supported boards */ -static char *board[] __initdata = { +static const char *board[] __initdata = { "intercontrol,digsy-mtc", "manroland,mucmc52", "manroland,uc101", @@ -66,16 +66,7 @@ static char *board[] __initdata = { */ static int __init mpc5200_simple_probe(void) { - unsigned long node = of_get_flat_dt_root(); - int i = 0; - - while (board[i]) { - if (of_flat_dt_is_compatible(node, board[i])) - break; - i++; - } - - return (board[i] != NULL); + return of_flat_dt_match(of_get_flat_dt_root(), board); } define_machine(mpc5200_simple_platform) { diff --git a/arch/powerpc/platforms/83xx/mpc830x_rdb.c b/arch/powerpc/platforms/83xx/mpc830x_rdb.c index 846831d495b5..661d354e4ff2 100644 --- a/arch/powerpc/platforms/83xx/mpc830x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc830x_rdb.c @@ -57,16 +57,19 @@ static void __init mpc830x_rdb_init_IRQ(void) ipic_set_default_priority(); } +struct const char *board[] __initdata = { + "MPC8308RDB", + "fsl,mpc8308rdb", + "denx,mpc8308_p1m", + NULL +} + /* * Called very early, MMU is off, device-tree isn't unflattened */ static int __init mpc830x_rdb_probe(void) { - unsigned long root = of_get_flat_dt_root(); - - return of_flat_dt_is_compatible(root, "MPC8308RDB") || - of_flat_dt_is_compatible(root, "fsl,mpc8308rdb") || - of_flat_dt_is_compatible(root, "denx,mpc8308_p1m"); + return of_flat_dt_match(of_get_flat_dt_root(), board); } static struct of_device_id __initdata of_bus_ids[] = { diff --git a/arch/powerpc/platforms/83xx/mpc831x_rdb.c b/arch/powerpc/platforms/83xx/mpc831x_rdb.c index ae525e4745d2..b54cd736a895 100644 --- a/arch/powerpc/platforms/83xx/mpc831x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc831x_rdb.c @@ -60,15 +60,18 @@ static void __init mpc831x_rdb_init_IRQ(void) ipic_set_default_priority(); } +struct const char *board[] __initdata = { + "MPC8313ERDB", + "fsl,mpc8315erdb", + NULL +} + /* * Called very early, MMU is off, device-tree isn't unflattened */ static int __init mpc831x_rdb_probe(void) { - unsigned long root = of_get_flat_dt_root(); - - return of_flat_dt_is_compatible(root, "MPC8313ERDB") || - 
of_flat_dt_is_compatible(root, "fsl,mpc8315erdb"); + return of_flat_dt_match(of_get_flat_dt_root(), board); } static struct of_device_id __initdata of_bus_ids[] = { diff --git a/arch/powerpc/platforms/83xx/mpc837x_rdb.c b/arch/powerpc/platforms/83xx/mpc837x_rdb.c index 910caa6b5810..7bafbf2ec0f9 100644 --- a/arch/powerpc/platforms/83xx/mpc837x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc837x_rdb.c @@ -101,17 +101,20 @@ static void __init mpc837x_rdb_init_IRQ(void) ipic_set_default_priority(); } +static const char *board[] __initdata = { + "fsl,mpc8377rdb", + "fsl,mpc8378rdb", + "fsl,mpc8379rdb", + "fsl,mpc8377wlan", + NULL +}; + /* * Called very early, MMU is off, device-tree isn't unflattened */ static int __init mpc837x_rdb_probe(void) { - unsigned long root = of_get_flat_dt_root(); - - return of_flat_dt_is_compatible(root, "fsl,mpc8377rdb") || - of_flat_dt_is_compatible(root, "fsl,mpc8378rdb") || - of_flat_dt_is_compatible(root, "fsl,mpc8379rdb") || - of_flat_dt_is_compatible(root, "fsl,mpc8377wlan"); + return of_flat_dt_match(of_get_flat_dt_root(), board); } define_machine(mpc837x_rdb) { diff --git a/arch/powerpc/platforms/85xx/tqm85xx.c b/arch/powerpc/platforms/85xx/tqm85xx.c index 8f29bbce5360..5e847d0b47c8 100644 --- a/arch/powerpc/platforms/85xx/tqm85xx.c +++ b/arch/powerpc/platforms/85xx/tqm85xx.c @@ -186,21 +186,21 @@ static int __init declare_of_platform_devices(void) } machine_device_initcall(tqm85xx, declare_of_platform_devices); +static const char *board[] __initdata = { + "tqc,tqm8540", + "tqc,tqm8541", + "tqc,tqm8548", + "tqc,tqm8555", + "tqc,tqm8560", + NULL +}; + /* * Called very early, device-tree isn't unflattened */ static int __init tqm85xx_probe(void) { - unsigned long root = of_get_flat_dt_root(); - - if ((of_flat_dt_is_compatible(root, "tqc,tqm8540")) || - (of_flat_dt_is_compatible(root, "tqc,tqm8541")) || - (of_flat_dt_is_compatible(root, "tqc,tqm8548")) || - (of_flat_dt_is_compatible(root, "tqc,tqm8555")) || - (of_flat_dt_is_compatible(root, "tqc,tqm8560"))) - return 1; - - return 0; + return of_flat_dt_match(of_get_flat_dt_root(), board); } define_machine(tqm85xx) { diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 8a90ee42071a..c787c3d95c60 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -78,19 +78,23 @@ void *of_fdt_get_property(struct boot_param_header *blob, * @blob: A device tree blob * @node: node to test * @compat: compatible string to compare with compatible list. + * + * On match, returns a non-zero value with smaller values returned for more + * specific compatible values. 
*/ int of_fdt_is_compatible(struct boot_param_header *blob, unsigned long node, const char *compat) { const char *cp; - unsigned long cplen, l; + unsigned long cplen, l, score = 0; cp = of_fdt_get_property(blob, node, "compatible", &cplen); if (cp == NULL) return 0; while (cplen > 0) { + score++; if (of_compat_cmp(cp, compat, strlen(compat)) == 0) - return 1; + return score; l = strlen(cp) + 1; cp += l; cplen -= l; @@ -99,6 +103,27 @@ int of_fdt_is_compatible(struct boot_param_header *blob, return 0; } +/** + * of_fdt_match - Return true if node matches a list of compatible values + */ +int of_fdt_match(struct boot_param_header *blob, unsigned long node, + const char **compat) +{ + unsigned int tmp, score = 0; + + if (!compat) + return 0; + + while (*compat) { + tmp = of_fdt_is_compatible(blob, node, *compat); + if (tmp && (score == 0 || (tmp < score))) + score = tmp; + compat++; + } + + return score; +} + static void *unflatten_dt_alloc(unsigned long *mem, unsigned long size, unsigned long align) { @@ -511,6 +536,14 @@ int __init of_flat_dt_is_compatible(unsigned long node, const char *compat) return of_fdt_is_compatible(initial_boot_params, node, compat); } +/** + * of_flat_dt_match - Return true if node matches a list of compatible values + */ +int __init of_flat_dt_match(unsigned long node, const char **compat) +{ + return of_fdt_match(initial_boot_params, node, compat); +} + #ifdef CONFIG_BLK_DEV_INITRD /** * early_init_dt_check_for_initrd - Decode initrd location from flat tree diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 9ce5dfd2186a..ee96091f7d25 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -68,6 +68,8 @@ extern void *of_fdt_get_property(struct boot_param_header *blob, extern int of_fdt_is_compatible(struct boot_param_header *blob, unsigned long node, const char *compat); +extern int of_fdt_match(struct boot_param_header *blob, unsigned long node, + const char **compat); extern void of_fdt_unflatten_tree(unsigned long *blob, struct device_node **mynodes); @@ -84,6 +86,7 @@ extern int of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); +extern int of_flat_dt_match(unsigned long node, const char **matches); extern unsigned long of_get_flat_dt_root(void); extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, -- cgit v1.2.3-71-gd317 From 410cf2bd3dc6ec1ed9e1b36b25b9d7aa927ed14e Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Mon, 13 Dec 2010 14:56:02 +0100 Subject: firewire: use split transaction timeout only for split transactions Instead of starting the split transaction timeout timer when any request is submitted, start it only when the destination's ACK_PENDING has been received. This prevents us from using a timeout that is too short, and, if the controller's AT queue is emptying very slowly, from cancelling a packet that has not yet been sent. 
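As a rough illustration of the resulting control flow, the userspace sketch below models a transaction that only arms its split timeout once ACK_PENDING arrives, and a cancel helper that trivially succeeds when no timer was ever started. All toy_* names are invented for the sketch; only the idea, the is_split_transaction flag guarding del_timer(), mirrors the patch.

/*
 * Toy model (illustrative only) of "arm the split timeout on ACK_PENDING":
 * cancellation can only lose a race with the timeout handler if the timer
 * was actually armed at some point.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_transaction {
        bool is_split_transaction;      /* timer armed after ACK_PENDING only */
        bool timer_pending;             /* stands in for del_timer() state    */
};

static void handle_ack_pending(struct toy_transaction *t)
{
        t->is_split_transaction = true;
        t->timer_pending = true;        /* mod_timer() in the real driver */
}

/* returns false only if an armed timer has already fired */
static bool try_cancel_split_timeout(struct toy_transaction *t)
{
        if (t->is_split_transaction) {
                bool was_pending = t->timer_pending;

                t->timer_pending = false;       /* del_timer() */
                return was_pending;
        }
        return true;                    /* never armed: nothing to race with */
}

int main(void)
{
        struct toy_transaction unified = { false, false };
        struct toy_transaction split   = { false, false };

        handle_ack_pending(&split);
        split.timer_pending = false;    /* pretend the timeout already fired */

        printf("unified cancel ok: %d\n", try_cancel_split_timeout(&unified));
        printf("split   cancel ok: %d\n", try_cancel_split_timeout(&split));
        return 0;
}

In the real driver the flag is set in start_split_transaction_timeout() under the card lock, and try_cancel_split_timeout() reports failure only when an armed timer's handler is already running.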
Signed-off-by: Clemens Ladisch Signed-off-by: Stefan Richter --- drivers/firewire/core-transaction.c | 45 ++++++++++++++++++++++++++----------- include/linux/firewire.h | 2 +- 2 files changed, 33 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 9e81cc54abd2..d00f8ce902cc 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -72,6 +72,15 @@ #define PHY_CONFIG_ROOT_ID(node_id) ((((node_id) & 0x3f) << 24) | (1 << 23)) #define PHY_IDENTIFIER(id) ((id) << 30) +/* returns 0 if the split timeout handler is already running */ +static int try_cancel_split_timeout(struct fw_transaction *t) +{ + if (t->is_split_transaction) + return del_timer(&t->split_timeout_timer); + else + return 1; +} + static int close_transaction(struct fw_transaction *transaction, struct fw_card *card, int rcode) { @@ -81,7 +90,7 @@ static int close_transaction(struct fw_transaction *transaction, spin_lock_irqsave(&card->lock, flags); list_for_each_entry(t, &card->transaction_list, link) { if (t == transaction) { - if (!del_timer(&t->split_timeout_timer)) { + if (!try_cancel_split_timeout(t)) { spin_unlock_irqrestore(&card->lock, flags); goto timed_out; } @@ -141,16 +150,28 @@ static void split_transaction_timeout_callback(unsigned long data) card->tlabel_mask &= ~(1ULL << t->tlabel); spin_unlock_irqrestore(&card->lock, flags); - card->driver->cancel_packet(card, &t->packet); - - /* - * At this point cancel_packet will never call the transaction - * callback, since we just took the transaction out of the list. - * So do it here. - */ t->callback(card, RCODE_CANCELLED, NULL, 0, t->callback_data); } +static void start_split_transaction_timeout(struct fw_transaction *t, + struct fw_card *card) +{ + unsigned long flags; + + spin_lock_irqsave(&card->lock, flags); + + if (list_empty(&t->link) || WARN_ON(t->is_split_transaction)) { + spin_unlock_irqrestore(&card->lock, flags); + return; + } + + t->is_split_transaction = true; + mod_timer(&t->split_timeout_timer, + jiffies + card->split_timeout_jiffies); + + spin_unlock_irqrestore(&card->lock, flags); +} + static void transmit_complete_callback(struct fw_packet *packet, struct fw_card *card, int status) { @@ -162,7 +183,7 @@ static void transmit_complete_callback(struct fw_packet *packet, close_transaction(t, card, RCODE_COMPLETE); break; case ACK_PENDING: - t->timestamp = packet->timestamp; + start_split_transaction_timeout(t, card); break; case ACK_BUSY_X: case ACK_BUSY_A: @@ -349,11 +370,9 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, t->node_id = destination_id; t->tlabel = tlabel; t->card = card; + t->is_split_transaction = false; setup_timer(&t->split_timeout_timer, split_transaction_timeout_callback, (unsigned long)t); - /* FIXME: start this timer later, relative to t->timestamp */ - mod_timer(&t->split_timeout_timer, - jiffies + card->split_timeout_jiffies); t->callback = callback; t->callback_data = callback_data; @@ -926,7 +945,7 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) spin_lock_irqsave(&card->lock, flags); list_for_each_entry(t, &card->transaction_list, link) { if (t->node_id == source && t->tlabel == tlabel) { - if (!del_timer(&t->split_timeout_timer)) { + if (!try_cancel_split_timeout(t)) { spin_unlock_irqrestore(&card->lock, flags); goto timed_out; } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 1cd637ef62d2..9a3f5f9383f6 100644 --- 
a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -302,9 +302,9 @@ struct fw_packet { struct fw_transaction { int node_id; /* The generation is implied; it is always the current. */ int tlabel; - int timestamp; struct list_head link; struct fw_card *card; + bool is_split_transaction; struct timer_list split_timeout_timer; struct fw_packet packet; -- cgit v1.2.3-71-gd317 From bdd5f05d91e8ae68075b812ce244c918d3d752cd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 4 Jan 2011 13:31:45 -0500 Subject: SUNRPC: Remove more code when NFSD_DEPRECATED is not configured Signed-off-by: NeilBrown [bfields@redhat.com: moved svcauth_unix_purge outside ifdef's.] Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/cache.h | 3 +++ net/sunrpc/svcauth_unix.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 6950c981882d..c2aebe8eeffd 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -255,10 +255,13 @@ static inline time_t get_expiry(char **bpp) return rv - boot.tv_sec; } +#ifdef CONFIG_NFSD_DEPRECATED static inline void sunrpc_invalidate(struct cache_head *h, struct cache_detail *detail) { h->expiry_time = seconds_since_boot() - 1; detail->nextcheck = seconds_since_boot(); } +#endif /* CONFIG_NFSD_DEPRECATED */ + #endif /* _LINUX_SUNRPC_CACHE_H_ */ diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 59a7c524a8b1..30916b06c12b 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -30,7 +30,9 @@ struct unix_domain { struct auth_domain h; +#ifdef CONFIG_NFSD_DEPRECATED int addr_changes; +#endif /* CONFIG_NFSD_DEPRECATED */ /* other stuff later */ }; @@ -64,7 +66,9 @@ struct auth_domain *unix_domain_find(char *name) return NULL; } new->h.flavour = &svcauth_unix; +#ifdef CONFIG_NFSD_DEPRECATED new->addr_changes = 0; +#endif /* CONFIG_NFSD_DEPRECATED */ rv = auth_domain_lookup(name, &new->h); } } @@ -91,7 +95,9 @@ struct ip_map { char m_class[8]; /* e.g. 
"nfsd" */ struct in6_addr m_addr; struct unix_domain *m_client; +#ifdef CONFIG_NFSD_DEPRECATED int m_add_change; +#endif /* CONFIG_NFSD_DEPRECATED */ }; static void ip_map_put(struct kref *kref) @@ -145,7 +151,9 @@ static void update(struct cache_head *cnew, struct cache_head *citem) kref_get(&item->m_client->h.ref); new->m_client = item->m_client; +#ifdef CONFIG_NFSD_DEPRECATED new->m_add_change = item->m_add_change; +#endif /* CONFIG_NFSD_DEPRECATED */ } static struct cache_head *ip_map_alloc(void) { @@ -330,6 +338,7 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, ip.h.flags = 0; if (!udom) set_bit(CACHE_NEGATIVE, &ip.h.flags); +#ifdef CONFIG_NFSD_DEPRECATED else { ip.m_add_change = udom->addr_changes; /* if this is from the legacy set_client system call, @@ -338,6 +347,7 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, if (expiry == NEVER) ip.m_add_change++; } +#endif /* CONFIG_NFSD_DEPRECATED */ ip.h.expiry_time = expiry; ch = sunrpc_cache_update(cd, &ip.h, &ipm->h, hash_str(ipm->m_class, IP_HASHBITS) ^ @@ -357,6 +367,7 @@ static inline int ip_map_update(struct net *net, struct ip_map *ipm, return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry); } +#ifdef CONFIG_NFSD_DEPRECATED int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom) { struct unix_domain *udom; @@ -411,6 +422,7 @@ struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr) return rv; } EXPORT_SYMBOL_GPL(auth_unix_lookup); +#endif /* CONFIG_NFSD_DEPRECATED */ void svcauth_unix_purge(void) { -- cgit v1.2.3-71-gd317 From 9e701c610923aaeac8b38b9202a686d1cc9ee35d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 2 Jan 2011 21:56:36 -0500 Subject: svcrpc: simpler request dropping Currently we use -EAGAIN returns to determine when to drop a deferred request. On its own, that is error-prone, as it makes us treat -EAGAIN returns from other functions specially to prevent inadvertent dropping. So, use a flag on the request instead. Returning an error on request deferral is still required, to prevent further processing, but we no longer need worry that an error return on its own could result in a drop. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 2 +- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 3 ++- net/sunrpc/svc_xprt.c | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2bae1d86f5f2..18743c4d8bca 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. 
*/ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit) { + if (nfserr == nfserr_dropit || rqstp->rq_dropme) { dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5a3085b9b394..d45c482b6444 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -269,6 +269,7 @@ struct svc_rqst { struct cache_req rq_chandle; /* handle passed to caches for * request delaying */ + bool rq_dropme; /* Catering to nfsd */ struct auth_domain * rq_client; /* RPC peer info */ struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 6359c42c4941..df1931f8ae98 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1005,6 +1005,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) rqstp->rq_splice_ok = 1; /* Will be turned off only when NFSv4 Sessions are used */ rqstp->rq_usedeferral = 1; + rqstp->rq_dropme = false; /* Setup reply header */ rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); @@ -1106,7 +1107,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); /* Encode reply */ - if (*statp == rpc_drop_reply) { + if (rqstp->rq_dropme) { if (procp->pc_release) procp->pc_release(rqstp, NULL, rqstp->rq_resp); goto dropit; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5eae53b1e306..173f3b975d49 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1019,6 +1019,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) } svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; + rqstp->rq_dropme = true; dr->handle.revisit = svc_revisit; return &dr->handle; -- cgit v1.2.3-71-gd317 From c45821d263a8a5109d69a9e8942b8d65bcd5f31a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 31 Oct 2010 00:04:44 -0400 Subject: locks: eliminate fl_mylease callback The nfs server only supports read delegations for now, so we don't care how conflicts are determined. All we care is that unlocks are recognized as matching the leases they are meant to remove. After the last patch, a comparison of struct files will work for that purpose. So we no longer need this callback. Signed-off-by: J. 
Bruce Fields --- fs/locks.c | 8 +------- fs/nfsd/nfs4state.c | 21 +-------------------- include/linux/fs.h | 1 - 3 files changed, 2 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 8729347bcd1a..5cb65062281a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl) fl->fl_file->f_owner.signum = 0; } -static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try) -{ - return fl->fl_file == try->fl_file; -} - static const struct lock_manager_operations lease_manager_ops = { .fl_break = lease_break_callback, .fl_release_private = lease_release_private_callback, - .fl_mylease = lease_mylease_callback, .fl_change = lease_modify, }; @@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (lease->fl_lmops->fl_mylease(fl, lease)) + if (fl->fl_file == lease->fl_file) my_before = before; else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cbe1b81c147d..87d4c48b6069 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2295,24 +2295,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl) nfsd4_cb_recall(dp); } -/* - * Called from setlease() with lock_flocks() held - */ -static -int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) -{ - struct nfs4_delegation *onlistd = - (struct nfs4_delegation *)onlist->fl_owner; - struct nfs4_delegation *tryd = - (struct nfs4_delegation *)try->fl_owner; - - if (onlist->fl_lmops != try->fl_lmops) - return 0; - - return onlistd->dl_client == tryd->dl_client; -} - - static int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) { @@ -2324,7 +2306,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, - .fl_mylease = nfsd_same_client_deleg_cb, .fl_change = nfsd_change_deleg_cb, }; @@ -2630,7 +2611,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dp->dl_flock = fl; /* vfs_setlease checks to see if delegation should be handed out. - * the lock_manager callbacks fl_mylease and fl_change are used + * the lock_manager callback fl_change is used */ if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); diff --git a/include/linux/fs.h b/include/linux/fs.h index 090f0eacde29..73ce69f728e1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1059,7 +1059,6 @@ struct lock_manager_operations { int (*fl_grant)(struct file_lock *, struct file_lock *, int); void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); - int (*fl_mylease)(struct file_lock *, struct file_lock *); int (*fl_change)(struct file_lock **, int); }; -- cgit v1.2.3-71-gd317 From 2ca72e17e5acb1052c35c9faba609c2289ce7a92 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 4 Jan 2011 17:37:15 -0500 Subject: nfsd4: move idmap and acl header files into fs/nfsd These are internal nfsd interfaces. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/acl.h | 61 +++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/idmap.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4acl.c | 2 +- fs/nfsd/nfs4idmap.c | 2 +- fs/nfsd/nfs4xdr.c | 5 ++-- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/vfs.c | 4 +-- include/linux/nfs4_acl.h | 61 ------------------------------------------- include/linux/nfsd_idmap.h | 64 ---------------------------------------------- 9 files changed, 133 insertions(+), 132 deletions(-) create mode 100644 fs/nfsd/acl.h create mode 100644 fs/nfsd/idmap.h delete mode 100644 include/linux/nfs4_acl.h delete mode 100644 include/linux/nfsd_idmap.h (limited to 'include/linux') diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h new file mode 100644 index 000000000000..c9c05a78e9bb --- /dev/null +++ b/fs/nfsd/acl.h @@ -0,0 +1,61 @@ +/* + * include/linux/nfs4_acl.c + * + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +#include + +/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to + * fit in a page: */ +#define NFS4_ACL_MAX 170 + +struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_get_whotype(char *, u32); +int nfs4_acl_write_who(int who, char *p); +int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, + uid_t who, u32 mask); + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, + struct posix_acl *, unsigned int flags); +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, + struct posix_acl **, unsigned int flags); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h new file mode 100644 index 000000000000..d4a2ac18bd4c --- /dev/null +++ b/fs/nfsd/idmap.h @@ -0,0 +1,64 @@ +/* + * include/linux/nfsd_idmap.h + * + * Mapping of UID to name and vice versa. 
+ * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. +> * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFSD_IDMAP_H +#define LINUX_NFSD_IDMAP_H + +#include +#include + +/* XXX from linux/nfs_idmap.h */ +#define IDMAP_NAMESZ 128 + +#ifdef CONFIG_NFSD_V4 +int nfsd_idmap_init(void); +void nfsd_idmap_shutdown(void); +#else +static inline int nfsd_idmap_init(void) +{ + return 0; +} +static inline void nfsd_idmap_shutdown(void) +{ +} +#endif + +int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); +int nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); +int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); +int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); + +#endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index e48052615159..ad88f1c0a4c3 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,7 +36,7 @@ #include #include -#include +#include "acl.h" /* mode bit translations: */ diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 844960fd0395..cbd599732765 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -33,10 +33,10 @@ */ #include -#include #include #include #include +#include "idmap.h" /* * Cache entry diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 364aae7d5998..2a0814d0ab1a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -44,13 +44,14 @@ #include #include #include -#include -#include #include +#include "idmap.h" +#include "acl.h" #include "xdr4.h" #include "vfs.h" + #define NFSDDBG_FACILITY NFSDDBG_XDR /* diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6840ec3ceecf..33b3e2b06779 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -8,12 +8,12 @@ #include #include -#include #include #include #include #include +#include "idmap.h" #include "nfsd.h" #include "cache.h" diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6a3af2ff3afe..b991125ce4a5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -35,8 +35,8 @@ #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 
-#include -#include +#include "acl.h" +#include "idmap.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h deleted file mode 100644 index c9c05a78e9bb..000000000000 --- a/include/linux/nfs4_acl.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * include/linux/nfs4_acl.c - * - * Common NFSv4 ACL handling definitions. - * - * Copyright (c) 2002 The Regents of the University of Michigan. - * All rights reserved. - * - * Marius Aamodt Eriksen - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef LINUX_NFS4_ACL_H -#define LINUX_NFS4_ACL_H - -#include - -/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to - * fit in a page: */ -#define NFS4_ACL_MAX 170 - -struct nfs4_acl *nfs4_acl_new(int); -int nfs4_acl_get_whotype(char *, u32); -int nfs4_acl_write_who(int who, char *p); -int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, - uid_t who, u32 mask); - -#define NFS4_ACL_TYPE_DEFAULT 0x01 -#define NFS4_ACL_DIR 0x02 -#define NFS4_ACL_OWNER 0x04 - -struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, - struct posix_acl *, unsigned int flags); -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, - struct posix_acl **, unsigned int flags); - -#endif /* LINUX_NFS4_ACL_H */ diff --git a/include/linux/nfsd_idmap.h b/include/linux/nfsd_idmap.h deleted file mode 100644 index d4a2ac18bd4c..000000000000 --- a/include/linux/nfsd_idmap.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * include/linux/nfsd_idmap.h - * - * Mapping of UID to name and vice versa. - * - * Copyright (c) 2002, 2003 The Regents of the University of - * Michigan. All rights reserved. -> * - * Marius Aamodt Eriksen - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef LINUX_NFSD_IDMAP_H -#define LINUX_NFSD_IDMAP_H - -#include -#include - -/* XXX from linux/nfs_idmap.h */ -#define IDMAP_NAMESZ 128 - -#ifdef CONFIG_NFSD_V4 -int nfsd_idmap_init(void); -void nfsd_idmap_shutdown(void); -#else -static inline int nfsd_idmap_init(void) -{ - return 0; -} -static inline void nfsd_idmap_shutdown(void) -{ -} -#endif - -int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); -int nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); -int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); - -#endif /* LINUX_NFSD_IDMAP_H */ -- cgit v1.2.3-71-gd317 From 91aa5fadb831e7b6ea473a526a6b49c6dc4819ce Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:31:04 +0000 Subject: ARM: PL08x: fix atomic_t usage and tx_submit() return value range The last_issued variable uses an atomic type, which is only incremented inside a protected region, and then read. Everywhere else only reads the value, so it isn't using atomic_t correctly, and it doesn't even need to. Moreover, the DMA engine code provides us with a variable for this already - chan.cookie. Use chan.cookie instead. Also, avoid negative dma_cookie_t values - negative returns from tx_submit() mean failure, yet in reality we always succeed. Restart from cookie 1, just like other DMA engine drivers do. 
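The cookie rule the patch adopts can be shown in a few lines of standalone C. struct toy_chan and toy_tx_submit() below are invented names for the sketch; the logic is the increment-and-skip-non-positive pattern used by pl08x_tx_submit() in the diff.

/* Standalone sketch (not driver code) of the cookie assignment rule. */
#include <stdio.h>

struct toy_chan {
        int cookie;                     /* plays the role of chan.cookie */
};

static int toy_tx_submit(struct toy_chan *chan)
{
        chan->cookie += 1;
        if (chan->cookie < 0)           /* counter wrapped at some point  */
                chan->cookie = 1;       /* 0 and negatives mean "failure" */
        return chan->cookie;
}

int main(void)
{
        struct toy_chan chan = { .cookie = -2 };        /* pretend it wrapped */

        printf("first submit:  %d\n", toy_tx_submit(&chan));    /* prints 1 */
        printf("second submit: %d\n", toy_tx_submit(&chan));    /* prints 2 */
        return 0;
}

Keeping cookies strictly positive matters because callers treat a negative return from tx_submit() as an error, and completion checks such as dma_async_is_complete() compare cookie values.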
Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 18 +++++++++--------- include/linux/amba/pl08x.h | 1 - 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 0809810f9e7a..5d9a15652dba 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -74,7 +74,6 @@ #include #include #include -#include #include #include @@ -1082,8 +1081,10 @@ static dma_cookie_t pl08x_tx_submit(struct dma_async_tx_descriptor *tx) { struct pl08x_dma_chan *plchan = to_pl08x_chan(tx->chan); - atomic_inc(&plchan->last_issued); - tx->cookie = atomic_read(&plchan->last_issued); + plchan->chan.cookie += 1; + if (plchan->chan.cookie < 0) + plchan->chan.cookie = 1; + tx->cookie = plchan->chan.cookie; /* This unlock follows the lock in the prep() function */ spin_unlock_irqrestore(&plchan->lock, plchan->lockflags); @@ -1115,7 +1116,7 @@ pl08x_dma_tx_status(struct dma_chan *chan, enum dma_status ret; u32 bytesleft = 0; - last_used = atomic_read(&plchan->last_issued); + last_used = plchan->chan.cookie; last_complete = plchan->lc; ret = dma_async_is_complete(cookie, last_complete, last_used); @@ -1131,7 +1132,7 @@ pl08x_dma_tx_status(struct dma_chan *chan, /* * This cookie not complete yet */ - last_used = atomic_read(&plchan->last_issued); + last_used = plchan->chan.cookie; last_complete = plchan->lc; /* Get number of bytes left in the active transactions and queue */ @@ -1641,8 +1642,7 @@ static void pl08x_tasklet(unsigned long data) /* * Update last completed */ - plchan->lc = - (plchan->at->tx.cookie); + plchan->lc = plchan->at->tx.cookie; /* * Callback to signal completion @@ -1820,8 +1820,8 @@ static int pl08x_dma_init_virtual_channels(struct pl08x_driver_data *pl08x, chan->name); chan->chan.device = dmadev; - atomic_set(&chan->last_issued, 0); - chan->lc = atomic_read(&chan->last_issued); + chan->chan.cookie = 0; + chan->lc = 0; spin_lock_init(&chan->lock); INIT_LIST_HEAD(&chan->desc_list); diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 521a0f8974ac..4ae62b4684f9 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -174,7 +174,6 @@ struct pl08x_dma_chan { struct pl08x_channel_data *cd; dma_addr_t runtime_addr; enum dma_data_direction runtime_direction; - atomic_t last_issued; dma_cookie_t lc; struct list_head desc_list; struct pl08x_txd *at; -- cgit v1.2.3-71-gd317 From 7cb72ad959b16ac594118977b7954a7d2ec7a052 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:35:28 +0000 Subject: ARM: PL08x: avoid 'void *' struct fields when we can type them properly Avoid using 'void *' struct fields when the structs are not defined in linux/amba/pl08x.h - instead, forward declare the struct names, and use these instead. This ensures we have proper typechecking. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 16 ++++++++-------- include/linux/amba/pl08x.h | 5 ++++- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 707fa081c9fa..fada97873d7d 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -107,7 +107,7 @@ struct vendor_data { * An LLI struct - see PL08x TRM. Note that next uses bit[0] as a bus bit, * start & end do not - their bus bit info is in cctl. 
*/ -struct lli { +struct pl08x_lli { dma_addr_t src; dma_addr_t dst; dma_addr_t next; @@ -160,7 +160,7 @@ struct pl08x_driver_data { /* Maximum times we call dma_pool_alloc on this pool without freeing */ #define PL08X_MAX_ALLOCS 0x40 -#define MAX_NUM_TSFR_LLIS (PL08X_LLI_TSFR_SIZE/sizeof(struct lli)) +#define MAX_NUM_TSFR_LLIS (PL08X_LLI_TSFR_SIZE/sizeof(struct pl08x_lli)) #define PL08X_ALIGN 8 static inline struct pl08x_dma_chan *to_pl08x_chan(struct dma_chan *chan) @@ -354,8 +354,8 @@ static u32 pl08x_getbytes_chan(struct pl08x_dma_chan *plchan) * currently active transaction. */ if (ch && txd) { - struct lli *llis_va = txd->llis_va; - struct lli *llis_bus = (struct lli *) txd->llis_bus; + struct pl08x_lli *llis_va = txd->llis_va; + struct pl08x_lli *llis_bus = (struct pl08x_lli *) txd->llis_bus; u32 clli = readl(ch->base + PL080_CH_LLI); /* First get the bytes in the current active LLI */ @@ -558,8 +558,8 @@ static int pl08x_fill_lli_for_desc(struct pl08x_driver_data *pl08x, struct pl08x_txd *txd, int num_llis, int len, u32 cctl, u32 *remainder) { - struct lli *llis_va = txd->llis_va; - struct lli *llis_bus = (struct lli *) txd->llis_bus; + struct pl08x_lli *llis_va = txd->llis_va; + struct pl08x_lli *llis_bus = (struct pl08x_lli *) txd->llis_bus; BUG_ON(num_llis >= MAX_NUM_TSFR_LLIS); @@ -620,8 +620,8 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, u32 cctl; int max_bytes_per_lli; int total_bytes = 0; - struct lli *llis_va; - struct lli *llis_bus; + struct pl08x_lli *llis_va; + struct pl08x_lli *llis_bus; txd->llis_va = dma_pool_alloc(pl08x->pool, GFP_NOWAIT, &txd->llis_bus); diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 4ae62b4684f9..3ecc20fce26a 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -22,6 +22,9 @@ #include #include +struct pl08x_lli; +struct pl08x_driver_data; + /** * struct pl08x_channel_data - data structure to pass info between * platform and PL08x driver regarding channel configuration @@ -179,7 +182,7 @@ struct pl08x_dma_chan { struct pl08x_txd *at; unsigned long lockflags; spinlock_t lock; - void *host; + struct pl08x_driver_data *host; enum pl08x_dma_chan_state state; bool slave; struct pl08x_txd *waiting; -- cgit v1.2.3-71-gd317 From cace658572ba5d1075f3891e823130a66f3e330f Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:37:31 +0000 Subject: ARM: PL08x: use 'size_t' for lengths Use size_t for variables denoting lengths throughout, and use the 'z' qualifier for printing the value. For safety, add a BUG_ON() in pl08x_fill_lli_for_desc() to catch the remainder potentially becoming negative. 
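Both points condensed into a standalone illustration (the helper name is invented; only the pattern matters): lengths live in size_t and are printed with the 'z' length modifier, and an explicit BUG_ON() traps what a "remainder < 0" test can no longer catch, since an unsigned remainder wraps around instead of going negative.

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/types.h>

/* Consume 'len' bytes from 'remainder', trapping would-be underflow. */
static size_t consume_len(size_t remainder, size_t len)
{
        BUG_ON(remainder < len);        /* size_t would wrap, never go negative */
        remainder -= len;
        pr_debug("%s: remainder now %zu bytes\n", __func__, remainder);
        return remainder;
}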
Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 40 +++++++++++++++++----------------------- include/linux/amba/pl08x.h | 4 ++-- 2 files changed, 19 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 760b71eec84c..fa78697790c0 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -342,7 +342,7 @@ static u32 pl08x_getbytes_chan(struct pl08x_dma_chan *plchan) struct pl08x_txd *txdi = NULL; struct pl08x_txd *txd; unsigned long flags; - u32 bytes = 0; + size_t bytes = 0; spin_lock_irqsave(&plchan->lock, flags); @@ -470,7 +470,7 @@ static inline unsigned int pl08x_get_bytes_for_cctl(unsigned int coded) } static inline u32 pl08x_cctl_bits(u32 cctl, u8 srcwidth, u8 dstwidth, - u32 tsize) + size_t tsize) { u32 retbits = cctl; @@ -583,6 +583,8 @@ static int pl08x_fill_lli_for_desc(struct pl08x_driver_data *pl08x, if (cctl & PL080_CONTROL_DST_INCR) txd->dstbus.addr += len; + BUG_ON(*remainder < len); + *remainder -= len; return num_llis + 1; @@ -591,7 +593,7 @@ static int pl08x_fill_lli_for_desc(struct pl08x_driver_data *pl08x, /* * Return number of bytes to fill to boundary, or len */ -static inline u32 pl08x_pre_boundary(u32 addr, u32 len) +static inline size_t pl08x_pre_boundary(u32 addr, size_t len) { u32 boundary; @@ -614,11 +616,11 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, { struct pl08x_channel_data *cd = txd->cd; struct pl08x_bus_data *mbus, *sbus; - u32 remainder; + size_t remainder; int num_llis = 0; u32 cctl; - int max_bytes_per_lli; - int total_bytes = 0; + size_t max_bytes_per_lli; + size_t total_bytes = 0; struct pl08x_lli *llis_va; txd->llis_va = dma_pool_alloc(pl08x->pool, GFP_NOWAIT, @@ -686,13 +688,13 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, max_bytes_per_lli = min(txd->srcbus.buswidth, txd->dstbus.buswidth) * PL080_CONTROL_TRANSFER_SIZE_MASK; dev_vdbg(&pl08x->adev->dev, - "%s max bytes per lli = %d\n", + "%s max bytes per lli = %zu\n", __func__, max_bytes_per_lli); /* We need to count this down to zero */ remainder = txd->len; dev_vdbg(&pl08x->adev->dev, - "%s remainder = %d\n", + "%s remainder = %zu\n", __func__, remainder); /* @@ -760,9 +762,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, * width left */ while (remainder > (mbus->buswidth - 1)) { - int lli_len, target_len; - int tsize; - int odd_bytes; + size_t lli_len, target_len, tsize, odd_bytes; /* * If enough left try to send max possible, @@ -805,7 +805,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, if (lli_len <= 0) { dev_err(&pl08x->adev->dev, - "%s lli_len is %d, <= 0\n", + "%s lli_len is %zu, <= 0\n", __func__, lli_len); return 0; } @@ -853,7 +853,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, if (target_len != lli_len) { dev_vdbg(&pl08x->adev->dev, - "%s can't send what we want. Desired 0x%08x, lli of 0x%08x bytes in txd of 0x%08x\n", + "%s can't send what we want. 
Desired 0x%08zx, lli of 0x%08zx bytes in txd of 0x%08zx\n", __func__, target_len, lli_len, txd->len); } @@ -863,7 +863,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, tsize); dev_vdbg(&pl08x->adev->dev, - "%s fill lli with single lli chunk of size 0x%08x (remainder 0x%08x)\n", + "%s fill lli with single lli chunk of size 0x%08zx (remainder 0x%08zx)\n", __func__, lli_len, remainder); num_llis = pl08x_fill_lli_for_desc(pl08x, txd, num_llis, lli_len, cctl, @@ -882,7 +882,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, && (remainder); j++) { cctl = pl08x_cctl_bits(cctl, 1, 1, 1); dev_vdbg(&pl08x->adev->dev, - "%s align with boundary, single byte (remain 0x%08x)\n", + "%s align with boundary, single byte (remain 0x%08zx)\n", __func__, remainder); num_llis = pl08x_fill_lli_for_desc(pl08x, @@ -896,16 +896,10 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, /* * Send any odd bytes */ - if (remainder < 0) { - dev_err(&pl08x->adev->dev, "%s remainder not fitted 0x%08x bytes\n", - __func__, remainder); - return 0; - } - while (remainder) { cctl = pl08x_cctl_bits(cctl, 1, 1, 1); dev_vdbg(&pl08x->adev->dev, - "%s align with boundary, single odd byte (remain %d)\n", + "%s align with boundary, single odd byte (remain %zu)\n", __func__, remainder); num_llis = pl08x_fill_lli_for_desc(pl08x, txd, num_llis, 1, cctl, &remainder); @@ -914,7 +908,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, } if (total_bytes != txd->len) { dev_err(&pl08x->adev->dev, - "%s size of encoded lli:s don't match total txd, transferred 0x%08x from size 0x%08x\n", + "%s size of encoded lli:s don't match total txd, transferred 0x%08zx from size 0x%08zx\n", __func__, total_bytes, txd->len); return 0; } diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 3ecc20fce26a..2c834ed5f41f 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -77,7 +77,7 @@ struct pl08x_bus_data { dma_addr_t addr; u8 maxwidth; u8 buswidth; - u32 fill_bytes; + size_t fill_bytes; }; /** @@ -113,7 +113,7 @@ struct pl08x_txd { enum dma_data_direction direction; struct pl08x_bus_data srcbus; struct pl08x_bus_data dstbus; - int len; + size_t len; dma_addr_t llis_bus; void *llis_va; struct pl08x_channel_data *cd; -- cgit v1.2.3-71-gd317 From 19524d77ec34faf58d313ba34fb755ef6e159216 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:39:13 +0000 Subject: ARM: PL08x: avoid duplicating registers in txd and phychan structures As we now have all the code accessing the phychan {csrc,cdst,clli,cctl, ccfg} members in one function, there's no point storing the data into the struct. Get rid of the struct members. Re-order the register dump in the dev_dbg() to reflect the order we write the registers to the DMA device. The txd {csrc,cdst,clli,cctl} values are duplicates of the lli[0] values, so there's no point duplicating these either. Program the DMAC registers directly from the lli[0] values. 
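The diff below is the real change; as a reduced sketch (the wrapper name and the locally defined struct are illustrative stand-ins for the driver's pl08x_lli), starting a transfer becomes a straight copy of the first LLI's fields into the channel registers, with only ccfg supplied separately:

#include <linux/amba/pl080.h>
#include <linux/io.h>
#include <linux/types.h>

struct example_lli {            /* mirrors the layout the driver programs */
        dma_addr_t src;
        dma_addr_t dst;
        dma_addr_t lli;
        u32 cctl;
};

/* Program a physical channel's registers directly from the first LLI. */
static void start_from_first_lli(void __iomem *base,
                                 const struct example_lli *lli, u32 ccfg)
{
        writel(lli->src,  base + PL080_CH_SRC_ADDR);
        writel(lli->dst,  base + PL080_CH_DST_ADDR);
        writel(lli->lli,  base + PL080_CH_LLI);
        writel(lli->cctl, base + PL080_CH_CONTROL);
        writel(ccfg,      base + PL080_CH_CONFIG);
}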
Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 57 ++++++++++++++++------------------------------ include/linux/amba/pl08x.h | 13 ----------- 2 files changed, 19 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index c025a4b4bae4..a1a18bde6b7f 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -193,33 +193,25 @@ static void pl08x_start_txd(struct pl08x_dma_chan *plchan, { struct pl08x_driver_data *pl08x = plchan->host; struct pl08x_phy_chan *phychan = plchan->phychan; - u32 val; + struct pl08x_lli *lli = &txd->llis_va[0]; + u32 val, ccfg; plchan->at = txd; - /* Copy the basic control register calculated at transfer config */ - phychan->csrc = txd->csrc; - phychan->cdst = txd->cdst; - phychan->clli = txd->clli; - phychan->cctl = txd->cctl; - /* Assign the signal to the proper control registers */ - phychan->ccfg = plchan->cd->ccfg; - phychan->ccfg &= ~PL080_CONFIG_SRC_SEL_MASK; - phychan->ccfg &= ~PL080_CONFIG_DST_SEL_MASK; + ccfg = plchan->cd->ccfg; + ccfg &= ~(PL080_CONFIG_SRC_SEL_MASK | PL080_CONFIG_DST_SEL_MASK); + /* If it wasn't set from AMBA, ignore it */ if (txd->direction == DMA_TO_DEVICE) /* Select signal as destination */ - phychan->ccfg |= - (phychan->signal << PL080_CONFIG_DST_SEL_SHIFT); + ccfg |= phychan->signal << PL080_CONFIG_DST_SEL_SHIFT; else if (txd->direction == DMA_FROM_DEVICE) /* Select signal as source */ - phychan->ccfg |= - (phychan->signal << PL080_CONFIG_SRC_SEL_SHIFT); - /* Always enable error interrupts */ - phychan->ccfg |= PL080_CONFIG_ERR_IRQ_MASK; - /* Always enable terminal interrupts */ - phychan->ccfg |= PL080_CONFIG_TC_IRQ_MASK; + ccfg |= phychan->signal << PL080_CONFIG_SRC_SEL_SHIFT; + + /* Always enable error and terminal interrupts */ + ccfg |= PL080_CONFIG_ERR_IRQ_MASK | PL080_CONFIG_TC_IRQ_MASK; /* Wait for channel inactive */ while (pl08x_phy_channel_busy(phychan)) @@ -227,19 +219,15 @@ static void pl08x_start_txd(struct pl08x_dma_chan *plchan, dev_vdbg(&pl08x->adev->dev, "WRITE channel %d: csrc=0x%08x, cdst=0x%08x, " - "cctl=0x%08x, clli=0x%08x, ccfg=0x%08x\n", - phychan->id, - phychan->csrc, - phychan->cdst, - phychan->cctl, - phychan->clli, - phychan->ccfg); - - writel(phychan->csrc, phychan->base + PL080_CH_SRC_ADDR); - writel(phychan->cdst, phychan->base + PL080_CH_DST_ADDR); - writel(phychan->clli, phychan->base + PL080_CH_LLI); - writel(phychan->cctl, phychan->base + PL080_CH_CONTROL); - writel(phychan->ccfg, phychan->base + PL080_CH_CONFIG); + "clli=0x%08x, cctl=0x%08x, ccfg=0x%08x\n", + phychan->id, lli->src, lli->dst, lli->lli, lli->cctl, + ccfg); + + writel(lli->src, phychan->base + PL080_CH_SRC_ADDR); + writel(lli->dst, phychan->base + PL080_CH_DST_ADDR); + writel(lli->lli, phychan->base + PL080_CH_LLI); + writel(lli->cctl, phychan->base + PL080_CH_CONTROL); + writel(ccfg, phychan->base + PL080_CH_CONFIG); /* Enable the DMA channel */ /* Do not access config register until channel shows as disabled */ @@ -920,13 +908,6 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, */ llis_va[num_llis - 1].cctl |= PL080_CONTROL_TC_IRQ_EN; - /* Now store the channel register values */ - txd->csrc = llis_va[0].src; - txd->cdst = llis_va[0].dst; - txd->clli = llis_va[0].lli; - txd->cctl = llis_va[0].cctl; - /* ccfg will be set at physical channel allocation time */ - #ifdef VERBOSE_DEBUG { int i; diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 
2c834ed5f41f..29d974562df9 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -95,11 +95,6 @@ struct pl08x_phy_chan { spinlock_t lock; int signal; struct pl08x_dma_chan *serving; - u32 csrc; - u32 cdst; - u32 clli; - u32 cctl; - u32 ccfg; }; /** @@ -118,14 +113,6 @@ struct pl08x_txd { void *llis_va; struct pl08x_channel_data *cd; bool active; - /* - * Settings to be put into the physical channel when we - * trigger this txd - */ - u32 csrc; - u32 cdst; - u32 clli; - u32 cctl; }; /** -- cgit v1.2.3-71-gd317 From 4983a04fd2562986360b646b378f267308bc22c0 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:39:33 +0000 Subject: ARM: PL08x: move ccfg into txd structure The ccfg register is used to configure the channel parameters - the type and direction of transfer, the flow control signal and IRQ mask enables. The type and direction of transfer is known in the relevent prep_* function where a txd is created. The IRQ mask enables are always set, and the flow control signals are always set when we start processing a txd according to phychan->signal. If we store the ccfg value in the txd structure, we can avoid modifying platform data - and even having it in platform data at all. So, remove it from platform data too. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 28 ++++++++++++---------------- include/linux/amba/pl08x.h | 6 +++++- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index a1a18bde6b7f..75f9e2d4b032 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -194,15 +194,11 @@ static void pl08x_start_txd(struct pl08x_dma_chan *plchan, struct pl08x_driver_data *pl08x = plchan->host; struct pl08x_phy_chan *phychan = plchan->phychan; struct pl08x_lli *lli = &txd->llis_va[0]; - u32 val, ccfg; + u32 val, ccfg = txd->ccfg; plchan->at = txd; - /* Assign the signal to the proper control registers */ - ccfg = plchan->cd->ccfg; - ccfg &= ~(PL080_CONFIG_SRC_SEL_MASK | PL080_CONFIG_DST_SEL_MASK); - - /* If it wasn't set from AMBA, ignore it */ + /* Assign the flow control signal to this channel */ if (txd->direction == DMA_TO_DEVICE) /* Select signal as destination */ ccfg |= phychan->signal << PL080_CONFIG_DST_SEL_SHIFT; @@ -210,9 +206,6 @@ static void pl08x_start_txd(struct pl08x_dma_chan *plchan, /* Select signal as source */ ccfg |= phychan->signal << PL080_CONFIG_SRC_SEL_SHIFT; - /* Always enable error and terminal interrupts */ - ccfg |= PL080_CONFIG_ERR_IRQ_MASK | PL080_CONFIG_TC_IRQ_MASK; - /* Wait for channel inactive */ while (pl08x_phy_channel_busy(phychan)) cpu_relax(); @@ -1161,8 +1154,6 @@ static void dma_set_runtime_config(struct dma_chan *chan, enum dma_slave_buswidth addr_width; u32 maxburst; u32 cctl = 0; - /* Mask out all except src and dst channel */ - u32 ccfg = cd->ccfg & 0x000003DEU; int i; /* Transfer direction */ @@ -1170,13 +1161,11 @@ static void dma_set_runtime_config(struct dma_chan *chan, if (config->direction == DMA_TO_DEVICE) { plchan->runtime_addr = config->dst_addr; cctl |= PL080_CONTROL_SRC_INCR; - ccfg |= PL080_FLOW_MEM2PER << PL080_CONFIG_FLOW_CONTROL_SHIFT; addr_width = config->dst_addr_width; maxburst = config->dst_maxburst; } else if (config->direction == DMA_FROM_DEVICE) { plchan->runtime_addr = config->src_addr; cctl |= PL080_CONTROL_DST_INCR; - ccfg |= PL080_FLOW_PER2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; addr_width = 
config->src_addr_width; maxburst = config->src_maxburst; } else { @@ -1226,16 +1215,15 @@ static void dma_set_runtime_config(struct dma_chan *chan, /* Modify the default channel data to fit PrimeCell request */ cd->cctl = cctl; - cd->ccfg = ccfg; dev_dbg(&pl08x->adev->dev, "configured channel %s (%s) for %s, data width %d, " - "maxburst %d words, LE, CCTL=0x%08x, CCFG=0x%08x\n", + "maxburst %d words, LE, CCTL=0x%08x\n", dma_chan_name(chan), plchan->name, (config->direction == DMA_FROM_DEVICE) ? "RX" : "TX", addr_width, maxburst, - cctl, ccfg); + cctl); } /* @@ -1340,6 +1328,10 @@ static struct pl08x_txd *pl08x_get_txd(struct pl08x_dma_chan *plchan) dma_async_tx_descriptor_init(&txd->tx, &plchan->chan); txd->tx.tx_submit = pl08x_tx_submit; INIT_LIST_HEAD(&txd->node); + + /* Always enable error and terminal interrupts */ + txd->ccfg = PL080_CONFIG_ERR_IRQ_MASK | + PL080_CONFIG_TC_IRQ_MASK; } return txd; } @@ -1369,6 +1361,8 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( /* Set platform data for m2m */ txd->cd = &pl08x->pd->memcpy_channel; + txd->ccfg |= PL080_FLOW_MEM2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; + /* Both to be incremented or the code will break */ txd->cd->cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; txd->len = len; @@ -1424,12 +1418,14 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( */ txd->direction = direction; if (direction == DMA_TO_DEVICE) { + txd->ccfg |= PL080_FLOW_MEM2PER << PL080_CONFIG_FLOW_CONTROL_SHIFT; txd->srcbus.addr = sgl->dma_address; if (plchan->runtime_addr) txd->dstbus.addr = plchan->runtime_addr; else txd->dstbus.addr = plchan->cd->addr; } else if (direction == DMA_FROM_DEVICE) { + txd->ccfg |= PL080_FLOW_PER2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; if (plchan->runtime_addr) txd->srcbus.addr = plchan->runtime_addr; else diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 29d974562df9..8e74cb1845dd 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -58,7 +58,6 @@ struct pl08x_channel_data { int max_signal; u32 muxval; u32 cctl; - u32 ccfg; dma_addr_t addr; bool circular_buffer; bool single; @@ -113,6 +112,11 @@ struct pl08x_txd { void *llis_va; struct pl08x_channel_data *cd; bool active; + /* + * Settings to be put into the physical channel when we + * trigger this txd. Other registers are in llis_va[0]. + */ + u32 ccfg; }; /** -- cgit v1.2.3-71-gd317 From 70b5ed6b6d72cd8b1a3d4b7b878a0dd132bec7ba Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:40:13 +0000 Subject: ARM: PL08x: move default cctl into txd structure Rather than modifying platform data while preparing a transfer, copy the cctl value into the txd structure and modify the value there. 
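The same idea in miniature, with every name invented for illustration: the per-transfer descriptor takes a private copy of the template control word and all per-transfer adjustments land on that copy, so the shared channel data is never written during prepare and concurrent preparations cannot trample each other's cctl.

#include <linux/types.h>

#define EXAMPLE_CCTL_SRC_INCR   (1u << 26)      /* placeholder bit positions */
#define EXAMPLE_CCTL_DST_INCR   (1u << 27)

struct example_chan_data {      /* shared template, effectively read-only */
        u32 cctl;
};

struct example_txd {            /* one instance per transfer */
        u32 cctl;
};

static void example_prep_memcpy(struct example_txd *txd,
                                const struct example_chan_data *cd)
{
        txd->cctl = cd->cctl;                   /* copy the default... */
        txd->cctl |= EXAMPLE_CCTL_SRC_INCR |    /* ...then modify the copy, */
                     EXAMPLE_CCTL_DST_INCR;     /* never the shared template */
}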
Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 21 ++++++--------------- include/linux/amba/pl08x.h | 3 ++- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index f0a29885cb83..6d224d41be2d 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -577,7 +577,6 @@ static inline size_t pl08x_pre_boundary(u32 addr, size_t len) static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, struct pl08x_txd *txd) { - struct pl08x_channel_data *cd = txd->cd; struct pl08x_bus_data *mbus, *sbus; size_t remainder; int num_llis = 0; @@ -595,17 +594,8 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, pl08x->pool_ctr++; - /* - * Initialize bus values for this transfer - * from the passed optimal values - */ - if (!cd) { - dev_err(&pl08x->adev->dev, "%s no channel data\n", __func__); - return 0; - } - - /* Get the default CCTL from the platform data */ - cctl = cd->cctl; + /* Get the default CCTL */ + cctl = txd->cctl; /* * On the PL080 we have two bus masters and we @@ -1358,11 +1348,11 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( txd->dstbus.addr = dest; /* Set platform data for m2m */ - txd->cd = &pl08x->pd->memcpy_channel; txd->ccfg |= PL080_FLOW_MEM2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; + txd->cctl = pl08x->pd->memcpy_channel.cctl; /* Both to be incremented or the code will break */ - txd->cd->cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; + txd->cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; txd->len = len; ret = pl08x_prep_channel_resources(plchan, txd); @@ -1415,6 +1405,8 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( * channel target address dynamically at runtime. */ txd->direction = direction; + txd->cctl = plchan->cd->cctl; + if (direction == DMA_TO_DEVICE) { txd->ccfg |= PL080_FLOW_MEM2PER << PL080_CONFIG_FLOW_CONTROL_SHIFT; txd->srcbus.addr = sgl->dma_address; @@ -1434,7 +1426,6 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( "%s direction unsupported\n", __func__); return NULL; } - txd->cd = plchan->cd; txd->len = sgl->length; ret = pl08x_prep_channel_resources(plchan, txd); diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 8e74cb1845dd..8d9083067d3d 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -110,8 +110,9 @@ struct pl08x_txd { size_t len; dma_addr_t llis_bus; void *llis_va; - struct pl08x_channel_data *cd; bool active; + /* Default cctl value for LLIs */ + u32 cctl; /* * Settings to be put into the physical channel when we * trigger this txd. Other registers are in llis_va[0]. -- cgit v1.2.3-71-gd317 From 30749cb4a40f02a199640011e5ab5c5f60b8482e Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:41:13 +0000 Subject: ARM: PL08x: allow AHB master port selection to be configured Platforms need to be able to control which AHB master interface is used, as each AHB master interface may be asymetric. Allow the interfaces used for fetching LLIs, memory, and each peripheral to be configured individually. 
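As a sketch of what a board file might pass in once this is in place (the values and the peripheral name are invented, only fields added or shown in this patch are used, and the rest of the platform data is elided): LLI fetches and the memory side stay on AHB1, while a peripheral reachable only from the second master advertises AHB2.

#include <linux/amba/pl08x.h>

static struct pl08x_channel_data board_dma_slaves[] = {
        {
                .bus_id       = "uart0_tx",     /* hypothetical device */
                .periph_buses = PL08X_AHB2,     /* only reachable via AHB2 */
        },
};

static struct pl08x_platform_data board_pl08x_data = {
        .slave_channels = board_dma_slaves,
        .lli_buses      = PL08X_AHB1,           /* fetch LLIs over AHB1 */
        .mem_buses      = PL08X_AHB1,           /* memory accesses on AHB1 */
        /* memcpy_channel, get_signal, put_signal, etc. omitted here */
};

With those masks, pl08x_select_bus() in the diff below routes source and destination so that, where the allowable settings permit, they end up on different ports.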
Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 77 ++++++++++++++++++++++++++-------------------- include/linux/amba/pl08x.h | 15 +++++++-- 2 files changed, 56 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 4ee0ab19b1b6..986c12775b0b 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -126,6 +126,8 @@ struct pl08x_lli { * @phy_chans: array of data for the physical channels * @pool: a pool for the LLI descriptors * @pool_ctr: counter of LLIs in the pool + * @lli_buses: bitmask to or in to LLI pointer selecting AHB port for LLI fetches + * @mem_buses: set to indicate memory transfers on AHB2. * @lock: a spinlock for this struct */ struct pl08x_driver_data { @@ -138,6 +140,8 @@ struct pl08x_driver_data { struct pl08x_phy_chan *phy_chans; struct dma_pool *pool; int pool_ctr; + u8 lli_buses; + u8 mem_buses; spinlock_t lock; }; @@ -526,20 +530,12 @@ static int pl08x_fill_lli_for_desc(struct pl08x_driver_data *pl08x, BUG_ON(num_llis >= MAX_NUM_TSFR_LLIS); - llis_va[num_llis].cctl = cctl; - llis_va[num_llis].src = txd->srcbus.addr; - llis_va[num_llis].dst = txd->dstbus.addr; - - /* - * On versions with dual masters, you can optionally AND on - * PL080_LLI_LM_AHB2 to the LLI to tell the hardware to read - * in new LLIs with that controller, but we always try to - * choose AHB1 to point into memory. The idea is to have AHB2 - * fixed on the peripheral and AHB1 messing around in the - * memory. So we don't manipulate this bit currently. - */ - + llis_va[num_llis].cctl = cctl; + llis_va[num_llis].src = txd->srcbus.addr; + llis_va[num_llis].dst = txd->dstbus.addr; llis_va[num_llis].lli = llis_bus + (num_llis + 1) * sizeof(struct pl08x_lli); + if (pl08x->lli_buses & PL08X_AHB2) + llis_va[num_llis].lli |= PL080_LLI_LM_AHB2; if (cctl & PL080_CONTROL_SRC_INCR) txd->srcbus.addr += len; @@ -639,13 +635,6 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, pl08x_choose_master_bus(&txd->srcbus, &txd->dstbus, &mbus, &sbus, cctl); - - /* - * The lowest bit of the LLI register - * is also used to indicate which master to - * use for reading the LLIs. - */ - if (txd->len < mbus->buswidth) { /* * Less than a bus width available @@ -1282,6 +1271,23 @@ static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, return 0; } +/* + * Given the source and destination available bus masks, select which + * will be routed to each port. We try to have source and destination + * on separate ports, but always respect the allowable settings. + */ +static u32 pl08x_select_bus(struct pl08x_driver_data *pl08x, u8 src, u8 dst) +{ + u32 cctl = 0; + + if (!(dst & PL08X_AHB1) || ((dst & PL08X_AHB2) && (src & PL08X_AHB1))) + cctl |= PL080_CONTROL_DST_AHB2; + if (!(src & PL08X_AHB1) || ((src & PL08X_AHB2) && !(dst & PL08X_AHB2))) + cctl |= PL080_CONTROL_SRC_AHB2; + + return cctl; +} + static struct pl08x_txd *pl08x_get_txd(struct pl08x_dma_chan *plchan) { struct pl08x_txd *txd = kzalloc(sizeof(struct pl08x_txd), GFP_NOWAIT); @@ -1330,15 +1336,9 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( /* Both to be incremented or the code will break */ txd->cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; - /* - * On the PL080 we have two bus masters and we should select one for - * source and one for destination. 
We try to use AHB2 for the bus - * which does not increment (typically the peripheral) else we just - * choose something. - */ if (pl08x->vd->dualmaster) - /* Source increments, use AHB2 for destination */ - txd->cctl |= PL080_CONTROL_DST_AHB2; + txd->cctl |= pl08x_select_bus(pl08x, + pl08x->mem_buses, pl08x->mem_buses); ret = pl08x_prep_channel_resources(plchan, txd); if (ret) @@ -1359,6 +1359,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( struct pl08x_dma_chan *plchan = to_pl08x_chan(chan); struct pl08x_driver_data *pl08x = plchan->host; struct pl08x_txd *txd; + u8 src_buses, dst_buses; int ret; /* @@ -1403,31 +1404,31 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( if (direction == DMA_TO_DEVICE) { txd->ccfg |= PL080_FLOW_MEM2PER << PL080_CONFIG_FLOW_CONTROL_SHIFT; txd->cctl |= PL080_CONTROL_SRC_INCR; - if (pl08x->vd->dualmaster) - /* Source increments, use AHB2 for destination */ - txd->cctl |= PL080_CONTROL_DST_AHB2; txd->srcbus.addr = sgl->dma_address; if (plchan->runtime_addr) txd->dstbus.addr = plchan->runtime_addr; else txd->dstbus.addr = plchan->cd->addr; + src_buses = pl08x->mem_buses; + dst_buses = plchan->cd->periph_buses; } else if (direction == DMA_FROM_DEVICE) { txd->ccfg |= PL080_FLOW_PER2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; txd->cctl |= PL080_CONTROL_DST_INCR; - if (pl08x->vd->dualmaster) - /* Destination increments, use AHB2 for source */ - txd->cctl |= PL080_CONTROL_SRC_AHB2; if (plchan->runtime_addr) txd->srcbus.addr = plchan->runtime_addr; else txd->srcbus.addr = plchan->cd->addr; txd->dstbus.addr = sgl->dma_address; + src_buses = plchan->cd->periph_buses; + dst_buses = pl08x->mem_buses; } else { dev_err(&pl08x->adev->dev, "%s direction unsupported\n", __func__); return NULL; } + txd->cctl |= pl08x_select_bus(pl08x, src_buses, dst_buses); + ret = pl08x_prep_channel_resources(plchan, txd); if (ret) return NULL; @@ -1879,6 +1880,14 @@ static int pl08x_probe(struct amba_device *adev, struct amba_id *id) pl08x->adev = adev; pl08x->vd = vd; + /* By default, AHB1 only. If dualmaster, from platform */ + pl08x->lli_buses = PL08X_AHB1; + pl08x->mem_buses = PL08X_AHB1; + if (pl08x->vd->dualmaster) { + pl08x->lli_buses = pl08x->pd->lli_buses; + pl08x->mem_buses = pl08x->pd->mem_buses; + } + /* A DMA memory pool for LLIs, align on 1-byte boundary */ pl08x->pool = dma_pool_create(DRIVER_NAME, &pl08x->adev->dev, PL08X_LLI_TSFR_SIZE, PL08X_ALIGN, 0); diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 8d9083067d3d..f858651027fd 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -25,6 +25,12 @@ struct pl08x_lli; struct pl08x_driver_data; +/* Bitmasks for selecting AHB ports for DMA transfers */ +enum { + PL08X_AHB1 = (1 << 0), + PL08X_AHB2 = (1 << 1) +}; + /** * struct pl08x_channel_data - data structure to pass info between * platform and PL08x driver regarding channel configuration @@ -51,6 +57,8 @@ struct pl08x_driver_data; * round round round) * @single: the device connected to this channel will request single * DMA transfers, not bursts. (Bursts are default.) + * @periph_buses: the device connected to this channel is accessible via + * these buses (use PL08X_AHB1 | PL08X_AHB2). 
*/ struct pl08x_channel_data { char *bus_id; @@ -61,6 +69,7 @@ struct pl08x_channel_data { dma_addr_t addr; bool circular_buffer; bool single; + u8 periph_buses; }; /** @@ -193,8 +202,8 @@ struct pl08x_dma_chan { * less than zero, else it returns the allocated signal number * @put_signal: indicate to the platform that this physical signal is not * running any DMA transfer and multiplexing can be recycled - * @bus_bit_lli: Bit[0] of the address indicated which AHB bus master the - * LLI addresses are on 0/1 Master 1/2. + * @lli_buses: buses which LLIs can be fetched from: PL08X_AHB1 | PL08X_AHB2 + * @mem_buses: buses which memory can be accessed from: PL08X_AHB1 | PL08X_AHB2 */ struct pl08x_platform_data { struct pl08x_channel_data *slave_channels; @@ -202,6 +211,8 @@ struct pl08x_platform_data { struct pl08x_channel_data memcpy_channel; int (*get_signal)(struct pl08x_dma_chan *); void (*put_signal)(struct pl08x_dma_chan *); + u8 lli_buses; + u8 mem_buses; }; #ifdef CONFIG_AMBA_PL08X -- cgit v1.2.3-71-gd317 From d7244e9a27a3da27d62aabf560ee828d7991493e Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:43:35 +0000 Subject: ARM: PL08x: shrink srcbus/dstbus in txd structure We only need to store the dma address. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 20 ++++++++++---------- include/linux/amba/pl08x.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 39970c5038d6..8d8d33ce5684 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -595,8 +595,8 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, bd.txd = txd; bd.pl08x = pl08x; - bd.srcbus.addr = txd->srcbus.addr; - bd.dstbus.addr = txd->dstbus.addr; + bd.srcbus.addr = txd->src_addr; + bd.dstbus.addr = txd->dst_addr; /* Find maximum width of the source bus */ bd.srcbus.maxwidth = @@ -1313,8 +1313,8 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( } txd->direction = DMA_NONE; - txd->srcbus.addr = src; - txd->dstbus.addr = dest; + txd->src_addr = src; + txd->dst_addr = dest; txd->len = len; /* Set platform data for m2m */ @@ -1393,21 +1393,21 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( if (direction == DMA_TO_DEVICE) { txd->ccfg |= PL080_FLOW_MEM2PER << PL080_CONFIG_FLOW_CONTROL_SHIFT; txd->cctl |= PL080_CONTROL_SRC_INCR; - txd->srcbus.addr = sgl->dma_address; + txd->src_addr = sgl->dma_address; if (plchan->runtime_addr) - txd->dstbus.addr = plchan->runtime_addr; + txd->dst_addr = plchan->runtime_addr; else - txd->dstbus.addr = plchan->cd->addr; + txd->dst_addr = plchan->cd->addr; src_buses = pl08x->mem_buses; dst_buses = plchan->cd->periph_buses; } else if (direction == DMA_FROM_DEVICE) { txd->ccfg |= PL080_FLOW_PER2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; txd->cctl |= PL080_CONTROL_DST_INCR; if (plchan->runtime_addr) - txd->srcbus.addr = plchan->runtime_addr; + txd->src_addr = plchan->runtime_addr; else - txd->srcbus.addr = plchan->cd->addr; - txd->dstbus.addr = sgl->dma_address; + txd->src_addr = plchan->cd->addr; + txd->dst_addr = sgl->dma_address; src_buses = plchan->cd->periph_buses; dst_buses = pl08x->mem_buses; } else { diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index f858651027fd..4e9e71860e1c 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -114,8 +114,8 @@ struct pl08x_txd { struct dma_async_tx_descriptor 
tx; struct list_head node; enum dma_data_direction direction; - struct pl08x_bus_data srcbus; - struct pl08x_bus_data dstbus; + dma_addr_t src_addr; + dma_addr_t dst_addr; size_t len; dma_addr_t llis_bus; void *llis_va; -- cgit v1.2.3-71-gd317 From 15c17232fbd1f7687c740c3c26f9e7f337bd9e36 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:44:36 +0000 Subject: ARM: PL08x: rename 'desc_list' as 'pend_list' This 'desc_list' is actually a list of pending descriptors, so name it after its function (pending list) rather than what it contains (descriptors). Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 20 ++++++++++---------- include/linux/amba/pl08x.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index fb469dedcdf3..433b9e747f75 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -348,9 +348,9 @@ static u32 pl08x_getbytes_chan(struct pl08x_dma_chan *plchan) } /* Sum up all queued transactions */ - if (!list_empty(&plchan->desc_list)) { + if (!list_empty(&plchan->pend_list)) { struct pl08x_txd *txdi; - list_for_each_entry(txdi, &plchan->desc_list, node) { + list_for_each_entry(txdi, &plchan->pend_list, node) { bytes += txdi->len; } } @@ -880,9 +880,9 @@ static void pl08x_free_txd_list(struct pl08x_driver_data *pl08x, struct pl08x_txd *txdi = NULL; struct pl08x_txd *next; - if (!list_empty(&plchan->desc_list)) { + if (!list_empty(&plchan->pend_list)) { list_for_each_entry_safe(txdi, - next, &plchan->desc_list, node) { + next, &plchan->pend_list, node) { list_del(&txdi->node); pl08x_free_txd(pl08x, txdi); } @@ -1183,10 +1183,10 @@ static void pl08x_issue_pending(struct dma_chan *chan) } /* Take the first element in the queue and execute it */ - if (!list_empty(&plchan->desc_list)) { + if (!list_empty(&plchan->pend_list)) { struct pl08x_txd *next; - next = list_first_entry(&plchan->desc_list, + next = list_first_entry(&plchan->pend_list, struct pl08x_txd, node); list_del(&next->node); @@ -1213,7 +1213,7 @@ static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, spin_lock_irqsave(&plchan->lock, plchan->lockflags); - list_add_tail(&txd->node, &plchan->desc_list); + list_add_tail(&txd->node, &plchan->pend_list); /* * See if we already have a physical channel allocated, @@ -1571,10 +1571,10 @@ static void pl08x_tasklet(unsigned long data) * If a new descriptor is queued, set it up * plchan->at is NULL here */ - if (!list_empty(&plchan->desc_list)) { + if (!list_empty(&plchan->pend_list)) { struct pl08x_txd *next; - next = list_first_entry(&plchan->desc_list, + next = list_first_entry(&plchan->pend_list, struct pl08x_txd, node); list_del(&next->node); @@ -1736,7 +1736,7 @@ static int pl08x_dma_init_virtual_channels(struct pl08x_driver_data *pl08x, chan->lc = 0; spin_lock_init(&chan->lock); - INIT_LIST_HEAD(&chan->desc_list); + INIT_LIST_HEAD(&chan->pend_list); tasklet_init(&chan->tasklet, pl08x_tasklet, (unsigned long) chan); diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 4e9e71860e1c..08a9024e2d2f 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -158,7 +158,7 @@ enum pl08x_dma_chan_state { * @runtime_direction: current direction of this channel according to * runtime config * @lc: last completed transaction on this channel - * @desc_list: queued transactions pending on this channel + * @pend_list: queued 
transactions pending on this channel * @at: active transaction on this channel * @lockflags: sometimes we let a lock last between two function calls, * especially prep/submit, and then we need to store the IRQ flags @@ -179,7 +179,7 @@ struct pl08x_dma_chan { dma_addr_t runtime_addr; enum dma_data_direction runtime_direction; dma_cookie_t lc; - struct list_head desc_list; + struct list_head pend_list; struct pl08x_txd *at; unsigned long lockflags; spinlock_t lock; -- cgit v1.2.3-71-gd317 From 8087aacda040bdbf84940712d132ce80c30b9d5d Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:45:17 +0000 Subject: ARM: PL08x: introduce 'phychan_hold' to hold on to physical channels Introduce 'phychan_hold' to hold on to physical DMA channels while we're preparing a new descriptor for it. This will be incremented when we allocate a physical channel and set the MUX registers during the preparation of the TXD, and will only be decremented when the TXD is submitted. This prevents the physical channel being given up before the new TXD is placed on the queue. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 11 +++++++++++ include/linux/amba/pl08x.h | 3 +++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 650e2bbc7aad..bf6f7d02c9f6 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -959,6 +959,7 @@ static int prep_phy_channel(struct pl08x_dma_chan *plchan, ch->signal, plchan->name); + plchan->phychan_hold++; plchan->phychan = ch; return 0; @@ -998,6 +999,8 @@ static dma_cookie_t pl08x_tx_submit(struct dma_async_tx_descriptor *tx) /* Do this memcpy whenever there is a channel ready */ plchan->state = PL08X_CHAN_WAITING; plchan->waiting = txd; + } else { + plchan->phychan_hold--; } /* This unlock follows the lock in the prep() function */ @@ -1585,6 +1588,7 @@ static void pl08x_tasklet(unsigned long data) */ plchan->lc = txd->tx.cookie; } + /* * If a new descriptor is queued, set it up * plchan->at is NULL here @@ -1598,6 +1602,12 @@ static void pl08x_tasklet(unsigned long data) list_del(&next->node); pl08x_start_txd(plchan, next); + } else if (plchan->phychan_hold) { + /* + * This channel is still in use - we have a new txd being + * prepared and will soon be queued. Don't give up the + * physical channel. 
+ */ } else { struct pl08x_dma_chan *waiting = NULL; @@ -1625,6 +1635,7 @@ static void pl08x_tasklet(unsigned long data) ret = prep_phy_channel(waiting, waiting->waiting); BUG_ON(ret); + waiting->phychan_hold--; waiting->state = PL08X_CHAN_RUNNING; waiting->waiting = NULL; pl08x_issue_pending(&waiting->chan); diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 08a9024e2d2f..95b76ea1829f 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -151,6 +151,8 @@ enum pl08x_dma_chan_state { * struct pl08x_dma_chan - this structure wraps a DMA ENGINE channel * @chan: wrappped abstract channel * @phychan: the physical channel utilized by this channel, if there is one + * @phychan_hold: if non-zero, hold on to the physical channel even if we + * have no pending entries * @tasklet: tasklet scheduled by the IRQ to handle actual work etc * @name: name of channel * @cd: channel platform data @@ -173,6 +175,7 @@ enum pl08x_dma_chan_state { struct pl08x_dma_chan { struct dma_chan chan; struct pl08x_phy_chan *phychan; + int phychan_hold; struct tasklet_struct tasklet; char *name; struct pl08x_channel_data *cd; -- cgit v1.2.3-71-gd317 From c370e594efe2993620d24d41a78f325102e99d1c Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:45:37 +0000 Subject: ARM: PL08x: fix locking between prepare function and submit function The PL08x driver holds on to the channel lock with interrupts disabled between the prepare and the subsequent submit API functions. This means that the locking state when the prepare function returns is dependent on whether it suceeeds or not. It did this to ensure that the physical channel wasn't released, and as it used to add the descriptor onto the pending list at prepare time rather than submit time. Now that we have reorganized the code to remove those reasons, we can now safely release the spinlock at the end of preparation and reacquire it in our submit function. 
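In outline, with invented names standing in for the real functions in the diff below, both halves now take and drop the channel lock themselves, so the lock state on return from the prepare step no longer depends on whether it succeeded:

#include <linux/spinlock.h>

struct example_chan {
        spinlock_t lock;
        /* pending list, state, ... */
};

static int example_prep(struct example_chan *c)
{
        unsigned long flags;

        spin_lock_irqsave(&c->lock, flags);
        /* reserve/hold the physical channel, update channel state */
        spin_unlock_irqrestore(&c->lock, flags);        /* released on every path */
        return 0;
}

static int example_submit(struct example_chan *c)
{
        unsigned long flags;

        spin_lock_irqsave(&c->lock, flags);             /* reacquired independently */
        /* assign the cookie, queue the descriptor */
        spin_unlock_irqrestore(&c->lock, flags);
        return 0;
}

The earlier scheme returned from prepare with the lock still held on success but released on failure; holding the physical channel via the phychan_hold counter from the previous patch is what makes dropping the lock here safe.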
Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 28 +++++++++------------------- include/linux/amba/pl08x.h | 4 ---- 2 files changed, 9 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index bf6f7d02c9f6..1c9f712520d6 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -981,6 +981,9 @@ static dma_cookie_t pl08x_tx_submit(struct dma_async_tx_descriptor *tx) { struct pl08x_dma_chan *plchan = to_pl08x_chan(tx->chan); struct pl08x_txd *txd = to_pl08x_txd(tx); + unsigned long flags; + + spin_lock_irqsave(&plchan->lock, flags); plchan->chan.cookie += 1; if (plchan->chan.cookie < 0) @@ -1003,8 +1006,7 @@ static dma_cookie_t pl08x_tx_submit(struct dma_async_tx_descriptor *tx) plchan->phychan_hold--; } - /* This unlock follows the lock in the prep() function */ - spin_unlock_irqrestore(&plchan->lock, plchan->lockflags); + spin_unlock_irqrestore(&plchan->lock, flags); return tx->cookie; } @@ -1225,9 +1227,9 @@ static void pl08x_issue_pending(struct dma_chan *chan) static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, struct pl08x_txd *txd) { - int num_llis; struct pl08x_driver_data *pl08x = plchan->host; - int ret; + unsigned long flags; + int num_llis, ret; num_llis = pl08x_fill_llis_for_desc(pl08x, txd); if (!num_llis) { @@ -1235,7 +1237,7 @@ static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, return -EINVAL; } - spin_lock_irqsave(&plchan->lock, plchan->lockflags); + spin_lock_irqsave(&plchan->lock, flags); /* * See if we already have a physical channel allocated, @@ -1258,7 +1260,7 @@ static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, if (plchan->slave) { pl08x_free_txd_list(pl08x, plchan); pl08x_free_txd(pl08x, txd); - spin_unlock_irqrestore(&plchan->lock, plchan->lockflags); + spin_unlock_irqrestore(&plchan->lock, flags); return -EBUSY; } } else @@ -1272,11 +1274,7 @@ static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, if (plchan->state == PL08X_CHAN_IDLE) plchan->state = PL08X_CHAN_PAUSED; - /* - * Notice that we leave plchan->lock locked on purpose: - * it will be unlocked in the subsequent tx_submit() - * call. This is a consequence of the current API. - */ + spin_unlock_irqrestore(&plchan->lock, flags); return 0; } @@ -1355,10 +1353,6 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( ret = pl08x_prep_channel_resources(plchan, txd); if (ret) return NULL; - /* - * NB: the channel lock is held at this point so tx_submit() - * must be called in direct succession. - */ return &txd->tx; } @@ -1444,10 +1438,6 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( ret = pl08x_prep_channel_resources(plchan, txd); if (ret) return NULL; - /* - * NB: the channel lock is held at this point so tx_submit() - * must be called in direct succession. 
- */ return &txd->tx; } diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 95b76ea1829f..933b4ed12be5 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -162,9 +162,6 @@ enum pl08x_dma_chan_state { * @lc: last completed transaction on this channel * @pend_list: queued transactions pending on this channel * @at: active transaction on this channel - * @lockflags: sometimes we let a lock last between two function calls, - * especially prep/submit, and then we need to store the IRQ flags - * in the channel state, here * @lock: a lock for this channel data * @host: a pointer to the host (internal use) * @state: whether the channel is idle, paused, running etc @@ -184,7 +181,6 @@ struct pl08x_dma_chan { dma_cookie_t lc; struct list_head pend_list; struct pl08x_txd *at; - unsigned long lockflags; spinlock_t lock; struct pl08x_driver_data *host; enum pl08x_dma_chan_state state; -- cgit v1.2.3-71-gd317 From 8d608aa6295242fe4c4b6105b8c59c6a5b232d89 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 7 Dec 2010 08:57:57 +1000 Subject: vga_switcheroo: add reprobe hook for fbcon to recheck connected outputs. This adds a hook after the mux is switched for the driver to reprobe the connected outputs. Signed-off-by: Dave Airlie --- drivers/gpu/drm/i915/i915_dma.c | 1 + drivers/gpu/drm/nouveau/nouveau_state.c | 7 +++++++ drivers/gpu/drm/radeon/radeon_device.c | 1 + drivers/gpu/vga/vga_switcheroo.c | 6 ++++++ include/linux/vga_switcheroo.h | 1 + 5 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 8cb7f93732c2..ec1f650f6fab 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1231,6 +1231,7 @@ static int i915_load_modeset_init(struct drm_device *dev) ret = vga_switcheroo_register_client(dev->pdev, i915_switcheroo_set_state, + NULL, i915_switcheroo_can_switch); if (ret) goto cleanup_vga_client; diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c index db4b410f4a8c..1b87eee22fa9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_state.c +++ b/drivers/gpu/drm/nouveau/nouveau_state.c @@ -605,6 +605,12 @@ static void nouveau_switcheroo_set_state(struct pci_dev *pdev, } } +static void nouveau_switcheroo_reprobe(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + nouveau_fbcon_output_poll_changed(dev); +} + static bool nouveau_switcheroo_can_switch(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); @@ -625,6 +631,7 @@ nouveau_card_init(struct drm_device *dev) vga_client_register(dev->pdev, dev, NULL, nouveau_vga_set_decode); vga_switcheroo_register_client(dev->pdev, nouveau_switcheroo_set_state, + nouveau_switcheroo_reprobe, nouveau_switcheroo_can_switch); /* Initialise internal driver API hooks */ diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 96f0eb484ce6..1a1017f0d9db 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -779,6 +779,7 @@ int radeon_device_init(struct radeon_device *rdev, vga_client_register(rdev->pdev, rdev, NULL, radeon_vga_set_decode); vga_switcheroo_register_client(rdev->pdev, radeon_switcheroo_set_state, + NULL, radeon_switcheroo_can_switch); r = radeon_init(rdev); diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c index 463691a1650e..2b8e1d25e8d0 100644 --- a/drivers/gpu/vga/vga_switcheroo.c +++ 
b/drivers/gpu/vga/vga_switcheroo.c @@ -33,6 +33,7 @@ struct vga_switcheroo_client { struct fb_info *fb_info; int pwr_state; void (*set_gpu_state)(struct pci_dev *pdev, enum vga_switcheroo_state); + void (*reprobe)(struct pci_dev *pdev); bool (*can_switch)(struct pci_dev *pdev); int id; bool active; @@ -103,6 +104,7 @@ static void vga_switcheroo_enable(void) int vga_switcheroo_register_client(struct pci_dev *pdev, void (*set_gpu_state)(struct pci_dev *pdev, enum vga_switcheroo_state), + void (*reprobe)(struct pci_dev *pdev), bool (*can_switch)(struct pci_dev *pdev)) { int index; @@ -117,6 +119,7 @@ int vga_switcheroo_register_client(struct pci_dev *pdev, vgasr_priv.clients[index].pwr_state = VGA_SWITCHEROO_ON; vgasr_priv.clients[index].pdev = pdev; vgasr_priv.clients[index].set_gpu_state = set_gpu_state; + vgasr_priv.clients[index].reprobe = reprobe; vgasr_priv.clients[index].can_switch = can_switch; vgasr_priv.clients[index].id = -1; if (pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW) @@ -251,6 +254,9 @@ static int vga_switchto(struct vga_switcheroo_client *new_client) if (ret) return ret; + if (new_client->reprobe) + new_client->reprobe(new_client->pdev); + if (active->pwr_state == VGA_SWITCHEROO_ON) vga_switchoff(active); diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h index ae9ab13b963d..f80fa9de5b82 100644 --- a/include/linux/vga_switcheroo.h +++ b/include/linux/vga_switcheroo.h @@ -33,6 +33,7 @@ struct vga_switcheroo_handler { void vga_switcheroo_unregister_client(struct pci_dev *dev); int vga_switcheroo_register_client(struct pci_dev *dev, void (*set_gpu_state)(struct pci_dev *dev, enum vga_switcheroo_state), + void (*reprobe)(struct pci_dev *dev), bool (*can_switch)(struct pci_dev *dev)); void vga_switcheroo_client_fb_set(struct pci_dev *dev, -- cgit v1.2.3-71-gd317 From e4a683c899cd5a49f8d684a054c95bd115a0c005 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 5 Jan 2011 16:57:37 +0100 Subject: kref: add kref_test_and_get Add kref_test_and_get() function, which atomically add a reference only if refcount is not zero. This prevent to add a reference to an object that is already being removed. Signed-off-by: Jerome Marchand Cc: stable@kernel.org Signed-off-by: Jens Axboe --- include/linux/kref.h | 1 + lib/kref.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kref.h b/include/linux/kref.h index 6cc38fc07ab7..90b9e44abf54 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -23,6 +23,7 @@ struct kref { void kref_init(struct kref *kref); void kref_get(struct kref *kref); +int kref_test_and_get(struct kref *kref); int kref_put(struct kref *kref, void (*release) (struct kref *kref)); #endif /* _KREF_H_ */ diff --git a/lib/kref.c b/lib/kref.c index d3d227a08a4b..e7a6e1067122 100644 --- a/lib/kref.c +++ b/lib/kref.c @@ -36,6 +36,18 @@ void kref_get(struct kref *kref) smp_mb__after_atomic_inc(); } +/** + * kref_test_and_get - increment refcount for object only if refcount is not + * zero. + * @kref: object. + * + * Return non-zero if the refcount was incremented, 0 otherwise + */ +int kref_test_and_get(struct kref *kref) +{ + return atomic_inc_not_zero(&kref->refcount); +} + /** * kref_put - decrement refcount for object. * @kref: object. 
-- cgit v1.2.3-71-gd317 From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 5 Jan 2011 16:57:38 +0100 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. Also add a refcount to struct hd_struct to keep the partition in memory as long as users exist. We use kref_test_and_get() to ensure we don't add a reference to a partition which is going away. Signed-off-by: Jerome Marchand Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 26 +++++++++++++++++++++----- block/blk-merge.c | 3 ++- block/genhd.c | 1 + fs/partitions/check.c | 10 +++++++++- include/linux/blkdev.h | 1 + include/linux/genhd.h | 2 ++ 6 files changed, 36 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 3689319a5974..500c080a6a6b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!kref_test_and_get(&part->ref)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. 
+ */ + part = &rq->rq_disk->part0; + kref_get(&part->ref); + } part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b5..b06b83b89d89 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 16ccc0d2d5d3..85c150598830 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; + kref_init(&disk->part0.ref); disk->minors = minors; rand_initialize_disk(disk); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index bdf8d3cc95a4..48209f58522b 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -381,6 +381,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } +void __delete_partition(struct kref *ref) +{ + struct hd_struct *part = container_of(ref, struct hd_struct, ref); + + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; @@ -399,7 +406,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + kref_put(&part->ref, __delete_partition); } static ssize_t whole_disk_show(struct device *dev, @@ -498,6 +505,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); + kref_init(&p->ref); return p; out_free_info: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..482a7fd48831 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -115,6 +115,7 @@ struct request { void *elevator_private3; struct gendisk *rq_disk; + struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1644e4..2ba2792a3dd4 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -116,6 +116,7 @@ struct hd_struct { struct disk_stats dkstats; #endif struct rcu_head rcu_head; + struct kref ref; }; #define 
GENHD_FL_REMOVABLE 1 @@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); +extern void __delete_partition(struct kref *ref); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -- cgit v1.2.3-71-gd317 From b27dcfb0670ea7352a67137f4ff7947c2a9f6892 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Wed, 17 Nov 2010 22:56:48 -0500 Subject: [libata] avoid needlessly passing around ptr to SCSI completion func It's stored in struct scsi_cmnd->scsi_done, making several 'done' parameters to functions redundant. Signed-off-by: Jeff Garzik --- drivers/ata/libata-scsi.c | 60 ++++++++++++++++--------------------- drivers/scsi/ipr.c | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 3 +- include/linux/libata.h | 6 ++-- 4 files changed, 29 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 66aa4bee80a6..5defc74973d7 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -346,12 +346,11 @@ struct device_attribute *ata_common_sdev_attrs[] = { }; EXPORT_SYMBOL_GPL(ata_common_sdev_attrs); -static void ata_scsi_invalid_field(struct scsi_cmnd *cmd, - void (*done)(struct scsi_cmnd *)) +static void ata_scsi_invalid_field(struct scsi_cmnd *cmd) { ata_scsi_set_sense(cmd, ILLEGAL_REQUEST, 0x24, 0x0); /* "Invalid field in cbd" */ - done(cmd); + cmd->scsi_done(cmd); } /** @@ -719,7 +718,6 @@ EXPORT_SYMBOL_GPL(ata_scsi_ioctl); * ata_scsi_qc_new - acquire new ata_queued_cmd reference * @dev: ATA device to which the new command is attached * @cmd: SCSI command that originated this ATA command - * @done: SCSI command completion function * * Obtain a reference to an unused ata_queued_cmd structure, * which is the basic libata structure representing a single @@ -736,21 +734,20 @@ EXPORT_SYMBOL_GPL(ata_scsi_ioctl); * Command allocated, or %NULL if none available. */ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, - struct scsi_cmnd *cmd, - void (*done)(struct scsi_cmnd *)) + struct scsi_cmnd *cmd) { struct ata_queued_cmd *qc; qc = ata_qc_new_init(dev); if (qc) { qc->scsicmd = cmd; - qc->scsidone = done; + qc->scsidone = cmd->scsi_done; qc->sg = scsi_sglist(cmd); qc->n_elem = scsi_sg_count(cmd); } else { cmd->result = (DID_OK << 16) | (QUEUE_FULL << 1); - done(cmd); + cmd->scsi_done(cmd); } return qc; @@ -1735,7 +1732,6 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc) * ata_scsi_translate - Translate then issue SCSI command to ATA device * @dev: ATA device to which the command is addressed * @cmd: SCSI command to execute - * @done: SCSI command completion function * @xlat_func: Actor which translates @cmd to an ATA taskfile * * Our ->queuecommand() function has decided that the SCSI @@ -1759,7 +1755,6 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc) * needs to be deferred. 
*/ static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd, - void (*done)(struct scsi_cmnd *), ata_xlat_func_t xlat_func) { struct ata_port *ap = dev->link->ap; @@ -1768,7 +1763,7 @@ static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd, VPRINTK("ENTER\n"); - qc = ata_scsi_qc_new(dev, cmd, done); + qc = ata_scsi_qc_new(dev, cmd); if (!qc) goto err_mem; @@ -1804,14 +1799,14 @@ static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd, early_finish: ata_qc_free(qc); - qc->scsidone(cmd); + cmd->scsi_done(cmd); DPRINTK("EXIT - early finish (good or error)\n"); return 0; err_did: ata_qc_free(qc); cmd->result = (DID_ERROR << 16); - qc->scsidone(cmd); + cmd->scsi_done(cmd); err_mem: DPRINTK("EXIT - internal\n"); return 0; @@ -3116,7 +3111,6 @@ static inline void ata_scsi_dump_cdb(struct ata_port *ap, } static inline int __ata_scsi_queuecmd(struct scsi_cmnd *scmd, - void (*done)(struct scsi_cmnd *), struct ata_device *dev) { u8 scsi_op = scmd->cmnd[0]; @@ -3150,9 +3144,9 @@ static inline int __ata_scsi_queuecmd(struct scsi_cmnd *scmd, } if (xlat_func) - rc = ata_scsi_translate(dev, scmd, done, xlat_func); + rc = ata_scsi_translate(dev, scmd, xlat_func); else - ata_scsi_simulate(dev, scmd, done); + ata_scsi_simulate(dev, scmd); return rc; @@ -3160,7 +3154,7 @@ static inline int __ata_scsi_queuecmd(struct scsi_cmnd *scmd, DPRINTK("bad CDB len=%u, scsi_op=0x%02x, max=%u\n", scmd->cmd_len, scsi_op, dev->cdb_len); scmd->result = DID_ERROR << 16; - done(scmd); + scmd->scsi_done(scmd); return 0; } @@ -3199,7 +3193,7 @@ int ata_scsi_queuecmd(struct Scsi_Host *shost, struct scsi_cmnd *cmd) dev = ata_scsi_find_dev(ap, scsidev); if (likely(dev)) - rc = __ata_scsi_queuecmd(cmd, cmd->scsi_done, dev); + rc = __ata_scsi_queuecmd(cmd, dev); else { cmd->result = (DID_BAD_TARGET << 16); cmd->scsi_done(cmd); @@ -3214,7 +3208,6 @@ int ata_scsi_queuecmd(struct Scsi_Host *shost, struct scsi_cmnd *cmd) * ata_scsi_simulate - simulate SCSI command on ATA device * @dev: the target device * @cmd: SCSI command being sent to device. - * @done: SCSI command completion function. * * Interprets and directly executes a select list of SCSI commands * that can be handled internally. @@ -3223,8 +3216,7 @@ int ata_scsi_queuecmd(struct Scsi_Host *shost, struct scsi_cmnd *cmd) * spin_lock_irqsave(host lock) */ -void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, - void (*done)(struct scsi_cmnd *)) +void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) { struct ata_scsi_args args; const u8 *scsicmd = cmd->cmnd; @@ -3233,17 +3225,17 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, args.dev = dev; args.id = dev->id; args.cmd = cmd; - args.done = done; + args.done = cmd->scsi_done; switch(scsicmd[0]) { /* TODO: worth improving? */ case FORMAT_UNIT: - ata_scsi_invalid_field(cmd, done); + ata_scsi_invalid_field(cmd); break; case INQUIRY: if (scsicmd[1] & 2) /* is CmdDt set? */ - ata_scsi_invalid_field(cmd, done); + ata_scsi_invalid_field(cmd); else if ((scsicmd[1] & 1) == 0) /* is EVPD clear? 
*/ ata_scsi_rbuf_fill(&args, ata_scsiop_inq_std); else switch (scsicmd[2]) { @@ -3269,7 +3261,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2); break; default: - ata_scsi_invalid_field(cmd, done); + ata_scsi_invalid_field(cmd); break; } break; @@ -3281,7 +3273,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, case MODE_SELECT: /* unconditionally return */ case MODE_SELECT_10: /* bad-field-in-cdb */ - ata_scsi_invalid_field(cmd, done); + ata_scsi_invalid_field(cmd); break; case READ_CAPACITY: @@ -3292,7 +3284,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, if ((scsicmd[1] & 0x1f) == SAI_READ_CAPACITY_16) ata_scsi_rbuf_fill(&args, ata_scsiop_read_cap); else - ata_scsi_invalid_field(cmd, done); + ata_scsi_invalid_field(cmd); break; case REPORT_LUNS: @@ -3302,7 +3294,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, case REQUEST_SENSE: ata_scsi_set_sense(cmd, 0, 0, 0); cmd->result = (DRIVER_SENSE << 24); - done(cmd); + cmd->scsi_done(cmd); break; /* if we reach this, then writeback caching is disabled, @@ -3324,14 +3316,14 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, if ((tmp8 == 0x4) && (!scsicmd[3]) && (!scsicmd[4])) ata_scsi_rbuf_fill(&args, ata_scsiop_noop); else - ata_scsi_invalid_field(cmd, done); + ata_scsi_invalid_field(cmd); break; /* all other commands */ default: ata_scsi_set_sense(cmd, ILLEGAL_REQUEST, 0x20, 0x0); /* "Invalid command operation code" */ - done(cmd); + cmd->scsi_done(cmd); break; } } @@ -3858,7 +3850,6 @@ EXPORT_SYMBOL_GPL(ata_sas_slave_configure); /** * ata_sas_queuecmd - Issue SCSI cdb to libata-managed device * @cmd: SCSI command to be sent - * @done: Completion function, called when command is complete * @ap: ATA port to which the command is being sent * * RETURNS: @@ -3866,18 +3857,17 @@ EXPORT_SYMBOL_GPL(ata_sas_slave_configure); * 0 otherwise. 
*/ -int ata_sas_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *), - struct ata_port *ap) +int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap) { int rc = 0; ata_scsi_dump_cdb(ap, cmd); if (likely(ata_dev_enabled(ap->link.device))) - rc = __ata_scsi_queuecmd(cmd, done, ap->link.device); + rc = __ata_scsi_queuecmd(cmd, ap->link.device); else { cmd->result = (DID_BAD_TARGET << 16); - done(cmd); + cmd->scsi_done(cmd); } return rc; } diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 5bbaee597e88..2c71927aad64 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -5743,7 +5743,7 @@ static int ipr_queuecommand_lck(struct scsi_cmnd *scsi_cmd, } if (ipr_is_gata(res) && res->sata_port) - return ata_sas_queuecmd(scsi_cmd, done, res->sata_port->ap); + return ata_sas_queuecmd(scsi_cmd, res->sata_port->ap); ipr_cmd = ipr_get_free_ipr_cmnd(ioa_cfg); ioarcb = &ipr_cmd->ioarcb; diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 29251fabecc6..5815cbeb27a6 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -211,8 +211,7 @@ static int sas_queuecommand_lck(struct scsi_cmnd *cmd, unsigned long flags; spin_lock_irqsave(dev->sata_dev.ap->lock, flags); - res = ata_sas_queuecmd(cmd, scsi_done, - dev->sata_dev.ap); + res = ata_sas_queuecmd(cmd, dev->sata_dev.ap); spin_unlock_irqrestore(dev->sata_dev.ap->lock, flags); goto out; } diff --git a/include/linux/libata.h b/include/linux/libata.h index d947b1231662..c9c5d7ad1a2b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -996,8 +996,7 @@ extern int ata_sas_port_init(struct ata_port *); extern int ata_sas_port_start(struct ata_port *ap); extern void ata_sas_port_stop(struct ata_port *ap); extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *); -extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *), - struct ata_port *ap); +extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); extern int sata_scr_valid(struct ata_link *link); extern int sata_scr_read(struct ata_link *link, int reg, u32 *val); extern int sata_scr_write(struct ata_link *link, int reg, u32 val); @@ -1040,8 +1039,7 @@ extern unsigned int ata_do_dev_read_id(struct ata_device *dev, struct ata_taskfile *tf, u16 *id); extern void ata_qc_complete(struct ata_queued_cmd *qc); extern int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active); -extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd, - void (*done)(struct scsi_cmnd *)); +extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]); -- cgit v1.2.3-71-gd317 From af5dd83b873efd4e1477f2265b6fa15a825aff26 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 6 Jan 2011 13:04:32 +1000 Subject: vga_switcheroo: fix build with non switcheroo enabled path. 
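For context, a hedged illustration (editorial, not part of the original commit): the build failure is assumed to come from the CONFIG_VGA_SWITCHEROO=n stub lagging behind the enabled-side prototype, which is assumed to have already gained a reprobe callback in an earlier change.

/* Hedged sketch only; the enabled-side declaration here is an assumption. */
#if defined(CONFIG_VGA_SWITCHEROO)
int vga_switcheroo_register_client(struct pci_dev *dev,
	void (*set_gpu_state)(struct pci_dev *dev, enum vga_switcheroo_state),
	void (*reprobe)(struct pci_dev *dev),
	bool (*can_switch)(struct pci_dev *dev));
#else
/* Without the reprobe argument, call sites that now pass the extra hook
 * fail to compile when switcheroo is disabled; the one-liner below adds it. */
static inline int vga_switcheroo_register_client(struct pci_dev *dev,
	void (*set_gpu_state)(struct pci_dev *dev, enum vga_switcheroo_state),
	void (*reprobe)(struct pci_dev *dev),
	bool (*can_switch)(struct pci_dev *dev)) { return 0; }
#endif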
Signed-off-by: Dave Airlie --- include/linux/vga_switcheroo.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h index f80fa9de5b82..4b9a7f596f92 100644 --- a/include/linux/vga_switcheroo.h +++ b/include/linux/vga_switcheroo.h @@ -49,6 +49,7 @@ int vga_switcheroo_process_delayed_switch(void); static inline void vga_switcheroo_unregister_client(struct pci_dev *dev) {} static inline int vga_switcheroo_register_client(struct pci_dev *dev, void (*set_gpu_state)(struct pci_dev *dev, enum vga_switcheroo_state), + void (*reprobe)(struct pci_dev *dev), bool (*can_switch)(struct pci_dev *dev)) { return 0; } static inline void vga_switcheroo_client_fb_set(struct pci_dev *dev, struct fb_info *info) {} static inline int vga_switcheroo_register_handler(struct vga_switcheroo_handler *handler) { return 0; } -- cgit v1.2.3-71-gd317 From cf24dc85ff29a41abd8e73730e5feb22b2666bd3 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 19 Feb 2010 15:39:52 +0100 Subject: mtd: OneNAND: add enable / disable methods to onenand_chip Add enable / disable methods called from get_device() / release_device(). These can be used, for example, to allow the driver to prevent the voltage regulator from being put to sleep while OneNAND is in use. Signed-off-by: Adrian Hunter Signed-off-by: David Woodhouse --- drivers/mtd/onenand/onenand_base.c | 4 ++++ include/linux/mtd/onenand.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 2d7c90de3144..2edef5868ace 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -948,6 +948,8 @@ static int onenand_get_device(struct mtd_info *mtd, int new_state) if (this->state == FL_READY) { this->state = new_state; spin_unlock(&this->chip_lock); + if (new_state != FL_PM_SUSPENDED && this->enable) + this->enable(mtd); break; } if (new_state == FL_PM_SUSPENDED) { @@ -974,6 +976,8 @@ static void onenand_release_device(struct mtd_info *mtd) { struct onenand_chip *this = mtd->priv; + if (this->state != FL_PM_SUSPENDED && this->disable) + this->disable(mtd); /* Release the chip */ spin_lock(&this->chip_lock); this->state = FL_READY; diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 6da3fe314828..ae418e41d8f5 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -118,6 +118,8 @@ struct onenand_chip { int (*chip_probe)(struct mtd_info *mtd); int (*block_markbad)(struct mtd_info *mtd, loff_t ofs); int (*scan_bbt)(struct mtd_info *mtd); + int (*enable)(struct mtd_info *mtd); + int (*disable)(struct mtd_info *mtd); struct completion complete; int irq; -- cgit v1.2.3-71-gd317 From 0e4ca7e5101e7f4054452b8d71c535eec64a187b Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Thu, 16 Dec 2010 23:42:14 +0100 Subject: mtd: add writebufsize field to mtd_info struct This field will be used to indicate the write buffer size of the MTD device. UBI will set its minimal I/O unit size (min_io_size) to the indicated write buffer size. By this change we intend to fix failed recovery of UBIFS partitions we currently observe on NOR flash when mounting the partition after unclean unmount. Currently the min_io_size is set to mtd->writesize (which is 1 byte for NOR flash).
But flash programming is often done from a prepared write buffer containing multiple bytes and is performed in one programming operation which could be interrupted by a power cut or a system reset causing corrupted (partially written) areas in a flash sector. Knowing the size of potentially corrupted areas, UBIFS scanning and recovery algorithms are able to perform successful recovery. In the case of NOR flash, the minimal I/O size must be equal to the maximal size of the write buffer used by the embedded flash programming algorithm. In the case of NAND flash, mtd->writebufsize should be equivalent to mtd->writesize. The subsequent patches will add mtd->writebufsize initialization where needed. Signed-off-by: Anatolij Gustschin Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/mtd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index fe8d77ebec13..9d5306bad117 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -144,6 +144,17 @@ struct mtd_info { */ uint32_t writesize; + /* + * Size of the write buffer used by the MTD. MTD devices having a write + * buffer can write multiple writesize chunks at a time. E.g. while + * writing 4 * writesize bytes to a device with 2 * writesize bytes + * buffer the MTD driver can (but doesn't have to) do 2 writesize + * operations, but not 4. Currently, all NANDs have writebufsize + * equivalent to writesize (NAND page size). Some NOR flashes do have + * writebufsize greater than writesize. + */ + uint32_t writebufsize; + uint32_t oobsize; // Amount of OOB data per block (e.g. 16) uint32_t oobavail; // Available OOB bytes per block -- cgit v1.2.3-71-gd317 From 1f11a034cdc4b45ee56d51b87a9e37cb776fb15b Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 6 Jan 2011 02:04:26 +0000 Subject: SUNRPC new transport for the NFSv4.1 shared back channel Move the current sock create and destroy routines into the new transport ops. The back channel socket will be destroyed by the svc_close_all call in svc_destroy. Added check: only TCP is supported on the shared back channel.
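A hedged usage sketch (editorial, not part of the patch): once the "tcp-bc" class defined in the diff below is registered with the svc_xprt layer, a caller can create the shared back channel by class name. The call site, address family, and flags here are illustrative assumptions.

/*
 * Illustrative only: svc_bc_create_socket() rejects anything but TCP
 * with -EINVAL, so only a TCP-backed "tcp-bc" transport can be created.
 */
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/svcsock.h>
#include <net/net_namespace.h>

static int example_nfs41_bc_up(struct svc_serv *serv)
{
	int ret;

	ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
			      SVC_SOCK_ANONYMOUS);
	return ret < 0 ? ret : 0;
}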
Signed-off-by: Andy Adamson Acked-by: Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svcsock.h | 1 + net/sunrpc/svc.c | 4 --- net/sunrpc/svcsock.c | 82 +++++++++++++++++++++++++++++++----------- 3 files changed, 63 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 1b353a76c304..3a45a80760b9 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -45,6 +45,7 @@ int svc_sock_names(struct svc_serv *serv, char *buf, int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, const size_t len); void svc_init_xprt_sock(void); +void svc_init_bc_xprt_sock(void); void svc_cleanup_xprt_sock(void); struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot); void svc_sock_destroy(struct svc_xprt *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 606d182895a9..261e2d1dff10 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -488,10 +488,6 @@ svc_destroy(struct svc_serv *serv) if (svc_serv_is_pooled(serv)) svc_pool_map_put(); -#if defined(CONFIG_NFS_V4_1) - svc_sock_destroy(serv->bc_xprt); -#endif /* CONFIG_NFS_V4_1 */ - svc_unregister(serv); kfree(serv->sv_pools); kfree(serv); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 07919e16be3e..6630f2922f40 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -66,6 +66,13 @@ static void svc_sock_free(struct svc_xprt *); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, struct net *, struct sockaddr *, int, int); +#if defined(CONFIG_NFS_V4_1) +static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, + struct net *, struct sockaddr *, + int, int); +static void svc_bc_sock_free(struct svc_xprt *xprt); +#endif /* CONFIG_NFS_V4_1 */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; static struct lock_class_key svc_slock_key[2]; @@ -1184,6 +1191,39 @@ static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); } +#if defined(CONFIG_NFS_V4_1) +static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, + struct net *, struct sockaddr *, + int, int); +static void svc_bc_sock_free(struct svc_xprt *xprt); + +static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + return svc_bc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); +} + +static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt) +{ +} + +static struct svc_xprt_ops svc_tcp_bc_ops = { + .xpo_create = svc_bc_tcp_create, + .xpo_detach = svc_bc_tcp_sock_detach, + .xpo_free = svc_bc_sock_free, + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, +}; + +static struct svc_xprt_class svc_tcp_bc_class = { + .xcl_name = "tcp-bc", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_tcp_bc_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; +#endif /* CONFIG_NFS_V4_1 */ + static struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, @@ -1509,41 +1549,43 @@ static void svc_sock_free(struct svc_xprt *xprt) kfree(svsk); } +#if defined(CONFIG_NFS_V4_1) /* - * Create a svc_xprt. - * - * For internal use only (e.g. nfsv4.1 backchannel). - * Callers should typically use the xpo_create() method. + * Create a back channel svc_xprt which shares the fore channel socket. 
*/ -struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot) +static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv, + int protocol, + struct net *net, + struct sockaddr *sin, int len, + int flags) { struct svc_sock *svsk; - struct svc_xprt *xprt = NULL; + struct svc_xprt *xprt; + + if (protocol != IPPROTO_TCP) { + printk(KERN_WARNING "svc: only TCP sockets" + " supported on shared back channel\n"); + return ERR_PTR(-EINVAL); + } - dprintk("svc: %s\n", __func__); svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); if (!svsk) - goto out; + return ERR_PTR(-ENOMEM); xprt = &svsk->sk_xprt; - if (prot == IPPROTO_TCP) - svc_xprt_init(&svc_tcp_class, xprt, serv); - else if (prot == IPPROTO_UDP) - svc_xprt_init(&svc_udp_class, xprt, serv); - else - BUG(); -out: - dprintk("svc: %s return %p\n", __func__, xprt); + svc_xprt_init(&svc_tcp_bc_class, xprt, serv); + + serv->bc_xprt = xprt; + return xprt; } -EXPORT_SYMBOL_GPL(svc_sock_create); /* - * Destroy a svc_sock. + * Free a back channel svc_sock. */ -void svc_sock_destroy(struct svc_xprt *xprt) +static void svc_bc_sock_free(struct svc_xprt *xprt) { if (xprt) kfree(container_of(xprt, struct svc_sock, sk_xprt)); } -EXPORT_SYMBOL_GPL(svc_sock_destroy); +#endif /* CONFIG_NFS_V4_1 */ -- cgit v1.2.3-71-gd317 From 16b2d1e1d12de000404d7c845d0db1226511f84d Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 6 Jan 2011 02:04:27 +0000 Subject: SUNRPC register and unregister the back channel transport Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svcsock.h | 1 - net/sunrpc/svcsock.c | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 3a45a80760b9..1b353a76c304 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -45,7 +45,6 @@ int svc_sock_names(struct svc_serv *serv, char *buf, int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, const size_t len); void svc_init_xprt_sock(void); -void svc_init_bc_xprt_sock(void); void svc_cleanup_xprt_sock(void); struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot); void svc_sock_destroy(struct svc_xprt *); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6630f2922f40..e6b66d81115e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1222,6 +1222,24 @@ static struct svc_xprt_class svc_tcp_bc_class = { .xcl_ops = &svc_tcp_bc_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, }; + +static void svc_init_bc_xprt_sock(void) +{ + svc_reg_xprt_class(&svc_tcp_bc_class); +} + +static void svc_cleanup_bc_xprt_sock(void) +{ + svc_unreg_xprt_class(&svc_tcp_bc_class); +} +#else /* CONFIG_NFS_V4_1 */ +static void svc_init_bc_xprt_sock(void) +{ +} + +static void svc_cleanup_bc_xprt_sock(void) +{ +} #endif /* CONFIG_NFS_V4_1 */ static struct svc_xprt_ops svc_tcp_ops = { @@ -1247,12 +1265,14 @@ void svc_init_xprt_sock(void) { svc_reg_xprt_class(&svc_tcp_class); svc_reg_xprt_class(&svc_udp_class); + svc_init_bc_xprt_sock(); } void svc_cleanup_xprt_sock(void) { svc_unreg_xprt_class(&svc_tcp_class); svc_unreg_xprt_class(&svc_udp_class); + svc_cleanup_bc_xprt_sock(); } static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) -- cgit v1.2.3-71-gd317 From f4eecd5da3422e82e88e36c33cbd2595eebcacb1 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 6 Jan 2011 02:04:30 +0000 Subject: NFS implement v4.0 callback_ident Use the small id to pointer translator service to 
provide a unique callback identifier per SETCLIENTID call used to identify the v4.0 callback service associated with the clientid. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/inode.c | 1 + fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 1 + include/linux/nfs_fs_sb.h | 1 + 5 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 855add62abc1..bc3a8620e8c3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock); static LIST_HEAD(nfs_client_list); static LIST_HEAD(nfs_volume_list); static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +#ifdef CONFIG_NFS_V4 +static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; +retry: + if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) + return -ENOMEM; + spin_lock(&nfs_client_lock); + ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nfs_client_lock); + if (ret == -EAGAIN) + goto retry; + return ret; +} +#endif /* CONFIG_NFS_V4 */ /* * RPC cruft for NFS @@ -144,6 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_proto = cl_init->proto; #ifdef CONFIG_NFS_V4 + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error_cleanup; + INIT_LIST_HEAD(&clp->cl_delegations); spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); @@ -202,10 +230,32 @@ static void nfs4_shutdown_client(struct nfs_client *clp) rpc_destroy_wait_queue(&clp->cl_rpcwaitq); } + +/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ +void nfs_cleanup_cb_ident_idr(void) +{ + idr_destroy(&cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + if (clp->cl_cb_ident) + idr_remove(&cb_ident_idr, clp->cl_cb_ident); +} + #else static void nfs4_shutdown_client(struct nfs_client *clp) { } + +void nfs_cleanup_cb_ident_idr(void) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_V4 */ /* @@ -244,6 +294,7 @@ void nfs_put_client(struct nfs_client *clp) if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { list_del(&clp->cl_share_link); + nfs_cb_idr_remove_locked(clp); spin_unlock(&nfs_client_lock); BUG_ON(!list_empty(&clp->cl_superblocks)); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e67e31c73416..c7782b278e8b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1612,6 +1612,7 @@ static void __exit exit_nfs_fs(void) #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif + nfs_cleanup_cb_ident_idr(); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 435eae3666bd..7c803c916574 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -128,6 +128,7 @@ extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern struct rpc_program nfs_program; +extern void nfs_cleanup_cb_ident_idr(void); extern void nfs_put_client(struct nfs_client *); extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); extern struct nfs_client 
*nfs_find_client_next(struct nfs_client *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 82f3a82b7115..e165c53db08f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3484,6 +3484,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, struct nfs4_setclientid setclientid = { .sc_verifier = &sc_verifier, .sc_prog = program, + .sc_cb_ident = clp->cl_cb_ident, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 452d96436d26..1eaa054a2c7d 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -71,6 +71,7 @@ struct nfs_client { */ char cl_ipaddr[48]; unsigned char cl_id_uniquifier; + u32 cl_cb_ident; /* v4.0 callback identifier */ const struct nfs4_minor_version_ops *cl_mvops; #endif /* CONFIG_NFS_V4 */ -- cgit v1.2.3-71-gd317 From 2c2618c6f29c41a0a966f14f05c8bf45fcabb750 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 6 Jan 2011 02:04:31 +0000 Subject: NFS associate sessionid with callback connection The sessions-based callback service is started prior to the CREATE_SESSION call so that it can handle CB_NULL requests which can be sent before the CREATE_SESSION call returns and the session ID is known. Set the callback sessionid after a successful CREATE_SESSION. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 31 +++++++++++++++++++++++++++++++ fs/nfs/callback.h | 1 + fs/nfs/nfs4state.c | 6 ++++++ include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svcsock.c | 4 +++- 5 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0e9fae831dfa..c0b05497972b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,6 +136,33 @@ out_err: } #if defined(CONFIG_NFS_V4_1) +/* + * * CB_SEQUENCE operations will fail until the callback sessionid is set.
+ * */ +int nfs4_set_callback_sessionid(struct nfs_client *clp) +{ + struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv; + struct nfs4_sessionid *bc_sid; + + if (!serv->bc_xprt) + return -EINVAL; + + /* on success freed in xprt_free */ + bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL); + if (!bc_sid) + return -ENOMEM; + memcpy(bc_sid->data, &clp->cl_session->sess_id.data, + NFS4_MAX_SESSIONID_LEN); + spin_lock_bh(&serv->sv_cb_lock); + serv->bc_xprt->xpt_bc_sid = bc_sid; + spin_unlock_bh(&serv->sv_cb_lock); + dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for bc_xprt %p\n", __func__, + ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1], + ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3], + serv->bc_xprt); + return 0; +} + /* * The callback service for NFSv4.1 callbacks */ @@ -241,6 +268,10 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct nfs_callback_data *cb_info) { } +int nfs4_set_callback_sessionid(struct nfs_client *clp) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 85a7cfd1b8dd..58d61a8ce8b9 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -137,6 +137,7 @@ extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion); extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp); #endif /* CONFIG_NFS_V4 */ /* * nfs41: Callbacks are expected to not cause substantial latency, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f575a3126737..485e95e8fd62 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -192,6 +192,12 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) status = nfs4_proc_create_session(clp); if (status != 0) goto out; + status = nfs4_set_callback_sessionid(clp); + if (status != 0) { + printk(KERN_WARNING "Sessionid not set. No callback service\n"); + nfs_callback_down(1); + status = 0; + } nfs41_setup_state_renewal(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index aea0d438e3c7..357da5e0daa3 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -78,6 +78,7 @@ struct svc_xprt { size_t xpt_remotelen; /* length of address */ struct rpc_wait_queue xpt_bc_pending; /* backchannel wait queue */ struct list_head xpt_users; /* callbacks on free */ + void *xpt_bc_sid; /* back channel session ID */ struct net *xpt_net; }; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e6b66d81115e..db3013e4aa04 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1605,7 +1605,9 @@ static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv, */ static void svc_bc_sock_free(struct svc_xprt *xprt) { - if (xprt) + if (xprt) { + kfree(xprt->xpt_bc_sid); kfree(container_of(xprt, struct svc_sock, sk_xprt)); + } } #endif /* CONFIG_NFS_V4_1 */ -- cgit v1.2.3-71-gd317 From c36fca52f5e4594ffd0ff175b328966b0d393184 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 6 Jan 2011 02:04:32 +0000 Subject: NFS refactor nfs_find_client and reference client across callback processing Fixes a bug where the nfs_client could be freed during callback processing. Refactor nfs_find_client to use minorversion specific means to locate the correct nfs_client structure. 
In the NFS layer, V4.0 clients are found using the callback_ident field in the CB_COMPOUND header. V4.1 clients are found using the sessionID in the CB_SEQUENCE operation which is also compared against the sessionID associated with the back channel thread after a successful CREATE_SESSION. Each of these methods finds the one and only nfs_client associated with the incoming callback request, so nfs_find_client_next is not needed. In the RPC layer, the pg_authenticate call needs to find the nfs_client. For the v4.0 callback service, the callback identifier has not been decoded so a search by address, version, and minorversion is used. The sessionid for the sessions-based callback service has (usually) not been set for the pg_authenticate on a CB_NULL call which can be sent prior to the return of a CREATE_SESSION call, so the sessionid associated with the back channel thread is not used to find the client in pg_authenticate for CB_NULL calls. Pass the referenced nfs_client to each CB_COMPOUND operation being processed via the new cb_process_state structure. The reference is held across cb_compound processing. Use the new cb_process_state struct to move the NFS4ERR_RETRY_UNCACHED_REP processing from process_op into nfs4_callback_sequence where it belongs. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 21 ++++- fs/nfs/callback.h | 28 +++++-- fs/nfs/callback_proc.c | 167 ++++++++++++++++------------------ fs/nfs/callback_xdr.c | 39 ++++++---- fs/nfs/client.c | 171 +++++++++++++++++++++++++++-------------- fs/nfs/internal.h | 7 +- include/linux/sunrpc/bc_xprt.h | 13 ++++ 7 files changed, 258 insertions(+), 188 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c0b05497972b..15677e7bede5 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -16,9 +16,7 @@ #include #include #include -#if defined(CONFIG_NFS_V4_1) #include -#endif #include @@ -384,6 +382,23 @@ static int check_gss_callback_principal(struct nfs_client *clp, return SVC_OK; } +/* pg_authenticate method helper */ +static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp) +{ + struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp); + int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0; + + dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc); + if (svc_is_backchannel(rqstp)) + /* Sessionid (usually) set after CB_NULL ping */ + return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid, + is_cb_compound); + else + /* No callback identifier in pg_authenticate */ + return nfs4_find_client_no_ident(svc_addr(rqstp)); +} + +/* pg_authenticate method for nfsv4 callback threads.
*/ static int nfs_callback_authenticate(struct svc_rqst *rqstp) { struct nfs_client *clp; @@ -391,7 +406,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) int ret = SVC_OK; /* Don't talk to strangers */ - clp = nfs_find_client(svc_addr(rqstp), 4); + clp = nfs_cb_find_client(rqstp); if (clp == NULL) return SVC_DROP; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 58d61a8ce8b9..25e8802a51d1 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -34,10 +34,17 @@ enum nfs4_callback_opnum { OP_CB_ILLEGAL = 10044, }; +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */ +}; + struct cb_compound_hdr_arg { unsigned int taglen; const char *tag; unsigned int minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ unsigned nops; }; @@ -103,8 +110,9 @@ struct cb_sequenceres { uint32_t csr_target_highestslotid; }; -extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, - struct cb_sequenceres *res); +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid); @@ -118,19 +126,25 @@ struct cb_recallanyargs { uint32_t craa_type_mask; }; -extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); struct cb_recallslotargs { struct sockaddr *crsa_addr; uint32_t crsa_target_max_slots; }; -extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, - void *dummy); +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); #endif /* CONFIG_NFS_V4_1 */ -extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); -extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); #ifdef CONFIG_NFS_V4 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2950fca0c61b..b70e46da16fc 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -16,26 +16,28 @@ #ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK #endif - -__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) { - struct nfs_client *clp; struct nfs_delegation *delegation; struct nfs_inode *nfsi; struct inode *inode; + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. 
Set in cb_sequence for v4.1 */ + goto out; + res->bitmap[0] = res->bitmap[1] = 0; res->status = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) - goto out; dprintk("NFS: GETATTR callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - inode = nfs_delegation_find_inode(clp, &args->fh); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); if (inode == NULL) - goto out_putclient; + goto out; nfsi = NFS_I(inode); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); @@ -55,49 +57,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * out_iput: rcu_read_unlock(); iput(inode); -out_putclient: - nfs_put_client(clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; } -__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; struct inode *inode; __be32 res; - res = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ goto out; dprintk("NFS: RECALL callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - - do { - struct nfs_client *prev = clp; - - inode = nfs_delegation_find_inode(clp, &args->fh); - if (inode != NULL) { - /* Set up a helper thread to actually return the delegation */ - switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { - case 0: - res = 0; - break; - case -ENOENT: - if (res != 0) - res = htonl(NFS4ERR_BAD_STATEID); - break; - default: - res = htonl(NFS4ERR_RESOURCE); - } - iput(inode); - } - clp = nfs_find_client_next(prev); - nfs_put_client(prev); - } while (clp != NULL); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + res = htonl(NFS4ERR_BADHANDLE); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + if (res != 0) + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); return res; @@ -184,42 +178,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) return htonl(NFS4ERR_SEQ_MISORDERED); } -/* - * Returns a pointer to a held 'struct nfs_client' that matches the server's - * address, major version number, and session ID. It is the caller's - * responsibility to release the returned reference. - * - * Returns NULL if there are no connections with sessions, or if no session - * matches the one of interest. 
- */ - static struct nfs_client *find_client_with_session( - const struct sockaddr *addr, u32 nfsversion, - struct nfs4_sessionid *sessionid) -{ - struct nfs_client *clp; - - clp = nfs_find_client(addr, 4); - if (clp == NULL) - return NULL; - - do { - struct nfs_client *prev = clp; - - if (clp->cl_session != NULL) { - if (memcmp(clp->cl_session->sess_id.data, - sessionid->data, - NFS4_MAX_SESSIONID_LEN) == 0) { - /* Returns a held reference to clp */ - return clp; - } - } - clp = nfs_find_client_next(prev); - nfs_put_client(prev); - } while (clp != NULL); - - return NULL; -} - /* * For each referring call triple, check the session's slot table for * a match. If the slot is in use and the sequence numbers match, the @@ -276,20 +234,28 @@ out: } __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, - struct cb_sequenceres *res) + struct cb_sequenceres *res, + struct cb_process_state *cps) { struct nfs_client *clp; int i; __be32 status; + cps->clp = NULL; + status = htonl(NFS4ERR_BADSESSION); - clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); + /* Incoming session must match the callback session */ + if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN)) + goto out; + + clp = nfs4_find_client_sessionid(args->csa_addr, + &args->csa_sessionid, 1); if (clp == NULL) goto out; status = validate_seqid(&clp->cl_session->bc_slot_table, args); if (status) - goto out_putclient; + goto out; /* * Check for pending referring calls. If a match is found, a @@ -298,7 +264,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, */ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { status = htonl(NFS4ERR_DELAY); - goto out_putclient; + goto out; } memcpy(&res->csr_sessionid, &args->csa_sessionid, @@ -307,36 +273,36 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + cps->clp = clp; /* put in nfs4_callback_compound */ -out_putclient: - nfs_put_client(clp); out: for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) - res->csr_status = 0; - else + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else res->csr_status = status; + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, ntohl(status), ntohl(res->csr_status)); return status; } -__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; __be32 status; fmode_t flags = 0; status = htonl(NFS4ERR_OP_NOT_IN_SESSION); - clp = nfs_find_client(args->craa_addr, 4); - if (clp == NULL) + if (!cps->clp) /* set in cb_sequence */ goto out; dprintk("NFS: RECALL_ANY callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) &args->craa_type_mask)) @@ -346,7 +312,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) flags |= FMODE_WRITE; if (flags) - nfs_expire_all_delegation_types(clp, flags); + nfs_expire_all_delegation_types(cps->clp, flags); status = htonl(NFS4_OK); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); @@ 
-354,36 +320,33 @@ out: } /* Reduce the fore channel's max_slots to the target value */ -__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; struct nfs4_slot_table *fc_tbl; __be32 status; status = htonl(NFS4ERR_OP_NOT_IN_SESSION); - clp = nfs_find_client(args->crsa_addr, 4); - if (clp == NULL) + if (!cps->clp) /* set in cb_sequence */ goto out; dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), args->crsa_target_max_slots); - fc_tbl = &clp->cl_session->fc_slot_table; + fc_tbl = &cps->clp->cl_session->fc_slot_table; status = htonl(NFS4ERR_BAD_HIGH_SLOT); if (args->crsa_target_max_slots > fc_tbl->max_slots || args->crsa_target_max_slots < 1) - goto out_putclient; + goto out; status = htonl(NFS4_OK); if (args->crsa_target_max_slots == fc_tbl->max_slots) - goto out_putclient; + goto out; fc_tbl->target_max_slots = args->crsa_target_max_slots; - nfs41_handle_recall_slot(clp); -out_putclient: - nfs_put_client(clp); /* balance nfs_find_client */ + nfs41_handle_recall_slot(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 05af212f0edf..dbd0d649805c 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -10,8 +10,10 @@ #include #include #include +#include #include "nfs4_fs.h" #include "callback.h" +#include "internal.h" #define CB_OP_TAGLEN_MAXSZ (512) #define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) @@ -33,7 +35,8 @@ /* Internal error code */ #define NFS4ERR_RESOURCE_HDR 11050 -typedef __be32 (*callback_process_op_t)(void *, void *); +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); @@ -160,7 +163,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound hdr->minorversion = ntohl(*p++); /* Check minor version is zero or one. 
*/ if (hdr->minorversion <= 1) { - p++; /* skip callback_ident */ + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ } else { printk(KERN_WARNING "%s: NFSv4 server callback with " "illegal minor version %u!\n", @@ -621,7 +624,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) static __be32 process_op(uint32_t minorversion, int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp, int* drc_status) + struct xdr_stream *xdr_out, void *resp, + struct cb_process_state *cps) { struct callback_op *op = &callback_ops[0]; unsigned int op_nr; @@ -644,8 +648,8 @@ static __be32 process_op(uint32_t minorversion, int nop, if (status) goto encode_hdr; - if (*drc_status) { - status = *drc_status; + if (cps->drc_status) { + status = cps->drc_status; goto encode_hdr; } @@ -653,16 +657,10 @@ static __be32 process_op(uint32_t minorversion, int nop, if (maxlen > 0 && maxlen < PAGE_SIZE) { status = op->decode_args(rqstp, xdr_in, argp); if (likely(status == 0)) - status = op->process_op(argp, resp); + status = op->process_op(argp, resp, cps); } else status = htonl(NFS4ERR_RESOURCE); - /* Only set by OP_CB_SEQUENCE processing */ - if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { - *drc_status = status; - status = 0; - } - encode_hdr: res = encode_op_hdr(xdr_out, op_nr, status); if (unlikely(res)) @@ -681,8 +679,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_compound_hdr_arg hdr_arg = { 0 }; struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; - __be32 *p; - __be32 status, drc_status = 0; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + }; unsigned int nops = 0; dprintk("%s: start\n", __func__); @@ -696,6 +697,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (status == __constant_htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); + if (!cps.clp) + return rpc_drop_reply; + } else + cps.svc_sid = bc_xprt_sid(rqstp); + hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) @@ -703,7 +711,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r while (status == 0 && nops != hdr_arg.nops) { status = process_op(hdr_arg.minorversion, nops, rqstp, - &xdr_in, argp, &xdr_out, resp, &drc_status); + &xdr_in, argp, &xdr_out, resp, &cps); nops++; } @@ -716,6 +724,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); + nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index bc3a8620e8c3..11eb9934c747 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -410,70 +410,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, return 0; } -/* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) +/* Common match routine for v4.0 and v4.1 callback services */ +bool +nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, + u32 minorversion) { - struct nfs_client *clp; + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - spin_lock(&nfs_client_lock); - list_for_each_entry(clp, 
&nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; - /* Don't match clients that failed to initialise properly */ - if (!(clp->cl_cons_state == NFS_CS_READY || - clp->cl_cons_state == NFS_CS_SESSION_INITING)) - continue; + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsversion) - continue; - - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - continue; + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; -} - -/* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client_next(struct nfs_client *clp) -{ - struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; - u32 nfsvers = clp->rpc_ops->version; - - spin_lock(&nfs_client_lock); - list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - - /* Don't match clients that failed to initialise properly */ - if (clp->cl_cons_state != NFS_CS_READY) - continue; - - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsvers) - continue; - - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(sap, clap)) - continue; - - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; + return true; } /* @@ -1171,6 +1129,101 @@ error: } #ifdef CONFIG_NFS_V4 +/* + * NFSv4.0 callback thread helper + * + * Find a client by IP address, protocol version, and minorversion + * + * Called from the pg_authenticate method. The callback identifier + * is not used as it has not been decoded. + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_no_ident(const struct sockaddr *addr) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 0) == false) + continue; + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(int cb_ident) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + clp = idr_find(&cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service + * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL + * can arrive before the callback sessionid is set. For CB_NULL calls, + * find a client by IP address protocol version, and minorversion. 
+ * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid, int is_cb_compound) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 1) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid unless cb_null call*/ + if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0)) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid, int is_cb_compound) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * Initialize the NFS4 callback service */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7c803c916574..bfa3a34af801 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -130,8 +130,11 @@ extern struct rpc_program nfs_program; extern void nfs_cleanup_cb_ident_idr(void); extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); -extern struct nfs_client *nfs_find_client_next(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); +extern struct nfs_client *nfs4_find_client_ident(int); +extern struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *, + int); extern struct nfs_server *nfs_create_server( const struct nfs_parsed_mount_data *, struct nfs_fh *); diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index 7c91260c44a9..2c60e094fdec 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -47,6 +47,14 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp) return 1; return 0; } +static inline struct nfs4_sessionid *bc_xprt_sid(struct svc_rqst *rqstp) +{ + if (svc_is_backchannel(rqstp)) + return (struct nfs4_sessionid *) + rqstp->rq_server->bc_xprt->xpt_bc_sid; + return NULL; +} + #else /* CONFIG_NFS_V4_1 */ static inline int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) @@ -59,6 +67,11 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp) return 0; } +static inline struct nfs4_sessionid *bc_xprt_sid(struct svc_rqst *rqstp) +{ + return NULL; +} + static inline void xprt_free_bc_request(struct rpc_rqst *req) { } -- cgit v1.2.3-71-gd317 From 42acd021824578fa0eeb6eb58d457c23ec5dc9c0 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 6 Jan 2011 02:04:34 +0000 Subject: NFS add session back channel draining Currently session draining only drains the fore channel. Back channel processing must also be drained. Use the back channel highest_used_slotid to indicate that a callback is being processed by the callback thread. Move the session 'complete' completion to be per channel. When the session is draining, wait for any current back channel processing to complete and stop all new back channel processing by returning NFS4ERR_DELAY to the back channel client. Drain the back channel, then the fore channel.
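A hedged sketch (editorial, not part of the patch) of how the state manager is assumed to use the two-stage drain around a session reset; the reset helper and exact call ordering are assumptions based on the description above.

/*
 * Illustrative only: the back channel drains first (an in-flight callback
 * holds the single back channel slot), then the fore channel; new
 * CB_SEQUENCE requests receive NFS4ERR_DELAY while NFS4_SESSION_DRAINING
 * is set.
 */
static int example_reset_session(struct nfs_client *clp)
{
	struct nfs4_session *session = clp->cl_session;
	int status;

	status = nfs4_begin_drain_session(clp);
	if (status)
		return status;
	status = nfs4_proc_destroy_session(session);
	if (status == 0)
		status = nfs4_proc_create_session(clp);
	nfs4_end_drain_session(clp);
	return status;
}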
Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback.h | 3 ++- fs/nfs/callback_proc.c | 7 +++++++ fs/nfs/callback_xdr.c | 35 +++++++++++++++++++++++++++++++++++ fs/nfs/nfs4proc.c | 26 +++++++++++++++++++------- fs/nfs/nfs4state.c | 29 ++++++++++++++++++++++------- include/linux/nfs_fs_sb.h | 2 +- 6 files changed, 86 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 25e8802a51d1..b678e3e15bd9 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -138,6 +138,8 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, struct cb_process_state *cps); +extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); +extern void nfs4_cb_take_slot(struct nfs_client *clp); #endif /* CONFIG_NFS_V4_1 */ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, @@ -145,7 +147,6 @@ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_process_state *cps); extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, struct cb_process_state *cps); - #ifdef CONFIG_NFS_V4 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index b70e46da16fc..c1bead2f3e04 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -253,6 +253,12 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (clp == NULL) goto out; + /* state manager is resetting the session */ + if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + status = NFS4ERR_DELAY; + goto out; + } + status = validate_seqid(&clp->cl_session->bc_slot_table, args); if (status) goto out; @@ -273,6 +279,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + nfs4_cb_take_slot(clp); cps->clp = clp; /* put in nfs4_callback_compound */ out: diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index dbd0d649805c..7a2d6c5864ca 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -596,6 +596,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS_OK); } +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing done. 
+ * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid--; + nfs4_check_drain_bc_complete(session); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct nfs_client *clp) +{ + if (clp && clp->cl_session) + nfs4_callback_free_slot(clp->cl_session); +} + +/* A single slot, so highest used slotid is either 0 or -1 */ +void nfs4_cb_take_slot(struct nfs_client *clp) +{ + struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + tbl->highest_used_slotid++; + BUG_ON(tbl->highest_used_slotid != 0); + spin_unlock(&tbl->slot_tbl_lock); +} + #else /* CONFIG_NFS_V4_1 */ static __be32 @@ -604,6 +635,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } +static void nfs4_cb_free_slot(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_V4_1 */ static __be32 @@ -724,6 +758,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); + nfs4_cb_free_slot(cps.clp); nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e165c53db08f..18a4d5a9a4e9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -356,9 +356,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) } /* - * Signal state manager thread if session is drained + * Signal state manager thread if session fore channel is drained */ -static void nfs41_check_drain_session_complete(struct nfs4_session *ses) +static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) { struct rpc_task *task; @@ -372,8 +372,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses) if (ses->fc_slot_table.highest_used_slotid != -1) return; - dprintk("%s COMPLETE: Session Drained\n", __func__); - complete(&ses->complete); + dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); + complete(&ses->fc_slot_table.complete); +} + +/* + * Signal state manager thread if session back channel is drained + */ +void nfs4_check_drain_bc_complete(struct nfs4_session *ses) +{ + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || + ses->bc_slot_table.highest_used_slotid != -1) + return; + dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); + complete(&ses->bc_slot_table.complete); } static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) @@ -390,7 +402,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slot); - nfs41_check_drain_session_complete(res->sr_session); + nfs4_check_drain_fc_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; } @@ -4777,17 +4789,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) if (!session) return NULL; - init_completion(&session->complete); - tbl = &session->fc_slot_table; tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); tbl = &session->bc_slot_table; tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); session->session_state = 1<cl_session; @@ -165,22 +170,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp) } } 
-static int nfs4_begin_drain_session(struct nfs_client *clp) +static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) { - struct nfs4_session *ses = clp->cl_session; - struct nfs4_slot_table *tbl = &ses->fc_slot_table; - spin_lock(&tbl->slot_tbl_lock); - set_bit(NFS4_SESSION_DRAINING, &ses->session_state); if (tbl->highest_used_slotid != -1) { - INIT_COMPLETION(ses->complete); + INIT_COMPLETION(tbl->complete); spin_unlock(&tbl->slot_tbl_lock); - return wait_for_completion_interruptible(&ses->complete); + return wait_for_completion_interruptible(&tbl->complete); } spin_unlock(&tbl->slot_tbl_lock); return 0; } +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int ret = 0; + + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); + /* back channel */ + ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + if (ret) + return ret; + /* fore channel */ + return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); +} + int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { int status; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 1eaa054a2c7d..e93ada0565fc 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -197,6 +197,7 @@ struct nfs4_slot_table { * op for dynamic resizing */ int target_max_slots; /* Set by CB_RECALL_SLOT as * the new max_slots */ + struct completion complete; }; static inline int slot_idx(struct nfs4_slot_table *tbl, struct nfs4_slot *sp) @@ -213,7 +214,6 @@ struct nfs4_session { unsigned long session_state; u32 hash_alg; u32 ssv_len; - struct completion complete; /* The fore and back channel */ struct nfs4_channel_attrs fc_attrs; -- cgit v1.2.3-71-gd317 From 4a19de0f4b693139bb10b7cc3cfe1f618576ba67 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 6 Jan 2011 02:04:35 +0000 Subject: NFS rename client back channel transport field Differentiate from server backchannel Signed-off-by: Andy Adamson Acked-by: Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 12 ++++++------ include/linux/sunrpc/bc_xprt.h | 4 ++-- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 2 +- net/sunrpc/svcsock.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 753a9e315518..199016528fcb 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -142,7 +142,7 @@ int nfs4_set_callback_sessionid(struct nfs_client *clp) struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv; struct nfs4_sessionid *bc_sid; - if (!serv->bc_xprt) + if (!serv->sv_bc_xprt) return -EINVAL; /* on success freed in xprt_free */ @@ -152,12 +152,12 @@ int nfs4_set_callback_sessionid(struct nfs_client *clp) memcpy(bc_sid->data, &clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN); spin_lock_bh(&serv->sv_cb_lock); - serv->bc_xprt->xpt_bc_sid = bc_sid; + serv->sv_bc_xprt->xpt_bc_sid = bc_sid; spin_unlock_bh(&serv->sv_cb_lock); - dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for bc_xprt %p\n", __func__, + dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__, ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1], ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3], - serv->bc_xprt); + serv->sv_bc_xprt); return 0; } @@ -228,8 +228,8 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) init_waitqueue_head(&serv->sv_cb_waitq); rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(rqstp)) { - svc_xprt_put(serv->bc_xprt); - serv->bc_xprt = NULL; + 
svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; } out: dprintk("--> %s return %ld\n", __func__, diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index 2c60e094fdec..c50b458b8a3f 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -43,7 +43,7 @@ int bc_send(struct rpc_rqst *req); */ static inline int svc_is_backchannel(const struct svc_rqst *rqstp) { - if (rqstp->rq_server->bc_xprt) + if (rqstp->rq_server->sv_bc_xprt) return 1; return 0; } @@ -51,7 +51,7 @@ static inline struct nfs4_sessionid *bc_xprt_sid(struct svc_rqst *rqstp) { if (svc_is_backchannel(rqstp)) return (struct nfs4_sessionid *) - rqstp->rq_server->bc_xprt->xpt_bc_sid; + rqstp->rq_server->sv_bc_xprt->xpt_bc_sid; return NULL; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5a3085b9b394..c81d4d8be3a9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -99,7 +99,7 @@ struct svc_serv { spinlock_t sv_cb_lock; /* protects the svc_cb_list */ wait_queue_head_t sv_cb_waitq; /* sleep here if there are no * entries in the svc_cb_list */ - struct svc_xprt *bc_xprt; + struct svc_xprt *sv_bc_xprt; /* callback on fore channel */ #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 261e2d1dff10..0e659c665a8d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1262,7 +1262,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, struct kvec *resv = &rqstp->rq_res.head[0]; /* Build the svc_rqst used by the common processing routine */ - rqstp->rq_xprt = serv->bc_xprt; + rqstp->rq_xprt = serv->sv_bc_xprt; rqstp->rq_xid = req->rq_xid; rqstp->rq_prot = req->rq_xprt->prot; rqstp->rq_server = serv; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index db3013e4aa04..d265aa700bb3 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1595,7 +1595,7 @@ static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv, xprt = &svsk->sk_xprt; svc_xprt_init(&svc_tcp_bc_class, xprt, serv); - serv->bc_xprt = xprt; + serv->sv_bc_xprt = xprt; return xprt; } -- cgit v1.2.3-71-gd317 From daaa82d1c72e10dc16cad3a810e225f9188dc7aa Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 6 Jan 2011 11:36:19 +0000 Subject: pnfs: remove unnecessary field lgp->status Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +-- include/linux/nfs_xdr.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 18a4d5a9a4e9..28e175e74de2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5326,7 +5326,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) return; } } - lgp->status = task->tk_status; dprintk("<-- %s\n", __func__); } @@ -5382,7 +5381,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) status = nfs4_wait_for_completion_rpc_task(task); if (status != 0) goto out; - status = lgp->status; + status = task->tk_status; if (status != 0) goto out; status = pnfs_layout_process(lgp); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 236e7e4b99a0..8fcc54267bba 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -223,7 +223,6 @@ struct nfs4_layoutget { struct nfs4_layoutget_args args; struct nfs4_layoutget_res res; struct pnfs_layout_segment **lsegpp; - int status; }; struct nfs4_getdeviceinfo_args { -- cgit v1.2.3-71-gd317 From cf7d63f1f9895713551df2e6d18b006f8af26e91 Mon Sep 17 
00:00:00 2001 From: Fred Isaman Date: Thu, 6 Jan 2011 11:36:25 +0000 Subject: pnfs: serialize LAYOUTGET(openstateid) We shouldn't send a LAYOUTGET(openstateid) unless all outstanding RPCs using the previous stateid are completed. This requires choosing the stateid to encode earlier, so we can abort if one is not available (we want to use the open stateid, but a LAYOUTGET is already out using it), and adding a count of the number of outstanding rpc calls using layout state (which for now consist solely of LAYOUTGETs). Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 7 ++++++- fs/nfs/nfs4xdr.c | 5 +---- fs/nfs/pnfs.c | 24 +++++++++++++++++++----- fs/nfs/pnfs.h | 1 + include/linux/nfs_xdr.h | 1 + 5 files changed, 28 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5bee453d36d6..a3549ce72ab2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5304,6 +5304,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) if (nfs4_setup_sequence(server, &lgp->args.seq_args, &lgp->res.seq_res, 0, task)) return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); + return; + } rpc_call_start(task); } @@ -5338,7 +5344,6 @@ static void nfs4_layoutget_release(void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - put_layout_hdr(lgp->args.inode); if (lgp->res.layout.buf != NULL) free_page((unsigned long) lgp->res.layout.buf); put_nfs_open_context(lgp->args.ctx); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4e28242360d6..3cbdd0c80a2d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1787,7 +1787,6 @@ encode_layoutget(struct xdr_stream *xdr, const struct nfs4_layoutget_args *args, struct compound_hdr *hdr) { - nfs4_stateid stateid; __be32 *p; p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); @@ -1798,9 +1797,7 @@ encode_layoutget(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->range.offset); p = xdr_encode_hyper(p, args->range.length); p = xdr_encode_hyper(p, args->minlength); - pnfs_choose_layoutget_stateid(&stateid, NFS_I(args->inode)->layout, - args->ctx->state); - p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); *p = cpu_to_be32(args->maxcount); dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 212cbc22c59d..59ed68bf79fa 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -371,6 +371,14 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); } +/* lget is set to 1 if called from inside send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, int lget) +{ + return (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)); +} + int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, struct nfs4_state *open_state) @@ -379,7 +387,9 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (list_empty(&lo->plh_segs)) { + if (pnfs_layoutgets_blocked(lo, 1)) { + status = -EAGAIN; + } else if (list_empty(&lo->plh_segs)) { int seq; do { @@ -414,10 +424,8 @@ send_layoutget(struct pnfs_layout_hdr *lo, BUG_ON(ctx == NULL); lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); - if (lgp == 
NULL) { - put_layout_hdr(lo->plh_inode); + if (lgp == NULL) return NULL; - } lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range.iomode = iomode; @@ -613,10 +621,16 @@ pnfs_update_layout(struct inode *ino, if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) goto out_unlock; - get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ + if (pnfs_layoutgets_blocked(lo, 0)) + goto out_unlock; + atomic_inc(&lo->plh_outstanding); + + get_layout_hdr_locked(lo); spin_unlock(&ino->i_lock); lseg = send_layoutget(lo, ctx, iomode); + atomic_dec(&lo->plh_outstanding); + put_layout_hdr(ino); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, nfsi->layout->plh_flags, lseg); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 787253e6fca3..698380da24cc 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -69,6 +69,7 @@ struct pnfs_layout_hdr { struct list_head plh_layouts; /* other client layouts */ struct list_head plh_segs; /* layout segments list */ nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ unsigned long plh_flags; struct inode *plh_inode; }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8fcc54267bba..83d36d3a12e6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -208,6 +208,7 @@ struct nfs4_layoutget_args { struct inode *inode; struct nfs_open_context *ctx; struct nfs4_sequence_args seq_args; + nfs4_stateid stateid; }; struct nfs4_layoutget_res { -- cgit v1.2.3-71-gd317 From f7e8917a67980924651a9e244510e63ef05c7755 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 6 Jan 2011 11:36:32 +0000 Subject: pnfs: layout roc code A layout can request return-on-close. How this interacts with the forgetful model of never sending LAYOUTRETURNS is a bit ambiguous. We forget any layouts marked roc, and wait for them to be completely forgotten before continuing with the close. In addition, to compensate for races with any inflight LAYOUTGETs, and the fact that we do not get any layout stateid back from the server, we set the barrier to the worst case scenario of current_seqid + number of outstanding LAYOUTGETS. 
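Editor's note: the worst-case barrier mentioned above can be illustrated with two small helpers. This is only a sketch of the arithmetic, assuming the same "ignore lower seqids" convention that the plh_barrier field's comment describes; the function names below are invented, not taken from the patch.

#include <stdint.h>
#include <stdbool.h>

/* CLOSE returns no layout stateid, so pick the worst-case barrier. */
static uint32_t roc_barrier(uint32_t current_seqid, int outstanding_layoutgets)
{
	return current_seqid + (uint32_t)outstanding_layoutgets;
}

/*
 * A racing LAYOUTGET reply at or below the barrier must be ignored.
 * The signed cast keeps the comparison correct across u32 wraparound.
 */
static bool layout_stateid_is_stale(uint32_t barrier, uint32_t reply_seqid)
{
	return (int32_t)(barrier - reply_seqid) >= 0;
}

For example, with a current_seqid of 5 and two LAYOUTGETs still in flight, roc_barrier() yields 7, so replies carrying seqid 6 or 7 would be treated as stale even though they arrive after the close.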
Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 12 +++++++ fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 21 ++++++++++-- fs/nfs/nfs4state.c | 7 ++-- fs/nfs/pnfs.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++- fs/nfs/pnfs.h | 29 ++++++++++++++++ include/linux/nfs_fs_sb.h | 1 + 7 files changed, 152 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 11eb9934c747..684b67771199 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -244,6 +244,11 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) idr_remove(&cb_ident_idr, clp->cl_cb_ident); } +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + #else static void nfs4_shutdown_client(struct nfs_client *clp) { @@ -256,6 +261,11 @@ void nfs_cleanup_cb_ident_idr(void) static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { } + +static void pnfs_init_server(struct nfs_server *server) +{ +} + #endif /* CONFIG_NFS_V4 */ /* @@ -1024,6 +1034,8 @@ static struct nfs_server *nfs_alloc_server(void) return NULL; } + pnfs_init_server(server); + return server; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8f169dc789db..18d64cb5985b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -236,7 +236,7 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); +extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a3549ce72ab2..88f590feeb72 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1839,6 +1839,8 @@ struct nfs4_closedata { struct nfs_closeres res; struct nfs_fattr fattr; unsigned long timestamp; + bool roc; + u32 roc_barrier; }; static void nfs4_free_closedata(void *data) @@ -1846,6 +1848,8 @@ static void nfs4_free_closedata(void *data) struct nfs4_closedata *calldata = data; struct nfs4_state_owner *sp = calldata->state->owner; + if (calldata->roc) + pnfs_roc_release(calldata->state->inode); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); @@ -1878,6 +1882,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) */ switch (task->tk_status) { case 0: + if (calldata->roc) + pnfs_roc_set_barrier(state->inode, + calldata->roc_barrier); nfs_set_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); nfs4_close_clear_stateid_flags(state, @@ -1930,8 +1937,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) return; } - if (calldata->arg.fmode == 0) + if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { + rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, + task, NULL); + return; + } + } nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; @@ -1959,7 
+1973,7 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! */ -int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) +int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -1994,6 +2008,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; + calldata->roc = roc; path_get(path); calldata->path = *path; @@ -2011,6 +2026,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i out_free_calldata: kfree(calldata); out: + if (roc) + pnfs_roc_release(state->inode); nfs4_put_open_state(state); nfs4_put_state_owner(sp); return status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6891dedd80f1..286084f148e3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -606,8 +606,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, if (!call_close) { nfs4_put_open_state(state); nfs4_put_state_owner(owner); - } else - nfs4_do_close(path, state, gfp_mask, wait); + } else { + bool roc = pnfs_roc(state->inode); + + nfs4_do_close(path, state, gfp_mask, wait, roc); + } } void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bf4186b8f2fc..bc4089769735 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -256,6 +256,7 @@ put_lseg_locked(struct pnfs_layout_segment *lseg, spin_unlock(&clp->cl_lock); clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); } + rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); list_add(&lseg->pls_list, tmp_list); return 1; } @@ -401,7 +402,8 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, if ((stateid) && (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) return true; - return test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && (atomic_read(&lo->plh_outstanding) > lget)); } @@ -474,6 +476,83 @@ send_layoutget(struct pnfs_layout_hdr *lo, return lseg; } +bool pnfs_roc(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; + LIST_HEAD(tmp_list); + bool found = false; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_nolayout; + list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + mark_lseg_invalid(lseg, &tmp_list); + found = true; + } + if (!found) + goto out_nolayout; + lo->plh_block_lgets++; + get_layout_hdr(lo); /* matched in pnfs_roc_release */ + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + return true; + +out_nolayout: + spin_unlock(&ino->i_lock); + return false; +} + +void pnfs_roc_release(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + lo->plh_block_lgets--; + put_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if ((int)(barrier - 
lo->plh_barrier) > 0) + lo->plh_barrier = barrier; + spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_segment *lseg; + bool found = false; + + spin_lock(&ino->i_lock); + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + found = true; + break; + } + if (!found) { + struct pnfs_layout_hdr *lo = nfsi->layout; + u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); + + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); + } + spin_unlock(&ino->i_lock); + return found; +} + /* * Compare two layout segments for sorting into layout cache. * We want to preferentially return RW over RO layouts, so ensure those @@ -732,6 +811,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) *lgp->lsegpp = lseg; pnfs_insert_layout(lo, lseg); + if (res->return_on_close) { + set_bit(NFS_LSEG_ROC, &lseg->pls_flags); + set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); + } + /* Done processing layoutget. Set the layout stateid */ pnfs_set_layout_stateid(lo, &res->stateid, false); spin_unlock(&ino->i_lock); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f91d0d45551c..e2612ea0cbed 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -32,6 +32,7 @@ enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ }; struct pnfs_layout_segment { @@ -50,6 +51,7 @@ enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ }; @@ -72,6 +74,7 @@ struct pnfs_layout_hdr { struct list_head plh_segs; /* layout segments list */ nfs4_stateid plh_stateid; atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ u32 plh_barrier; /* ignore lower seqids */ unsigned long plh_flags; struct inode *plh_inode; @@ -162,6 +165,10 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, u32 iomode); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier); static inline int lo_fail_bit(u32 iomode) @@ -193,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, return NULL; } +static inline bool +pnfs_roc(struct inode *ino) +{ + return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool +pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + return false; +} + static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) { } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index e93ada0565fc..7f20c0b47a91 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -149,6 +149,7 @@ struct nfs_server { that are supported on this filesystem */ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ + struct rpc_wait_queue roc_rpcwaitq; #endif void (*destroy)(struct nfs_server *); -- cgit 
v1.2.3-71-gd317 From 24d292b894273495f9664bb495e575f8cb7e8cac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 24 Dec 2010 01:32:43 +0000 Subject: NFS: Move cl_state_owners and related fields to the nfs_server struct NFSv4 migration needs to reassociate state owners from the source to the destination nfs_server data structures. To make that easier, move the cl_state_owners field to the nfs_server struct. cl_openowner_id and cl_lockowner_id accompany this move, as they are used in conjunction with cl_state_owners. The cl_lock field in the parent nfs_client continues to protect all three of these fields. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4state.c | 251 +++++++++++++++++++++++++++++++++------------- include/linux/nfs_fs_sb.h | 9 +- 3 files changed, 186 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 18d64cb5985b..7a7474073148 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -110,7 +110,7 @@ struct nfs_unique_id { struct nfs4_state_owner { struct nfs_unique_id so_owner_id; struct nfs_server *so_server; - struct rb_node so_client_node; + struct rb_node so_server_node; struct rpc_cred *so_cred; /* Associated cred */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 286084f148e3..2336d532cf66 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp) put_rpccred(cred); } -struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server) { + struct rpc_cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; - struct rpc_cred *cred = NULL; - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); if (list_empty(&sp->so_states)) continue; cred = get_rpccred(sp->so_cred); @@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) return cred; } +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + * Caller must hold clp->cl_lock. 
+ */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_renew_cred_server_locked(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + return cred; +} + #if defined(CONFIG_NFS_V4_1) static int nfs41_setup_state_renewal(struct nfs_client *clp) @@ -231,28 +256,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) #endif /* CONFIG_NFS_V4_1 */ -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +static struct rpc_cred * +nfs4_get_setclientid_cred_server(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; + struct rpc_cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; + + spin_lock(&clp->cl_lock); + pos = rb_first(&server->state_owners); + if (pos != NULL) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + cred = get_rpccred(sp->so_cred); + } + spin_unlock(&clp->cl_lock); + return cred; +} + +/** + * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + */ +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +{ + struct nfs_server *server; struct rpc_cred *cred; spin_lock(&clp->cl_lock); cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); if (cred != NULL) goto out; - pos = rb_first(&clp->cl_state_owners); - if (pos != NULL) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - cred = get_rpccred(sp->so_cred); + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_setclientid_cred_server(server); + if (cred != NULL) + break; } + rcu_read_unlock(); + out: - spin_unlock(&clp->cl_lock); return cred; } -static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, - __u64 minval, int maxbits) +static void nfs_alloc_unique_id_locked(struct rb_root *root, + struct nfs_unique_id *new, + __u64 minval, int maxbits) { struct rb_node **p, *parent; struct nfs_unique_id *pos; @@ -307,16 +360,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) } static struct nfs4_state_owner * -nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { - struct nfs_client *clp = server->nfs_client; - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp, *res = NULL; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); if (server < sp->so_server) { p = &parent->rb_left; @@ -340,24 +392,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) } static struct nfs4_state_owner * -nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) { - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct nfs_server *server = new->so_server; + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct 
nfs4_state_owner, so_server_node); - if (new->so_server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (new->so_server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (new->so_cred < sp->so_cred) p = &parent->rb_left; else if (new->so_cred > sp->so_cred) @@ -367,18 +412,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) return sp; } } - nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); - rb_link_node(&new->so_client_node, parent, p); - rb_insert_color(&new->so_client_node, &clp->cl_state_owners); + nfs_alloc_unique_id_locked(&server->openowner_id, + &new->so_owner_id, 1, 64); + rb_link_node(&new->so_server_node, parent, p); + rb_insert_color(&new->so_server_node, &server->state_owners); return new; } static void -nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) +nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); + struct nfs_server *server = sp->so_server; + + if (!RB_EMPTY_NODE(&sp->so_server_node)) + rb_erase(&sp->so_server_node, &server->state_owners); + nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id); } /* @@ -407,23 +455,32 @@ nfs4_alloc_state_owner(void) static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) { - struct nfs_client *clp = sp->so_server->nfs_client; + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - RB_CLEAR_NODE(&sp->so_client_node); + rb_erase(&sp->so_server_node, &server->state_owners); + RB_CLEAR_NODE(&sp->so_server_node); spin_unlock(&clp->cl_lock); } } -struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. 
+ */ +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, + struct rpc_cred *cred) { struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; spin_lock(&clp->cl_lock); - sp = nfs4_find_state_owner(server, cred); + sp = nfs4_find_state_owner_locked(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) return sp; @@ -433,7 +490,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner(clp, new); + sp = nfs4_insert_state_owner_locked(new); spin_unlock(&clp->cl_lock); if (sp == new) get_rpccred(cred); @@ -444,6 +501,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct return sp; } +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { struct nfs_client *clp = sp->so_server->nfs_client; @@ -451,7 +513,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - nfs4_remove_state_owner(clp, sp); + nfs4_remove_state_owner_locked(sp); spin_unlock(&clp->cl_lock); rpc_destroy_wait_queue(&sp->so_sequence.wait); put_rpccred(cred); @@ -657,7 +719,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *lsp; - struct nfs_client *clp = state->owner->so_server->nfs_client; + struct nfs_server *server = state->owner->so_server; + struct nfs_client *clp = server->nfs_client; lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) @@ -681,7 +744,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f return NULL; } spin_lock(&clp->cl_lock); - nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); spin_unlock(&clp->cl_lock); INIT_LIST_HEAD(&lsp->ls_locks); return lsp; @@ -689,10 +752,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) { - struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); + nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id); spin_unlock(&clp->cl_lock); rpc_destroy_wait_queue(&lsp->ls_sequence.wait); kfree(lsp); @@ -1138,15 +1202,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state) } } -static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +static void nfs4_reset_seqids(struct nfs_server *server, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) { + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - /* Reset all sequence ids to zero */ - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct 
nfs4_state_owner, so_server_node); sp->so_seqid.flags = 0; spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { @@ -1155,6 +1223,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re } spin_unlock(&sp->so_lock); } + spin_unlock(&clp->cl_lock); +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_reset_seqids(server, mark_reclaim); + rcu_read_unlock(); } static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) @@ -1172,25 +1252,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp, (void)ops->reclaim_complete(clp); } -static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +static void nfs4_clear_reclaim_server(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - return 0; - - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { - if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, + &state->flags)) continue; nfs4_state_mark_reclaim_nograce(clp, state); } spin_unlock(&sp->so_lock); } + spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ + struct nfs_server *server; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_clear_reclaim_server(server); + rcu_read_unlock(); nfs_delegation_reap_unclaimed(clp); return 1; @@ -1262,27 +1358,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) { + struct nfs4_state_owner *sp; + struct nfs_server *server; struct rb_node *pos; int status = 0; restart: - spin_lock(&clp->cl_lock); - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) - continue; - atomic_inc(&sp->so_count); - spin_unlock(&clp->cl_lock); - status = nfs4_reclaim_open_state(sp, ops); - if (status < 0) { - set_bit(ops->owner_flag_bit, &sp->so_flags); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, + struct nfs4_state_owner, so_server_node); + if (!test_and_clear_bit(ops->owner_flag_bit, + &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + return 
nfs4_recovery_handle_error(clp, status); + } + nfs4_put_state_owner(sp); - return nfs4_recovery_handle_error(clp, status); + goto restart; } - nfs4_put_state_owner(sp); - goto restart; + spin_unlock(&clp->cl_lock); } - spin_unlock(&clp->cl_lock); + rcu_read_unlock(); return status; } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 7f20c0b47a91..27255ffdbe10 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -47,11 +47,7 @@ struct nfs_client { u64 cl_clientid; /* constant */ unsigned long cl_state; - struct rb_root cl_openowner_id; - struct rb_root cl_lockowner_id; - struct list_head cl_delegations; - struct rb_root cl_state_owners; spinlock_t cl_lock; unsigned long cl_lease_time; @@ -150,6 +146,11 @@ struct nfs_server { filesystem */ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ struct rpc_wait_queue roc_rpcwaitq; + + /* the following fields are protected by nfs_client->cl_lock */ + struct rb_root state_owners; + struct rb_root openowner_id; + struct rb_root lockowner_id; #endif void (*destroy)(struct nfs_server *); -- cgit v1.2.3-71-gd317 From d3978bb325510f0a26ebd92f211b36c5f98b2306 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 24 Dec 2010 01:33:04 +0000 Subject: NFS: Move cl_delegations to the nfs_server struct Delegations are per-inode, not per-nfs_client. When a server file system is migrated, delegations on the client must be moved from the source to the destination nfs_server. Make it easier to manage a mount point's delegation list across a migration event by moving the list to the nfs_server struct. Clean up: I added documenting comments to public functions I changed in this patch. For consistency I added comments to all the other public functions in fs/nfs/delegation.c. 
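Editor's note: moving the delegation list into nfs_server turns every "walk all of a client's delegations" operation into a two-level loop. The following is a minimal userspace model of that shape, using plain singly linked lists instead of the kernel's RCU-protected list heads; all *_model types and helper names here are illustrative only.

#include <stddef.h>

struct delegation_model {
	struct delegation_model	*next;
	int			need_reclaim;
};

struct server_model {			/* one per mounted filesystem */
	struct server_model	*next;
	struct delegation_model	*delegations;
};

struct client_model {			/* one per nfs_client */
	struct server_model	*servers;
};

/* Same shape as nfs_delegation_mark_reclaim() after this patch. */
static void mark_all_delegations_reclaim(struct client_model *clp)
{
	struct server_model *server;
	struct delegation_model *d;

	for (server = clp->servers; server != NULL; server = server->next)
		for (d = server->delegations; d != NULL; d = d->next)
			d->need_reclaim = 1;
}

/* Same shape as the new nfs_delegations_present() helper. */
static int delegations_present(const struct client_model *clp)
{
	const struct server_model *server;

	for (server = clp->servers; server != NULL; server = server->next)
		if (server->delegations != NULL)
			return 1;
	return 0;
}

The per-server lists are what the changelog is after: managing a mount point's delegations across a migration event no longer requires filtering a single client-wide list.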
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- fs/nfs/delegation.c | 337 +++++++++++++++++++++++++++++++++------------- fs/nfs/delegation.h | 1 + fs/nfs/nfs4renewd.c | 2 +- include/linux/nfs_fs_sb.h | 2 +- 5 files changed, 251 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 32b5fbfab35e..192f2f860265 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -172,7 +172,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ if (err) goto error_cleanup; - INIT_LIST_HEAD(&clp->cl_delegations); spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); @@ -1040,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void) /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); atomic_set(&server->active, 0); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 521d71b81825..364e4328f392 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -40,11 +40,23 @@ static void nfs_free_delegation(struct nfs_delegation *delegation) call_rcu(&delegation->rcu, nfs_free_delegation_callback); } +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) { set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. + */ int nfs_have_delegation(struct inode *inode, fmode_t flags) { struct nfs_delegation *delegation; @@ -119,10 +131,15 @@ again: return 0; } -/* - * Set up a delegation on an inode +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * */ -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) { struct nfs_delegation *delegation; struct rpc_cred *oldcred = NULL; @@ -177,11 +194,11 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation static struct nfs_delegation * nfs_detach_delegation_locked(struct nfs_inode *nfsi, - struct nfs_client *clp) + struct nfs_server *server) { struct nfs_delegation *delegation = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&clp->cl_lock)); + lockdep_is_held(&server->nfs_client->cl_lock)); if (delegation == NULL) goto nomatch; @@ -198,22 +215,29 @@ nomatch: } static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, - struct nfs_client *clp) + struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, clp); + delegation = nfs_detach_delegation_locked(nfsi, server); spin_unlock(&clp->cl_lock); return delegation; } -/* - * Set up a delegation on an inode +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new 
delegation state from server + * + * Returns zero on success, or a negative errno value. */ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation, *old_delegation; struct nfs_delegation *freeme = NULL; @@ -234,7 +258,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct spin_lock(&clp->cl_lock); old_delegation = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&clp->cl_lock)); + lockdep_is_held(&clp->cl_lock)); if (old_delegation != NULL) { if (memcmp(&delegation->stateid, &old_delegation->stateid, sizeof(old_delegation->stateid)) == 0 && @@ -253,9 +277,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, clp); + freeme = nfs_detach_delegation_locked(nfsi, server); } - list_add_rcu(&delegation->super_list, &clp->cl_delegations); + list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -297,67 +321,85 @@ out: return err; } -/* - * Return all delegations that have been marked for return +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Returns zero on success, or a negative errno value. */ int nfs_client_return_marked_delegations(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_server *server; struct inode *inode; int err = 0; restart: rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) - continue; - delegation = nfs_detach_delegation(NFS_I(inode), clp); - rcu_read_unlock(); - if (delegation != NULL) { - filemap_flush(inode->i_mapping); - err = __nfs_inode_return_delegation(inode, delegation, 0); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!test_and_clear_bit(NFS_DELEGATION_RETURN, + &delegation->flags)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, + delegation, 0); + } + iput(inode); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; } - iput(inode); - if (!err) - goto restart; - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); - return err; } rcu_read_unlock(); return 0; } -/* - * This function returns the delegation without reclaiming opens - * or protecting against delegation reclaims. - * It is therefore really only safe to be called from - * nfs4_clear_inode() +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). 
*/ void nfs_inode_return_delegation_noreclaim(struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, clp); + delegation = nfs_detach_delegation(nfsi, server); if (delegation != NULL) nfs_do_return_delegation(inode, delegation, 0); } } +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * Returns zero on success, or a negative errno value. + */ int nfs_inode_return_delegation(struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int err = 0; if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, clp); + delegation = nfs_detach_delegation(nfsi, server); if (delegation != NULL) { nfs_wb_all(inode); err = __nfs_inode_return_delegation(inode, delegation, 1); @@ -366,46 +408,61 @@ int nfs_inode_return_delegation(struct inode *inode) return err; } -static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) +static void nfs_mark_return_delegation(struct nfs_delegation *delegation) { + struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client; + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } -/* - * Return all delegations associated to a super block +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * */ void nfs_super_return_all_delegations(struct super_block *sb) { - struct nfs_client *clp = NFS_SB(sb)->nfs_client; + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; if (clp == NULL) return; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); - if (delegation->inode != NULL && delegation->inode->i_sb == sb) - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); spin_unlock(&delegation->lock); } rcu_read_unlock(); + if (nfs_client_return_marked_delegations(clp) != 0) nfs4_schedule_state_manager(clp); } -static -void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) +static void nfs_mark_return_all_delegation_types(struct nfs_server *server, + fmode_t flags) { struct nfs_delegation *delegation; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); } +} + +static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_all_delegation_types(server, flags); rcu_read_unlock(); } @@ -420,19 +477,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp) 
nfs4_schedule_state_manager(clp); } +/** + * nfs_expire_all_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * + */ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) { nfs_client_mark_return_all_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ void nfs_expire_all_delegations(struct nfs_client *clp) { nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); } -/* - * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. +/** + * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * */ void nfs_handle_cb_pathdown(struct nfs_client *clp) { @@ -441,29 +511,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp) nfs_client_mark_return_all_delegations(clp); } -static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); } - rcu_read_unlock(); } +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) { - nfs_client_mark_return_unreferenced_delegations(clp); + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); + nfs_delegation_run_state_manager(clp); } -/* - * Asynchronous delegation recall! +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information from CB_RECALL arguments + * + * Returns zero on success, or a negative errno value. 
*/ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; @@ -475,22 +559,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s rcu_read_unlock(); return -ENOENT; } - - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); rcu_read_unlock(); + nfs_delegation_run_state_manager(clp); return 0; } -/* - * Retrieve the inode associated with a delegation - */ -struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) { struct nfs_delegation *delegation; struct inode *res = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); if (delegation->inode != NULL && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { @@ -500,47 +583,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs if (res != NULL) break; } + return res; +} + +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. + */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) +{ + struct nfs_server *server; + struct inode *res = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) + break; + } rcu_read_unlock(); return res; } -/* - * Mark all delegations as needing to be reclaimed +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * */ void nfs_delegation_mark_reclaim(struct nfs_client *clp) { - struct nfs_delegation *delegation; + struct nfs_server *server; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) - set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); rcu_read_unlock(); } -/* - * Reap all unclaimed delegations after reboot recovery is done +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * */ void nfs_delegation_reap_unclaimed(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_server *server; struct inode *inode; + restart: rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) - continue; - delegation = nfs_detach_delegation(NFS_I(inode), clp); - rcu_read_unlock(); - if 
(delegation != NULL) - nfs_free_delegation(delegation); - iput(inode); - goto restart; + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) + nfs_free_delegation(delegation); + iput(inode); + goto restart; + } } rcu_read_unlock(); } +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * + * Returns one and fills in "dst->data" * if inode had a delegation, + * otherwise zero is returned. + */ int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 2026304bda19..d9322e490c56 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); void nfs_handle_cb_pathdown(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index cde5650ee5a2..402143d75fc5 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -80,7 +80,7 @@ nfs4_renew_state(struct work_struct *work) cred = ops->get_state_renewal_cred_locked(clp); spin_unlock(&clp->cl_lock); if (cred == NULL) { - if (list_empty(&clp->cl_delegations)) { + if (!nfs_delegations_present(clp)) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); goto out; } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 27255ffdbe10..b197563913bf 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -47,7 +47,6 @@ struct nfs_client { u64 cl_clientid; /* constant */ unsigned long cl_state; - struct list_head cl_delegations; spinlock_t cl_lock; unsigned long cl_lease_time; @@ -152,6 +151,7 @@ struct nfs_server { struct rb_root openowner_id; struct rb_root lockowner_id; #endif + struct list_head delegations; void (*destroy)(struct nfs_server *); atomic_t active; /* Keep trace of any activity to this server */ -- cgit v1.2.3-71-gd317 From d035c36c58dd9183ad6aa7875dea89893faedb55 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Dec 2010 10:45:27 -0500 Subject: NFSv4: Ensure continued open and lockowner name uniqueness In order to enable migration support, we will want to move some of the structures that are subject to migration into the struct nfs_server. 
In particular, if we are to move the state_owner and state_owner_id to being a per-filesystem structure, then we should label the resulting open/lock owners with a per-filesytem label to ensure global uniqueness. This patch does so by adding the super block s_dev to the open/lock owner name. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ fs/nfs/nfs4xdr.c | 14 ++++++++------ include/linux/nfs_xdr.h | 1 + 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 88f590feeb72..f2b92f6a7efb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3779,6 +3779,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock goto out; lsp = request->fl_u.nfs4_fl.owner; arg.lock_owner.id = lsp->ls_id.id; + arg.lock_owner.s_dev = server->s_dev; status = nfs4_call_sync(server, &msg, &arg, &res, 1); switch (status) { case 0: @@ -4024,6 +4025,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; + p->arg.lock_owner.s_dev = server->s_dev; p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; @@ -4428,6 +4430,7 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) return; args->lock_owner.clientid = server->nfs_client->cl_clientid; args->lock_owner.id = lsp->ls_id.id; + args->lock_owner.s_dev = server->s_dev; msg.rpc_argp = args; rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3cbdd0c80a2d..8e496887ec61 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int); /* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define open_owner_id_maxsz (1 + 4) -#define lock_owner_id_maxsz (1 + 4) +#define open_owner_id_maxsz (1 + 1 + 4) +#define lock_owner_id_maxsz (1 + 1 + 4) #define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) @@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo { __be32 *p; - p = reserve_space(xdr, 28); + p = reserve_space(xdr, 32); p = xdr_encode_hyper(p, lowner->clientid); - *p++ = cpu_to_be32(16); + *p++ = cpu_to_be32(20); p = xdr_encode_opaque_fixed(p, "lock id:", 8); + *p++ = cpu_to_be32(lowner->s_dev); xdr_encode_hyper(p, lowner->id); } @@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena *p++ = cpu_to_be32(OP_OPEN); *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - p = reserve_space(xdr, 28); + p = reserve_space(xdr, 32); p = xdr_encode_hyper(p, arg->clientid); - *p++ = cpu_to_be32(16); + *p++ = cpu_to_be32(20); p = xdr_encode_opaque_fixed(p, "open id:", 8); + *p++ = cpu_to_be32(arg->server->s_dev); xdr_encode_hyper(p, arg->id); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 83d36d3a12e6..b0068579bec2 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -317,6 +317,7 @@ struct nfs_closeres { struct nfs_lowner { __u64 clientid; __u64 id; + dev_t s_dev; }; struct nfs_lock_args { -- cgit v1.2.3-71-gd317 From 26fcaf60fe3861409eb4c455c5c0d0f00f599b08 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 7 Jan 
2011 01:42:31 +0100 Subject: PM: Fix oops in suspend/hibernate code related to failing ioremap() When ioremap() fails (which might happen for some reason), we nicely oops in suspend_nvs_save() due to NULL dereference by memcpy() in there. Fail gracefully instead. Signed-off-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- drivers/acpi/sleep.c | 5 ++--- include/linux/suspend.h | 4 ++-- kernel/power/nvs.c | 8 +++++++- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index febb153b5a68..d8bca6c90719 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -124,8 +124,7 @@ static int acpi_pm_freeze(void) static int acpi_pm_pre_suspend(void) { acpi_pm_freeze(); - suspend_nvs_save(); - return 0; + return suspend_nvs_save(); } /** @@ -151,7 +150,7 @@ static int acpi_pm_prepare(void) { int error = __acpi_pm_prepare(); if (!error) - acpi_pm_pre_suspend(); + error = acpi_pm_pre_suspend(); return error; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 26697514c5ec..acb7d911bb0c 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -262,7 +262,7 @@ static inline bool system_entering_hibernation(void) { return false; } extern int suspend_nvs_register(unsigned long start, unsigned long size); extern int suspend_nvs_alloc(void); extern void suspend_nvs_free(void); -extern void suspend_nvs_save(void); +extern int suspend_nvs_save(void); extern void suspend_nvs_restore(void); #else /* CONFIG_SUSPEND_NVS */ static inline int suspend_nvs_register(unsigned long a, unsigned long b) @@ -271,7 +271,7 @@ static inline int suspend_nvs_register(unsigned long a, unsigned long b) } static inline int suspend_nvs_alloc(void) { return 0; } static inline void suspend_nvs_free(void) {} -static inline void suspend_nvs_save(void) {} +static inline int suspend_nvs_save(void) {} static inline void suspend_nvs_restore(void) {} #endif /* CONFIG_SUSPEND_NVS */ diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c index 1836db60bbb6..57c6fabbb6b6 100644 --- a/kernel/power/nvs.c +++ b/kernel/power/nvs.c @@ -105,7 +105,7 @@ int suspend_nvs_alloc(void) /** * suspend_nvs_save - save NVS memory regions */ -void suspend_nvs_save(void) +int suspend_nvs_save(void) { struct nvs_page *entry; @@ -114,8 +114,14 @@ void suspend_nvs_save(void) list_for_each_entry(entry, &nvs_list, node) if (entry->data) { entry->kaddr = ioremap(entry->phys_start, entry->size); + if (!entry->kaddr) { + suspend_nvs_free(); + return -ENOMEM; + } memcpy(entry->data, entry->kaddr, entry->size); } + + return 0; } /** -- cgit v1.2.3-71-gd317 From 976513dbfc1547c7b1822566923058655f0c32fd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jan 2011 01:43:44 +0100 Subject: PM / ACPI: Move NVS saving and restoring code to drivers/acpi The saving of the ACPI NVS area during hibernation and suspend and restoring it during the subsequent resume is entirely specific to ACPI, so move it to drivers/acpi and drop the CONFIG_SUSPEND_NVS configuration option which is redundant. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Len Brown --- arch/x86/kernel/e820.c | 1 + drivers/acpi/Makefile | 2 +- drivers/acpi/internal.h | 8 +++ drivers/acpi/nvs.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 9 +++ include/linux/suspend.h | 17 ------ kernel/power/Kconfig | 5 -- kernel/power/Makefile | 1 - kernel/power/nvs.c | 142 ------------------------------------------------ 9 files changed, 161 insertions(+), 166 deletions(-) create mode 100644 drivers/acpi/nvs.c delete mode 100644 kernel/power/nvs.c (limited to 'include/linux') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c2b7ef7a34d..294f26da0c0c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 3d031d02e54b..9cc9f2c4da79 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -24,7 +24,7 @@ acpi-y += atomicio.o # sleep related files acpi-y += wakeup.o acpi-y += sleep.o -acpi-$(CONFIG_ACPI_SLEEP) += proc.o +acpi-$(CONFIG_ACPI_SLEEP) += proc.o nvs.o # diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index a212bfeddf8c..7c23b76e8eca 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -82,8 +82,16 @@ extern int acpi_sleep_init(void); #ifdef CONFIG_ACPI_SLEEP int acpi_sleep_proc_init(void); +int suspend_nvs_alloc(void); +void suspend_nvs_free(void); +int suspend_nvs_save(void); +void suspend_nvs_restore(void); #else static inline int acpi_sleep_proc_init(void) { return 0; } +static inline int suspend_nvs_alloc(void) { return 0; } +static inline void suspend_nvs_free(void) {} +static inline int suspend_nvs_save(void) {} +static inline void suspend_nvs_restore(void) {} #endif #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c new file mode 100644 index 000000000000..57c6fabbb6b6 --- /dev/null +++ b/drivers/acpi/nvs.c @@ -0,0 +1,142 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * suspend and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * suspend_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int suspend_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? 
size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * suspend_nvs_free - free data pages allocated for saving NVS regions + */ +void suspend_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int suspend_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + suspend_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * suspend_nvs_save - save NVS memory regions + */ +int suspend_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + if (!entry->kaddr) { + suspend_nvs_free(); + return -ENOMEM; + } + memcpy(entry->data, entry->kaddr, entry->size); + } + + return 0; +} + +/** + * suspend_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void suspend_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 67c91b4418b0..fa7ed6a983d0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -254,6 +254,15 @@ void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); #endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_ACPI_SLEEP +int suspend_nvs_register(unsigned long start, unsigned long size); +#else +static inline int suspend_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +#endif + struct acpi_osc_context { char *uuid_str; /* uuid string */ int rev; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index acb7d911bb0c..0e288e3c37be 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -258,23 +258,6 @@ static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ -#ifdef CONFIG_SUSPEND_NVS -extern int suspend_nvs_register(unsigned long start, unsigned long size); -extern int suspend_nvs_alloc(void); -extern void suspend_nvs_free(void); -extern int suspend_nvs_save(void); -extern void suspend_nvs_restore(void); -#else /* CONFIG_SUSPEND_NVS */ -static inline int suspend_nvs_register(unsigned long a, unsigned long b) -{ - return 0; -} -static inline int suspend_nvs_alloc(void) { return 0; } -static inline void suspend_nvs_free(void) {} -static inline int suspend_nvs_save(void) {} -static inline void suspend_nvs_restore(void) {} -#endif /* CONFIG_SUSPEND_NVS */ - #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3ebad38..265729966ece 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,13 
+100,9 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n -config SUSPEND_NVS - bool - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE - select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -140,7 +136,6 @@ config HIBERNATION depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS - select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185d..120a15823325 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 57c6fabbb6b6..000000000000 --- a/kernel/power/nvs.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * suspend and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * suspend_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int suspend_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? 
size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * suspend_nvs_free - free data pages allocated for saving NVS regions - */ -void suspend_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * suspend_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int suspend_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - suspend_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * suspend_nvs_save - save NVS memory regions - */ -int suspend_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - if (!entry->kaddr) { - suspend_nvs_free(); - return -ENOMEM; - } - memcpy(entry->data, entry->kaddr, entry->size); - } - - return 0; -} - -/** - * suspend_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void suspend_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} -- cgit v1.2.3-71-gd317 From 6c23a9681c0fe7fb7dd331b39dda11926f43746e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 7 Jan 2011 08:43:37 +0100 Subject: block: add internal hd part table references We can't use krefs since it's apparently restricted to very basic reference counting. This reverts commit e4a683c8. Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- block/blk-merge.c | 2 +- block/genhd.c | 3 ++- fs/partitions/check.c | 8 +++----- include/linux/genhd.h | 27 +++++++++++++++++++++++++-- include/linux/kref.h | 1 - lib/kref.c | 12 ------------ 7 files changed, 34 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 500c080a6a6b..2f4002f79a24 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -70,7 +70,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); } else { part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!kref_test_and_get(&part->ref)) { + if (!hd_struct_try_get(part)) { /* * The partition is already being removed, * the request will be accounted on the disk only @@ -80,7 +80,7 @@ static void drive_stat_acct(struct request *rq, int new_io) * it as any other partition. 
*/ part = &rq->rq_disk->part0; - kref_get(&part->ref); + hd_struct_get(part); } part_round_stats(cpu, part); part_inc_in_flight(part, rw); @@ -1818,7 +1818,7 @@ static void blk_account_io_done(struct request *req) part_round_stats(cpu, part); part_dec_in_flight(part, rw); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index b06b83b89d89..00b7d31b38a2 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -356,7 +356,7 @@ static void blk_account_io_merge(struct request *req) part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 85c150598830..399d37ec7412 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,7 +1192,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; - kref_init(&disk->part0.ref); + + hd_ref_init(&disk->part0); disk->minors = minors; rand_initialize_disk(disk); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 48209f58522b..011520df71ae 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -381,10 +381,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } -void __delete_partition(struct kref *ref) +void __delete_partition(struct hd_struct *part) { - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); } @@ -406,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); } static ssize_t whole_disk_show(struct device *dev, @@ -505,7 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - kref_init(&p->ref); + hd_ref_init(p); return p; out_free_info: diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 2ba2792a3dd4..2d0468145967 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -115,8 +115,8 @@ struct hd_struct { #else struct disk_stats dkstats; #endif + atomic_t ref; struct rcu_head rcu_head; - struct kref ref; }; #define GENHD_FL_REMOVABLE 1 @@ -584,7 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); -extern void __delete_partition(struct kref *ref); +extern void __delete_partition(struct hd_struct *); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); @@ -613,6 +613,29 @@ extern ssize_t part_fail_store(struct device *dev, const char *buf, size_t count); #endif /* CONFIG_FAIL_MAKE_REQUEST */ +static inline void hd_ref_init(struct hd_struct *part) +{ + atomic_set(&part->ref, 1); + smp_mb(); +} + +static inline void hd_struct_get(struct hd_struct *part) +{ + atomic_inc(&part->ref); + smp_mb__after_atomic_inc(); +} + +static inline int hd_struct_try_get(struct hd_struct *part) +{ + return atomic_inc_not_zero(&part->ref); +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + if (atomic_dec_and_test(&part->ref)) + __delete_partition(part); +} + #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } diff --git a/include/linux/kref.h b/include/linux/kref.h index 90b9e44abf54..6cc38fc07ab7 100644 --- 
a/include/linux/kref.h +++ b/include/linux/kref.h @@ -23,7 +23,6 @@ struct kref { void kref_init(struct kref *kref); void kref_get(struct kref *kref); -int kref_test_and_get(struct kref *kref); int kref_put(struct kref *kref, void (*release) (struct kref *kref)); #endif /* _KREF_H_ */ diff --git a/lib/kref.c b/lib/kref.c index e7a6e1067122..d3d227a08a4b 100644 --- a/lib/kref.c +++ b/lib/kref.c @@ -36,18 +36,6 @@ void kref_get(struct kref *kref) smp_mb__after_atomic_inc(); } -/** - * kref_test_and_get - increment refcount for object only if refcount is not - * zero. - * @kref: object. - * - * Return non-zero if the refcount was incremented, 0 otherwise - */ -int kref_test_and_get(struct kref *kref) -{ - return atomic_inc_not_zero(&kref->refcount); -} - /** * kref_put - decrement refcount for object. * @kref: object. -- cgit v1.2.3-71-gd317 From ca86828ccd3128513f6d4e200b437deac95408db Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 6 Jan 2011 17:51:07 +0100 Subject: x86, AMD, PCI: Add AMD northbridge PCI device id for CPU families 12h and 14h This patch adds the PCI northbridge device id for AMD CPU families 12h and 14h. Both families have implemented the same PCI northbridge device. There are some future use cases that use this PCI device and we would like to clarify its naming. Signed-off-by: Robert Richter Cc: xen-devel@lists.xensource.com Cc: Keir Fraser Cc: Jan Beulich LKML-Reference: <20110106165107.GL4739@erda.amd.com> Signed-off-by: Ingo Molnar --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index cb845c16ad7d..dd7d4e20d39b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -518,6 +518,7 @@ #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 #define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 #define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603 +#define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 #define PCI_DEVICE_ID_AMD_SCSI 0x2020 -- cgit v1.2.3-71-gd317 From c97415a72521071c235e0879f9a600014afd87b1 Mon Sep 17 00:00:00 2001 From: Stefan Achatz Date: Fri, 26 Nov 2010 19:57:29 +0000 Subject: sysfs: Introducing binary attributes for struct class Added dev_bin_attrs to struct class similar to existing dev_attrs. 
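As a hypothetical usage sketch (not part of this patch), a class driver could now attach binary attributes to every device it registers, analogous to dev_attrs; the "example" names and the stub callback below are illustrative only:

#include <linux/device.h>

static ssize_t example_blob_read(struct file *filp, struct kobject *kobj,
				 struct bin_attribute *attr,
				 char *buf, loff_t off, size_t count)
{
	/* Stub: a real driver would copy device data into buf here. */
	return 0;
}

static struct bin_attribute example_dev_bin_attrs[] = {
	{
		.attr	= { .name = "blob", .mode = 0444 },
		.size	= 64,
		.read	= example_blob_read,
	},
	{ }	/* entry with a NULL name terminates the add/remove loops */
};

static struct class example_class = {
	.name		= "example",
	.dev_bin_attrs	= example_dev_bin_attrs,
};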
Signed-off-by: Stefan Achatz Acked-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- drivers/base/core.c | 41 +++++++++++++++++++++++++++++++++++++++-- include/linux/device.h | 1 + 2 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 6ed645411c40..761359261589 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -338,6 +338,35 @@ static void device_remove_attributes(struct device *dev, device_remove_file(dev, &attrs[i]); } +static int device_add_bin_attributes(struct device *dev, + struct bin_attribute *attrs) +{ + int error = 0; + int i; + + if (attrs) { + for (i = 0; attr_name(attrs[i]); i++) { + error = device_create_bin_file(dev, &attrs[i]); + if (error) + break; + } + if (error) + while (--i >= 0) + device_remove_bin_file(dev, &attrs[i]); + } + return error; +} + +static void device_remove_bin_attributes(struct device *dev, + struct bin_attribute *attrs) +{ + int i; + + if (attrs) + for (i = 0; attr_name(attrs[i]); i++) + device_remove_bin_file(dev, &attrs[i]); +} + static int device_add_groups(struct device *dev, const struct attribute_group **groups) { @@ -378,12 +407,15 @@ static int device_add_attrs(struct device *dev) error = device_add_attributes(dev, class->dev_attrs); if (error) return error; + error = device_add_bin_attributes(dev, class->dev_bin_attrs); + if (error) + goto err_remove_class_attrs; } if (type) { error = device_add_groups(dev, type->groups); if (error) - goto err_remove_class_attrs; + goto err_remove_class_bin_attrs; } error = device_add_groups(dev, dev->groups); @@ -395,6 +427,9 @@ static int device_add_attrs(struct device *dev) err_remove_type_groups: if (type) device_remove_groups(dev, type->groups); + err_remove_class_bin_attrs: + if (class) + device_remove_bin_attributes(dev, class->dev_bin_attrs); err_remove_class_attrs: if (class) device_remove_attributes(dev, class->dev_attrs); @@ -412,8 +447,10 @@ static void device_remove_attrs(struct device *dev) if (type) device_remove_groups(dev, type->groups); - if (class) + if (class) { device_remove_attributes(dev, class->dev_attrs); + device_remove_bin_attributes(dev, class->dev_bin_attrs); + } } diff --git a/include/linux/device.h b/include/linux/device.h index dd4895313468..032bdb5406a2 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -197,6 +197,7 @@ struct class { struct class_attribute *class_attrs; struct device_attribute *dev_attrs; + struct bin_attribute *dev_bin_attrs; struct kobject *dev_kobj; int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); -- cgit v1.2.3-71-gd317 From ec6e7c3ae39cb1dc279b5274aaaadd09ff8e224b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 6 Jan 2011 13:45:32 -0500 Subject: tracing/trivial: Add missing comma in TRACE_EVENT comment Add missing comma within the TRACE_EVENT() example in tracepoint.h. 
Signed-off-by: Mathieu Desnoyers LKML-Reference: <20110106184532.GA2526@Krystal> CC: Frederic Weisbecker CC: Ingo Molnar CC: Thomas Gleixner Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d3e4f87e95c0..899103c8f6f2 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -326,7 +326,7 @@ do_trace: \ * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; * __entry->next_prio = next->prio; - * ) + * ), * * * * * Formatted output of a trace record via TP_printk(). -- cgit v1.2.3-71-gd317 From bd1c8b22b7b81c6f6c4f5c19cb2387da3d02fb0f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 4 Jan 2011 16:06:09 +0800 Subject: tracepoint: Add __rcu annotation Add __rcu annotation to : (struct tracepoint)->funcs Acked-by: Mathieu Desnoyers Signed-off-by: Lai Jiangshan LKML-Reference: <4D22D4F1.50505@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 899103c8f6f2..c6814616653b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -32,7 +32,7 @@ struct tracepoint { int state; /* State. */ void (*regfunc)(void); void (*unregfunc)(void); - struct tracepoint_func *funcs; + struct tracepoint_func __rcu *funcs; } __attribute__((aligned(32))); /* * Aligned on 32 bytes because it is * globally visible and gcc happily -- cgit v1.2.3-71-gd317 From 2d75af2f2a7a6103a6d539a492fe81deacabde44 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 7 Jan 2011 13:36:58 -0500 Subject: dynamic debug: Fix build issue with older gcc On older gcc (3.3) dynamic debug fails to compile: include/net/inet_connection_sock.h: In function `inet_csk_reset_xmit_timer': include/net/inet_connection_sock.h:236: error: duplicate label declaration `do_printk' include/net/inet_connection_sock.h:219: error: this is a previous declaration include/net/inet_connection_sock.h:236: error: duplicate label declaration `out' include/net/inet_connection_sock.h:219: error: this is a previous declaration include/net/inet_connection_sock.h:236: error: duplicate label `do_printk' include/net/inet_connection_sock.h:236: error: duplicate label `out' Fix, by reverting the usage of JUMP_LABEL() in dynamic debug for now. Cc: Reported-by: Tetsuo Handa Tested-by: Tetsuo Handa Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt --- include/linux/dynamic_debug.h | 18 ++++-------------- lib/dynamic_debug.c | 9 ++++----- 2 files changed, 8 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index a90b3892074a..1c70028f81f9 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, extern int ddebug_remove_module(const char *mod_name); #define dynamic_pr_debug(fmt, ...) 
do { \ - __label__ do_printk; \ - __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - JUMP_LABEL(&descriptor.enabled, do_printk); \ - goto out; \ -do_printk: \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -out: ; \ + if (unlikely(descriptor.enabled)) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #define dynamic_dev_dbg(dev, fmt, ...) do { \ - __label__ do_printk; \ - __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - JUMP_LABEL(&descriptor.enabled, do_printk); \ - goto out; \ -do_printk: \ - dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ -out: ; \ + if (unlikely(descriptor.enabled)) \ + dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ } while (0) #else diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 3094318bfea7..b335acb43be2 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -141,11 +141,10 @@ static void ddebug_change(const struct ddebug_query *query, else if (!dp->flags) dt->num_enabled++; dp->flags = newflags; - if (newflags) { - jump_label_enable(&dp->enabled); - } else { - jump_label_disable(&dp->enabled); - } + if (newflags) + dp->enabled = 1; + else + dp->enabled = 0; if (verbose) printk(KERN_INFO "ddebug: changed %s:%d [%s]%s %s\n", -- cgit v1.2.3-71-gd317 From 26daa1ed40c6b31b4220581431982814c47c608a Mon Sep 17 00:00:00 2001 From: Jennifer Li Date: Wed, 17 Nov 2010 23:01:59 -0500 Subject: mmc: sdhci: Disable ADMA on some O2Micro SD/MMC parts. This patch disables the broken ADMA on selected O2Micro devices. Signed-off-by: Jennifer Li Reviewed-by: Chris Ball Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci-pci.c | 112 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 5 ++ 2 files changed, 117 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index 3d9c2460d437..831cf91b644a 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -176,6 +176,74 @@ static const struct sdhci_pci_fixes sdhci_intel_mfd_emmc_sdio = { .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, }; +/* O2Micro extra registers */ +#define O2_SD_LOCK_WP 0xD3 +#define O2_SD_MULTI_VCC3V 0xEE +#define O2_SD_CLKREQ 0xEC +#define O2_SD_CAPS 0xE0 +#define O2_SD_ADMA1 0xE2 +#define O2_SD_ADMA2 0xE7 +#define O2_SD_INF_MOD 0xF1 + +static int o2_probe(struct sdhci_pci_chip *chip) +{ + int ret; + u8 scratch; + + switch (chip->pdev->device) { + case PCI_DEVICE_ID_O2_8220: + case PCI_DEVICE_ID_O2_8221: + case PCI_DEVICE_ID_O2_8320: + case PCI_DEVICE_ID_O2_8321: + /* This extra setup is required due to broken ADMA. */ + ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); + if (ret) + return ret; + scratch &= 0x7f; + pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); + + /* Set Multi 3 to VCC3V# */ + pci_write_config_byte(chip->pdev, O2_SD_MULTI_VCC3V, 0x08); + + /* Disable CLK_REQ# support after media DET */ + ret = pci_read_config_byte(chip->pdev, O2_SD_CLKREQ, &scratch); + if (ret) + return ret; + scratch |= 0x20; + pci_write_config_byte(chip->pdev, O2_SD_CLKREQ, scratch); + + /* Choose capabilities, enable SDMA. We have to write 0x01 + * to the capabilities register first to unlock it. 
+ */ + ret = pci_read_config_byte(chip->pdev, O2_SD_CAPS, &scratch); + if (ret) + return ret; + scratch |= 0x01; + pci_write_config_byte(chip->pdev, O2_SD_CAPS, scratch); + pci_write_config_byte(chip->pdev, O2_SD_CAPS, 0x73); + + /* Disable ADMA1/2 */ + pci_write_config_byte(chip->pdev, O2_SD_ADMA1, 0x39); + pci_write_config_byte(chip->pdev, O2_SD_ADMA2, 0x08); + + /* Disable the infinite transfer mode */ + ret = pci_read_config_byte(chip->pdev, O2_SD_INF_MOD, &scratch); + if (ret) + return ret; + scratch |= 0x08; + pci_write_config_byte(chip->pdev, O2_SD_INF_MOD, scratch); + + /* Lock WP */ + ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); + if (ret) + return ret; + scratch |= 0x80; + pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); + } + + return 0; +} + static int jmicron_pmos(struct sdhci_pci_chip *chip, int on) { u8 scratch; @@ -339,6 +407,10 @@ static int jmicron_resume(struct sdhci_pci_chip *chip) return 0; } +static const struct sdhci_pci_fixes sdhci_o2 = { + .probe = o2_probe, +}; + static const struct sdhci_pci_fixes sdhci_jmicron = { .probe = jmicron_probe, @@ -589,6 +661,46 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc_sdio, }, + { + .vendor = PCI_VENDOR_ID_O2, + .device = PCI_DEVICE_ID_O2_8120, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_o2, + }, + + { + .vendor = PCI_VENDOR_ID_O2, + .device = PCI_DEVICE_ID_O2_8220, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_o2, + }, + + { + .vendor = PCI_VENDOR_ID_O2, + .device = PCI_DEVICE_ID_O2_8221, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_o2, + }, + + { + .vendor = PCI_VENDOR_ID_O2, + .device = PCI_DEVICE_ID_O2_8320, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_o2, + }, + + { + .vendor = PCI_VENDOR_ID_O2, + .device = PCI_DEVICE_ID_O2_8321, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_o2, + }, + { /* Generic SD host controller */ PCI_DEVICE_CLASS((PCI_CLASS_SYSTEM_SDHCI << 8), 0xFFFF00) }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index cb845c16ad7d..7a58875cdad5 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1650,6 +1650,11 @@ #define PCI_DEVICE_ID_O2_6836 0x6836 #define PCI_DEVICE_ID_O2_6812 0x6872 #define PCI_DEVICE_ID_O2_6933 0x6933 +#define PCI_DEVICE_ID_O2_8120 0x8120 +#define PCI_DEVICE_ID_O2_8220 0x8220 +#define PCI_DEVICE_ID_O2_8221 0x8221 +#define PCI_DEVICE_ID_O2_8320 0x8320 +#define PCI_DEVICE_ID_O2_8321 0x8321 #define PCI_VENDOR_ID_3DFX 0x121a #define PCI_DEVICE_ID_3DFX_VOODOO 0x0001 -- cgit v1.2.3-71-gd317 From 04566831a703ae3ef4b49a2deae261c9ed26e020 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 8 Nov 2010 21:36:50 -0500 Subject: mmc: Aggressive clock gating framework This patch modifies the MMC core code to optionally call the set_ios() operation on the driver with the clock frequency set to 0 (gate) after a grace period of at least 8 MCLK cycles, then restore it (ungate) before any new request. This gives the driver the option to shut down the MCI clock to the MMC/SD card when the clock frequency is 0, i.e. the core has stated that the MCI clock does not need to be generated. It is inspired by existing clock gating code found in the OMAP and Atmel drivers and brings this up to the host abstraction. 
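As a purely illustrative sketch (not part of this patch), a host driver's set_ios() callback might honour the gating convention roughly as follows; the foo_* names, the clk handle and the divider programming are hypothetical:

#include <linux/clk.h>
#include <linux/mmc/host.h>

struct foo_host {
	struct clk	*clk;		/* hypothetical MCI block clock */
	bool		clk_on;
};

static void foo_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
{
	struct foo_host *host = mmc_priv(mmc);

	if (ios->clock == 0) {
		/* The core gated us: shut the MCI block clock down. */
		if (host->clk_on) {
			clk_disable(host->clk);
			host->clk_on = false;
		}
	} else {
		/* Ungate before programming the requested frequency. */
		if (!host->clk_on) {
			clk_enable(host->clk);
			host->clk_on = true;
		}
		/* ...program the controller divider for ios->clock here... */
	}
}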
Gating is performed before and after any MMC request. This patchset implements this for the MMCI/PL180 MMC/SD host controller, but it should be simple to switch OMAP/Atmel over to using this instead. mmc_set_{gated,ungated}() add variable protection to the state holders for the clock gating code. This is particularly important when ordinary .set_ios() calls would race with the .set_ios() call resulting from a delayed gate operation. Signed-off-by: Linus Walleij Reviewed-by: Chris Ball Tested-by: Chris Ball Signed-off-by: Chris Ball --- drivers/mmc/core/Kconfig | 11 +++ drivers/mmc/core/core.c | 62 +++++++++++++- drivers/mmc/core/core.h | 3 + drivers/mmc/core/debugfs.c | 5 ++ drivers/mmc/core/host.c | 205 ++++++++++++++++++++++++++++++++++++++++++++- drivers/mmc/core/host.h | 21 +++++ include/linux/mmc/host.h | 10 +++ 7 files changed, 315 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/Kconfig b/drivers/mmc/core/Kconfig index bb22ffd76ef8..ef103871517f 100644 --- a/drivers/mmc/core/Kconfig +++ b/drivers/mmc/core/Kconfig @@ -16,3 +16,14 @@ config MMC_UNSAFE_RESUME This option sets a default which can be overridden by the module parameter "removable=0" or "removable=1". + +config MMC_CLKGATE + bool "MMC host clock gating (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + This will attempt to aggressively gate the clock to the MMC card. + This is done to save power due to gating off the logic and bus + noise when the MMC card is not in use. Your host driver has to + support handling this in order for it to be of any use. + + If unsure, say N. diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index a3a780faf85a..722af2dce3bb 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -130,6 +130,8 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) if (mrq->done) mrq->done(mrq); + + mmc_host_clk_gate(host); } } @@ -190,6 +192,7 @@ mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) mrq->stop->mrq = mrq; } } + mmc_host_clk_ungate(host); host->ops->request(host, mrq); } @@ -296,7 +299,7 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card) timeout_us = data->timeout_ns / 1000; timeout_us += data->timeout_clks * 1000 / - (card->host->ios.clock / 1000); + (mmc_host_clk_rate(card->host) / 1000); if (data->flags & MMC_DATA_WRITE) /* @@ -614,6 +617,8 @@ static inline void mmc_set_ios(struct mmc_host *host) ios->power_mode, ios->chip_select, ios->vdd, ios->bus_width, ios->timing); + if (ios->clock > 0) + mmc_set_ungated(host); host->ops->set_ios(host, ios); } @@ -641,6 +646,61 @@ void mmc_set_clock(struct mmc_host *host, unsigned int hz) mmc_set_ios(host); } +#ifdef CONFIG_MMC_CLKGATE +/* + * This gates the clock by setting it to 0 Hz. + */ +void mmc_gate_clock(struct mmc_host *host) +{ + unsigned long flags; + + spin_lock_irqsave(&host->clk_lock, flags); + host->clk_old = host->ios.clock; + host->ios.clock = 0; + host->clk_gated = true; + spin_unlock_irqrestore(&host->clk_lock, flags); + mmc_set_ios(host); +} + +/* + * This restores the clock from gating by using the cached + * clock value. + */ +void mmc_ungate_clock(struct mmc_host *host) +{ + /* + * We should previously have gated the clock, so the clock shall + * be 0 here! The clock may however be 0 during initialization, + * when some request operations are performed before setting + * the frequency. When ungate is requested in that situation + * we just ignore the call. 
+ */ + if (host->clk_old) { + BUG_ON(host->ios.clock); + /* This call will also set host->clk_gated to false */ + mmc_set_clock(host, host->clk_old); + } +} + +void mmc_set_ungated(struct mmc_host *host) +{ + unsigned long flags; + + /* + * We've been given a new frequency while the clock is gated, + * so make sure we regard this as ungating it. + */ + spin_lock_irqsave(&host->clk_lock, flags); + host->clk_gated = false; + spin_unlock_irqrestore(&host->clk_lock, flags); +} + +#else +void mmc_set_ungated(struct mmc_host *host) +{ +} +#endif + /* * Change the bus mode (open drain/push-pull) of a host. */ diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 77240cd11bcf..026c975b99a9 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -33,6 +33,9 @@ void mmc_init_erase(struct mmc_card *card); void mmc_set_chip_select(struct mmc_host *host, int mode); void mmc_set_clock(struct mmc_host *host, unsigned int hz); +void mmc_gate_clock(struct mmc_host *host); +void mmc_ungate_clock(struct mmc_host *host); +void mmc_set_ungated(struct mmc_host *host); void mmc_set_bus_mode(struct mmc_host *host, unsigned int mode); void mmc_set_bus_width(struct mmc_host *host, unsigned int width); void mmc_set_bus_width_ddr(struct mmc_host *host, unsigned int width, diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index eed1405fd742..998797ed67a6 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -183,6 +183,11 @@ void mmc_add_host_debugfs(struct mmc_host *host) &mmc_clock_fops)) goto err_node; +#ifdef CONFIG_MMC_CLKGATE + if (!debugfs_create_u32("clk_delay", (S_IRUSR | S_IWUSR), + root, &host->clk_delay)) + goto err_node; +#endif return; err_node: diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 10b8af27e03a..92e33703e437 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Russell King, All Rights Reserved. * Copyright (C) 2007-2008 Pierre Ossman + * Copyright (C) 2010 Linus Walleij * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,6 +21,7 @@ #include #include +#include #include "core.h" #include "host.h" @@ -50,6 +52,204 @@ void mmc_unregister_host_class(void) static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); +#ifdef CONFIG_MMC_CLKGATE + +/* + * Enabling clock gating will make the core call out to the host + * once up and once down when it performs a request or card operation + * intermingled in any fashion. The driver will see this through + * set_ios() operations with ios.clock field set to 0 to gate (disable) + * the block clock, and to the old frequency to enable it again. + */ +static void mmc_host_clk_gate_delayed(struct mmc_host *host) +{ + unsigned long tick_ns; + unsigned long freq = host->ios.clock; + unsigned long flags; + + if (!freq) { + pr_debug("%s: frequency set to 0 in disable function, " + "this means the clock is already disabled.\n", + mmc_hostname(host)); + return; + } + /* + * New requests may have appeared while we were scheduling, + * then there is no reason to delay the check before + * clk_disable(). + */ + spin_lock_irqsave(&host->clk_lock, flags); + + /* + * Delay n bus cycles (at least 8 from MMC spec) before attempting + * to disable the MCI block clock. The reference count may have + * gone up again after this delay due to rescheduling! 
+ */ + if (!host->clk_requests) { + spin_unlock_irqrestore(&host->clk_lock, flags); + tick_ns = DIV_ROUND_UP(1000000000, freq); + ndelay(host->clk_delay * tick_ns); + } else { + /* New users appeared while waiting for this work */ + spin_unlock_irqrestore(&host->clk_lock, flags); + return; + } + mutex_lock(&host->clk_gate_mutex); + spin_lock_irqsave(&host->clk_lock, flags); + if (!host->clk_requests) { + spin_unlock_irqrestore(&host->clk_lock, flags); + /* This will set host->ios.clock to 0 */ + mmc_gate_clock(host); + spin_lock_irqsave(&host->clk_lock, flags); + pr_debug("%s: gated MCI clock\n", mmc_hostname(host)); + } + spin_unlock_irqrestore(&host->clk_lock, flags); + mutex_unlock(&host->clk_gate_mutex); +} + +/* + * Internal work. Work to disable the clock at some later point. + */ +static void mmc_host_clk_gate_work(struct work_struct *work) +{ + struct mmc_host *host = container_of(work, struct mmc_host, + clk_gate_work); + + mmc_host_clk_gate_delayed(host); +} + +/** + * mmc_host_clk_ungate - ungate hardware MCI clocks + * @host: host to ungate. + * + * Makes sure the host ios.clock is restored to a non-zero value + * past this call. Increase clock reference count and ungate clock + * if we're the first user. + */ +void mmc_host_clk_ungate(struct mmc_host *host) +{ + unsigned long flags; + + mutex_lock(&host->clk_gate_mutex); + spin_lock_irqsave(&host->clk_lock, flags); + if (host->clk_gated) { + spin_unlock_irqrestore(&host->clk_lock, flags); + mmc_ungate_clock(host); + spin_lock_irqsave(&host->clk_lock, flags); + pr_debug("%s: ungated MCI clock\n", mmc_hostname(host)); + } + host->clk_requests++; + spin_unlock_irqrestore(&host->clk_lock, flags); + mutex_unlock(&host->clk_gate_mutex); +} + +/** + * mmc_host_may_gate_card - check if this card may be gated + * @card: card to check. + */ +static bool mmc_host_may_gate_card(struct mmc_card *card) +{ + /* If there is no card we may gate it */ + if (!card) + return true; + /* + * Don't gate SDIO cards! These need to be clocked at all times + * since they may be independent systems generating interrupts + * and other events. The clock requests counter from the core will + * go down to zero since the core does not need it, but we will not + * gate the clock, because there is somebody out there that may still + * be using it. + */ + if (mmc_card_sdio(card)) + return false; + + return true; +} + +/** + * mmc_host_clk_gate - gate off hardware MCI clocks + * @host: host to gate. + * + * Calls the host driver with ios.clock set to zero as often as possible + * in order to gate off hardware MCI clocks. Decrease clock reference + * count and schedule disabling of clock. + */ +void mmc_host_clk_gate(struct mmc_host *host) +{ + unsigned long flags; + + spin_lock_irqsave(&host->clk_lock, flags); + host->clk_requests--; + if (mmc_host_may_gate_card(host->card) && + !host->clk_requests) + schedule_work(&host->clk_gate_work); + spin_unlock_irqrestore(&host->clk_lock, flags); +} + +/** + * mmc_host_clk_rate - get current clock frequency setting + * @host: host to get the clock frequency for. + * + * Returns current clock frequency regardless of gating. 
+ */ +unsigned int mmc_host_clk_rate(struct mmc_host *host) +{ + unsigned long freq; + unsigned long flags; + + spin_lock_irqsave(&host->clk_lock, flags); + if (host->clk_gated) + freq = host->clk_old; + else + freq = host->ios.clock; + spin_unlock_irqrestore(&host->clk_lock, flags); + return freq; +} + +/** + * mmc_host_clk_init - set up clock gating code + * @host: host with potential clock to control + */ +static inline void mmc_host_clk_init(struct mmc_host *host) +{ + host->clk_requests = 0; + /* Hold MCI clock for 8 cycles by default */ + host->clk_delay = 8; + host->clk_gated = false; + INIT_WORK(&host->clk_gate_work, mmc_host_clk_gate_work); + spin_lock_init(&host->clk_lock); + mutex_init(&host->clk_gate_mutex); +} + +/** + * mmc_host_clk_exit - shut down clock gating code + * @host: host with potential clock to control + */ +static inline void mmc_host_clk_exit(struct mmc_host *host) +{ + /* + * Wait for any outstanding gate and then make sure we're + * ungated before exiting. + */ + if (cancel_work_sync(&host->clk_gate_work)) + mmc_host_clk_gate_delayed(host); + if (host->clk_gated) + mmc_host_clk_ungate(host); + BUG_ON(host->clk_requests > 0); +} + +#else + +static inline void mmc_host_clk_init(struct mmc_host *host) +{ +} + +static inline void mmc_host_clk_exit(struct mmc_host *host) +{ +} + +#endif + /** * mmc_alloc_host - initialise the per-host structure. * @extra: sizeof private data structure @@ -82,6 +282,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->class_dev.class = &mmc_host_class; device_initialize(&host->class_dev); + mmc_host_clk_init(host); + spin_lock_init(&host->lock); init_waitqueue_head(&host->wq); INIT_DELAYED_WORK(&host->detect, mmc_rescan); @@ -163,6 +365,8 @@ void mmc_remove_host(struct mmc_host *host) device_del(&host->class_dev); led_trigger_unregister_simple(host->led); + + mmc_host_clk_exit(host); } EXPORT_SYMBOL(mmc_remove_host); @@ -183,4 +387,3 @@ void mmc_free_host(struct mmc_host *host) } EXPORT_SYMBOL(mmc_free_host); - diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 8c87e1109a34..de199f911928 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -10,10 +10,31 @@ */ #ifndef _MMC_CORE_HOST_H #define _MMC_CORE_HOST_H +#include int mmc_register_host_class(void); void mmc_unregister_host_class(void); +#ifdef CONFIG_MMC_CLKGATE +void mmc_host_clk_ungate(struct mmc_host *host); +void mmc_host_clk_gate(struct mmc_host *host); +unsigned int mmc_host_clk_rate(struct mmc_host *host); + +#else +static inline void mmc_host_clk_ungate(struct mmc_host *host) +{ +} + +static inline void mmc_host_clk_gate(struct mmc_host *host) +{ +} + +static inline unsigned int mmc_host_clk_rate(struct mmc_host *host) +{ + return host->ios.clock; +} +#endif + void mmc_host_deeper_disable(struct work_struct *work); #endif diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 30f6fad99a58..381c77fd4dca 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -172,6 +172,16 @@ struct mmc_host { mmc_pm_flag_t pm_caps; /* supported pm features */ +#ifdef CONFIG_MMC_CLKGATE + int clk_requests; /* internal reference counter */ + unsigned int clk_delay; /* number of MCI clk hold cycles */ + bool clk_gated; /* clock gated */ + struct work_struct clk_gate_work; /* delayed clock gate */ + unsigned int clk_old; /* old clock value cache */ + spinlock_t clk_lock; /* lock for clk fields */ + struct mutex clk_gate_mutex; /* mutex for clock gating */ +#endif + /* host specific block data */ unsigned 
int max_seg_size; /* see blk_queue_max_segment_size */ unsigned short max_segs; /* see blk_queue_max_segments */ -- cgit v1.2.3-71-gd317 From 8f230f454fe04ba326ffaead3a6b88dcf44eaf4b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 8 Dec 2010 10:04:30 +0100 Subject: mmc: Add support for JMicron 388 SD/MMC controller JMicron 388 SD/MMC combo controller supports the 1.8V low-voltage for SD, but MMC doesn't work with the low-voltage, resulting in an error at probing. This patch adds the support for multiple voltage mask per device type, so that SD works with 1.8V while MMC forces 3.3V. Here new ocr_avail_* fields for each device are introduced, so that the actual OCR mask is switched dynamically. Also, the restriction of low-voltage in core/sd.c is removed when the bit is allowed explicitly via ocr_avail_sd mask. This patch was rewritten from scratch based on Aries' original code. Signed-off-by: Aries Lee Signed-off-by: Takashi Iwai Reviewed-by: Chris Ball Signed-off-by: Chris Ball --- drivers/mmc/core/mmc.c | 2 ++ drivers/mmc/core/sd.c | 5 ++++- drivers/mmc/core/sdio.c | 2 ++ drivers/mmc/host/sdhci-pci.c | 47 ++++++++++++++++++++++++++++++++++++++------ drivers/mmc/host/sdhci.c | 23 +++++++++++++++++----- include/linux/mmc/host.h | 3 +++ include/linux/mmc/sdhci.h | 4 ++++ include/linux/pci_ids.h | 2 ++ 8 files changed, 76 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 77f93c3b8808..76bb621e9aa9 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -745,6 +745,8 @@ int mmc_attach_mmc(struct mmc_host *host, u32 ocr) WARN_ON(!host->claimed); mmc_attach_bus_ops(host); + if (host->ocr_avail_mmc) + host->ocr_avail = host->ocr_avail_mmc; /* * We need to get OCR a different way for SPI. diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 49da4dffd28e..de062ebd8b26 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -772,6 +772,8 @@ int mmc_attach_sd(struct mmc_host *host, u32 ocr) WARN_ON(!host->claimed); mmc_sd_attach_bus_ops(host); + if (host->ocr_avail_sd) + host->ocr_avail = host->ocr_avail_sd; /* * We need to get OCR a different way for SPI. @@ -795,7 +797,8 @@ int mmc_attach_sd(struct mmc_host *host, u32 ocr) ocr &= ~0x7F; } - if (ocr & MMC_VDD_165_195) { + if ((ocr & MMC_VDD_165_195) && + !(host->ocr_avail_sd & MMC_VDD_165_195)) { printk(KERN_WARNING "%s: SD card claims to support the " "incompletely defined 'low voltage range'. This " "will be ignored.\n", mmc_hostname(host)); diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index efef5f94ac42..c18810ab6465 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -700,6 +700,8 @@ int mmc_attach_sdio(struct mmc_host *host, u32 ocr) WARN_ON(!host->claimed); mmc_attach_bus(host, &mmc_sdio_ops); + if (host->ocr_avail_sdio) + host->ocr_avail = host->ocr_avail_sdio; /* * Sanity check the voltages that the card claims to diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index 831cf91b644a..d2638ffc4ed2 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -272,6 +272,7 @@ static int jmicron_pmos(struct sdhci_pci_chip *chip, int on) static int jmicron_probe(struct sdhci_pci_chip *chip) { int ret; + u16 mmcdev = 0; if (chip->pdev->revision == 0) { chip->quirks |= SDHCI_QUIRK_32BIT_DMA_ADDR | @@ -293,12 +294,17 @@ static int jmicron_probe(struct sdhci_pci_chip *chip) * 2. The MMC interface has a lower subfunction number * than the SD interface. 
*/ - if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_SD) { + if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_SD) + mmcdev = PCI_DEVICE_ID_JMICRON_JMB38X_MMC; + else if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_SD) + mmcdev = PCI_DEVICE_ID_JMICRON_JMB388_ESD; + + if (mmcdev) { struct pci_dev *sd_dev; sd_dev = NULL; while ((sd_dev = pci_get_device(PCI_VENDOR_ID_JMICRON, - PCI_DEVICE_ID_JMICRON_JMB38X_MMC, sd_dev)) != NULL) { + mmcdev, sd_dev)) != NULL) { if ((PCI_SLOT(chip->pdev->devfn) == PCI_SLOT(sd_dev->devfn)) && (chip->pdev->bus == sd_dev->bus)) @@ -358,11 +364,21 @@ static int jmicron_probe_slot(struct sdhci_pci_slot *slot) slot->host->quirks |= SDHCI_QUIRK_BROKEN_ADMA; } + /* JM388 MMC doesn't support 1.8V while SD supports it */ + if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) { + slot->host->ocr_avail_sd = MMC_VDD_32_33 | MMC_VDD_33_34 | + MMC_VDD_29_30 | MMC_VDD_30_31 | + MMC_VDD_165_195; /* allow 1.8V */ + slot->host->ocr_avail_mmc = MMC_VDD_32_33 | MMC_VDD_33_34 | + MMC_VDD_29_30 | MMC_VDD_30_31; /* no 1.8V for MMC */ + } + /* * The secondary interface requires a bit set to get the * interrupts. */ - if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) + if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || + slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) jmicron_enable_mmc(slot->host, 1); return 0; @@ -373,7 +389,8 @@ static void jmicron_remove_slot(struct sdhci_pci_slot *slot, int dead) if (dead) return; - if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) + if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || + slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) jmicron_enable_mmc(slot->host, 0); } @@ -381,7 +398,8 @@ static int jmicron_suspend(struct sdhci_pci_chip *chip, pm_message_t state) { int i; - if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) { + if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || + chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) { for (i = 0;i < chip->num_slots;i++) jmicron_enable_mmc(chip->slots[i]->host, 0); } @@ -393,7 +411,8 @@ static int jmicron_resume(struct sdhci_pci_chip *chip) { int ret, i; - if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) { + if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC || + chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) { for (i = 0;i < chip->num_slots;i++) jmicron_enable_mmc(chip->slots[i]->host, 1); } @@ -581,6 +600,22 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .driver_data = (kernel_ulong_t)&sdhci_jmicron, }, + { + .vendor = PCI_VENDOR_ID_JMICRON, + .device = PCI_DEVICE_ID_JMICRON_JMB388_SD, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_jmicron, + }, + + { + .vendor = PCI_VENDOR_ID_JMICRON, + .device = PCI_DEVICE_ID_JMICRON_JMB388_ESD, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_jmicron, + }, + { .vendor = PCI_VENDOR_ID_SYSKONNECT, .device = 0x8000, diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 8a74fcbfe13b..55698864c2cd 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1739,7 +1739,7 @@ EXPORT_SYMBOL_GPL(sdhci_alloc_host); int sdhci_add_host(struct sdhci_host *host) { struct mmc_host *mmc; - unsigned int caps; + unsigned int caps, ocr_avail; int ret; WARN_ON(host == NULL); @@ -1893,13 +1893,26 @@ int sdhci_add_host(struct sdhci_host *host) mmc_card_is_removable(mmc)) 
mmc->caps |= MMC_CAP_NEEDS_POLL; - mmc->ocr_avail = 0; + ocr_avail = 0; if (caps & SDHCI_CAN_VDD_330) - mmc->ocr_avail |= MMC_VDD_32_33|MMC_VDD_33_34; + ocr_avail |= MMC_VDD_32_33 | MMC_VDD_33_34; if (caps & SDHCI_CAN_VDD_300) - mmc->ocr_avail |= MMC_VDD_29_30|MMC_VDD_30_31; + ocr_avail |= MMC_VDD_29_30 | MMC_VDD_30_31; if (caps & SDHCI_CAN_VDD_180) - mmc->ocr_avail |= MMC_VDD_165_195; + ocr_avail |= MMC_VDD_165_195; + + mmc->ocr_avail = ocr_avail; + mmc->ocr_avail_sdio = ocr_avail; + if (host->ocr_avail_sdio) + mmc->ocr_avail_sdio &= host->ocr_avail_sdio; + mmc->ocr_avail_sd = ocr_avail; + if (host->ocr_avail_sd) + mmc->ocr_avail_sd &= host->ocr_avail_sd; + else /* normal SD controllers don't support 1.8V */ + mmc->ocr_avail_sd &= ~MMC_VDD_165_195; + mmc->ocr_avail_mmc = ocr_avail; + if (host->ocr_avail_mmc) + mmc->ocr_avail_mmc &= host->ocr_avail_mmc; if (mmc->ocr_avail == 0) { printk(KERN_ERR "%s: Hardware doesn't report any " diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 381c77fd4dca..4a9d9d2589c7 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -131,6 +131,9 @@ struct mmc_host { unsigned int f_max; unsigned int f_init; u32 ocr_avail; + u32 ocr_avail_sdio; /* SDIO-specific OCR */ + u32 ocr_avail_sd; /* SD-specific OCR */ + u32 ocr_avail_mmc; /* MMC-specific OCR */ struct notifier_block pm_notify; #define MMC_VDD_165_195 0x00000080 /* VDD voltage 1.65 - 1.95 */ diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 1fdc673f2396..0d953f513d81 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -139,6 +139,10 @@ struct sdhci_host { unsigned int caps; /* Alternative capabilities */ + unsigned int ocr_avail_sdio; /* OCR bit masks */ + unsigned int ocr_avail_sd; + unsigned int ocr_avail_mmc; + unsigned long private[0] ____cacheline_aligned; }; #endif /* __SDHCI_H */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 7a58875cdad5..2f17b4ccbb58 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2368,6 +2368,8 @@ #define PCI_DEVICE_ID_JMICRON_JMB38X_SD 0x2381 #define PCI_DEVICE_ID_JMICRON_JMB38X_MMC 0x2382 #define PCI_DEVICE_ID_JMICRON_JMB38X_MS 0x2383 +#define PCI_DEVICE_ID_JMICRON_JMB388_SD 0x2391 +#define PCI_DEVICE_ID_JMICRON_JMB388_ESD 0x2392 #define PCI_VENDOR_ID_KORENIX 0x1982 #define PCI_DEVICE_ID_KORENIX_JETCARDF0 0x1600 -- cgit v1.2.3-71-gd317 From 080bc9774b6f1e3866747b18631bad26f47c22ce Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Sun, 28 Nov 2010 07:21:29 +0200 Subject: mmc: sdio: don't reinitialize nonremovable powered-resumed cards Upon system resume, SDIO core must reinitialize cards that were powered off during suspend. If the card had its power kept during suspend (and thus it is 'powered-resumed'), SDIO core performs only a limited reinitializing, mainly needed to make sure that the card wasn't removed/replaced. If a __nonremovable__ card is powered-resumed, we can safely skip the reinitializing phase. Note: 9b966aa (mmc: sdio: fully reconfigure oldcard on resume) removed the bus width reconfiguration since mmc_sdio_init_card already does it. It is brought back now in case mmc_sdio_init_card is skipped. 
Signed-off-by: Ohad Ben-Cohen Signed-off-by: Chris Ball --- drivers/mmc/core/sdio.c | 16 ++++++++++++++-- include/linux/mmc/host.h | 5 +++++ 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index c18810ab6465..82f4b9008987 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -627,15 +627,27 @@ static int mmc_sdio_suspend(struct mmc_host *host) static int mmc_sdio_resume(struct mmc_host *host) { - int i, err; + int i, err = 0; BUG_ON(!host); BUG_ON(!host->card); /* Basic card reinitialization. */ mmc_claim_host(host); - err = mmc_sdio_init_card(host, host->ocr, host->card, + + /* No need to reinitialize powered-resumed nonremovable cards */ + if (mmc_card_is_removable(host) || !mmc_card_is_powered_resumed(host)) + err = mmc_sdio_init_card(host, host->ocr, host->card, (host->pm_flags & MMC_PM_KEEP_POWER)); + else if (mmc_card_is_powered_resumed(host)) { + /* We may have switched to 1-bit mode during suspend */ + err = sdio_enable_4bit_bus(host->card); + if (err > 0) { + mmc_set_bus_width(host, MMC_BUS_WIDTH_4); + err = 0; + } + } + if (!err && host->sdio_irqs) mmc_signal_sdio_irq(host); mmc_release_host(host); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 4a9d9d2589c7..3a85e73a38a9 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -320,5 +320,10 @@ static inline int mmc_card_is_removable(struct mmc_host *host) return !(host->caps & MMC_CAP_NONREMOVABLE) && mmc_assume_removable; } +static inline int mmc_card_is_powered_resumed(struct mmc_host *host) +{ + return host->pm_flags & MMC_PM_KEEP_POWER; +} + #endif -- cgit v1.2.3-71-gd317 From a081748735c5feb96b1365e78a5ff0fb6ca7e3a4 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jan 2011 21:42:42 -0700 Subject: of/flattree: forward declare struct device_node in of_fdt.h This patch forward declares struct device_node to fix a compile error when of_fdt.h is included, but of.h is not. Alternately, including linux/of.h could have been added to of_fdt.h, but that pulls in a lot of unnecessary declarations when only working with the flattened form. Signed-off-by: Grant Likely --- include/linux/of_fdt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index ee96091f7d25..0ef22a1f129e 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -59,6 +59,8 @@ struct boot_param_header { #if defined(CONFIG_OF_FLATTREE) +struct device_node; + /* For scanning an arbitrary device-tree at any time */ extern char *of_fdt_get_string(struct boot_param_header *blob, u32 offset); extern void *of_fdt_get_property(struct boot_param_header *blob, -- cgit v1.2.3-71-gd317 From 22113efd00491310da802f3b1a9a66cfcf415fac Mon Sep 17 00:00:00 2001 From: Aries Lee Date: Wed, 15 Dec 2010 08:14:24 +0100 Subject: mmc: Test bus-width for old MMC devices Some old MMC devices fail with the 4/8 bits the driver tries to use exclusively. This patch adds a test for the given bus setup and falls back to the lower bit mode (until 1-bit mode) when the test fails. 
[Major rework and refactoring by tiwai] [Quirk addition and many fixes by prakity] Signed-off-by: Aries Lee Signed-off-by: Takashi Iwai Signed-off-by: Philip Rakity Tested-by: Philip Rakity Signed-off-by: Chris Ball --- drivers/mmc/core/mmc.c | 76 +++++++++++++++++++++------------- drivers/mmc/core/mmc_ops.c | 101 +++++++++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/mmc_ops.h | 1 + drivers/mmc/host/sdhci.c | 7 +++- drivers/mmc/host/sdhci.h | 1 + include/linux/mmc/host.h | 1 + include/linux/mmc/mmc.h | 2 + 7 files changed, 159 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 76bb621e9aa9..1d8409fcf155 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -534,39 +534,57 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, */ if ((card->csd.mmca_vsn >= CSD_SPEC_VER_4) && (host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA))) { - unsigned ext_csd_bit, bus_width; - - if (host->caps & MMC_CAP_8_BIT_DATA) { - if (ddr) - ext_csd_bit = EXT_CSD_DDR_BUS_WIDTH_8; - else - ext_csd_bit = EXT_CSD_BUS_WIDTH_8; - bus_width = MMC_BUS_WIDTH_8; - } else { - if (ddr) - ext_csd_bit = EXT_CSD_DDR_BUS_WIDTH_4; - else - ext_csd_bit = EXT_CSD_BUS_WIDTH_4; - bus_width = MMC_BUS_WIDTH_4; + static unsigned ext_csd_bits[][2] = { + { EXT_CSD_BUS_WIDTH_8, EXT_CSD_DDR_BUS_WIDTH_8 }, + { EXT_CSD_BUS_WIDTH_4, EXT_CSD_DDR_BUS_WIDTH_4 }, + { EXT_CSD_BUS_WIDTH_1, EXT_CSD_BUS_WIDTH_1 }, + }; + static unsigned bus_widths[] = { + MMC_BUS_WIDTH_8, + MMC_BUS_WIDTH_4, + MMC_BUS_WIDTH_1 + }; + unsigned idx, bus_width = 0; + + if (host->caps & MMC_CAP_8_BIT_DATA) + idx = 0; + else + idx = 1; + for (; idx < ARRAY_SIZE(bus_widths); idx++) { + bus_width = bus_widths[idx]; + if (bus_width == MMC_BUS_WIDTH_1) + ddr = 0; /* no DDR for 1-bit width */ + err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_BUS_WIDTH, + ext_csd_bits[idx][0]); + if (!err) { + /* + * If controller can't handle bus width test, + * use the highest bus width to maintain + * compatibility with previous MMC behavior. 
+ */ + if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST)) + break; + mmc_set_bus_width_ddr(card->host, + bus_width, MMC_SDR_MODE); + err = mmc_bus_test(card, bus_width); + if (!err) + break; + } } - err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_BUS_WIDTH, ext_csd_bit); - - if (err && err != -EBADMSG) - goto free_card; - + if (!err && ddr) { + err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, + EXT_CSD_BUS_WIDTH, + ext_csd_bits[idx][1]); + } if (err) { printk(KERN_WARNING "%s: switch to bus width %d ddr %d " - "failed\n", mmc_hostname(card->host), - 1 << bus_width, ddr); - err = 0; - } else { - if (ddr) - mmc_card_set_ddr_mode(card); - else - ddr = MMC_SDR_MODE; - + "failed\n", mmc_hostname(card->host), + 1 << bus_width, ddr); + goto free_card; + } else if (ddr) { + mmc_card_set_ddr_mode(card); mmc_set_bus_width_ddr(card->host, bus_width, ddr); } } diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 326447c9ede8..60842f878ded 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -462,3 +462,104 @@ int mmc_send_status(struct mmc_card *card, u32 *status) return 0; } +static int +mmc_send_bus_test(struct mmc_card *card, struct mmc_host *host, u8 opcode, + u8 len) +{ + struct mmc_request mrq; + struct mmc_command cmd; + struct mmc_data data; + struct scatterlist sg; + u8 *data_buf; + u8 *test_buf; + int i, err; + static u8 testdata_8bit[8] = { 0x55, 0xaa, 0, 0, 0, 0, 0, 0 }; + static u8 testdata_4bit[4] = { 0x5a, 0, 0, 0 }; + + /* dma onto stack is unsafe/nonportable, but callers to this + * routine normally provide temporary on-stack buffers ... + */ + data_buf = kmalloc(len, GFP_KERNEL); + if (!data_buf) + return -ENOMEM; + + if (len == 8) + test_buf = testdata_8bit; + else if (len == 4) + test_buf = testdata_4bit; + else { + printk(KERN_ERR "%s: Invalid bus_width %d\n", + mmc_hostname(host), len); + kfree(data_buf); + return -EINVAL; + } + + if (opcode == MMC_BUS_TEST_W) + memcpy(data_buf, test_buf, len); + + memset(&mrq, 0, sizeof(struct mmc_request)); + memset(&cmd, 0, sizeof(struct mmc_command)); + memset(&data, 0, sizeof(struct mmc_data)); + + mrq.cmd = &cmd; + mrq.data = &data; + cmd.opcode = opcode; + cmd.arg = 0; + + /* NOTE HACK: the MMC_RSP_SPI_R1 is always correct here, but we + * rely on callers to never use this with "native" calls for reading + * CSD or CID. Native versions of those commands use the R2 type, + * not R1 plus a data block. + */ + cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; + + data.blksz = len; + data.blocks = 1; + if (opcode == MMC_BUS_TEST_R) + data.flags = MMC_DATA_READ; + else + data.flags = MMC_DATA_WRITE; + + data.sg = &sg; + data.sg_len = 1; + sg_init_one(&sg, data_buf, len); + mmc_wait_for_req(host, &mrq); + err = 0; + if (opcode == MMC_BUS_TEST_R) { + for (i = 0; i < len / 4; i++) + if ((test_buf[i] ^ data_buf[i]) != 0xff) { + err = -EIO; + break; + } + } + kfree(data_buf); + + if (cmd.error) + return cmd.error; + if (data.error) + return data.error; + + return err; +} + +int mmc_bus_test(struct mmc_card *card, u8 bus_width) +{ + int err, width; + + if (bus_width == MMC_BUS_WIDTH_8) + width = 8; + else if (bus_width == MMC_BUS_WIDTH_4) + width = 4; + else if (bus_width == MMC_BUS_WIDTH_1) + return 0; /* no need for test */ + else + return -EINVAL; + + /* + * Ignore errors from BUS_TEST_W. BUS_TEST_R will fail if there + * is a problem. This improves chances that the test will work. 
+ */ + mmc_send_bus_test(card, card->host, MMC_BUS_TEST_W, width); + err = mmc_send_bus_test(card, card->host, MMC_BUS_TEST_R, width); + return err; +} diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h index 653eb8e84178..e6d44b8a18db 100644 --- a/drivers/mmc/core/mmc_ops.h +++ b/drivers/mmc/core/mmc_ops.h @@ -26,6 +26,7 @@ int mmc_send_cid(struct mmc_host *host, u32 *cid); int mmc_spi_read_ocr(struct mmc_host *host, int highcap, u32 *ocrp); int mmc_spi_set_crc(struct mmc_host *host, int use_crc); int mmc_card_sleepawake(struct mmc_host *host, int sleep); +int mmc_bus_test(struct mmc_card *card, u8 bus_width); #endif diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 55698864c2cd..d5febe584b05 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -23,6 +23,7 @@ #include +#include #include #include "sdhci.h" @@ -1521,7 +1522,11 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) if (intmask & SDHCI_INT_DATA_TIMEOUT) host->data->error = -ETIMEDOUT; - else if (intmask & (SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_END_BIT)) + else if (intmask & SDHCI_INT_DATA_END_BIT) + host->data->error = -EILSEQ; + else if ((intmask & SDHCI_INT_DATA_CRC) && + SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND)) + != MMC_BUS_TEST_R) host->data->error = -EILSEQ; else if (intmask & SDHCI_INT_ADMA_ERROR) { printk(KERN_ERR "%s: ADMA error\n", mmc_hostname(host->mmc)); diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 1efe7dc5255b..6e0969e40650 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -52,6 +52,7 @@ #define SDHCI_CMD_RESP_SHORT_BUSY 0x03 #define SDHCI_MAKE_CMD(c, f) (((c & 0xff) << 8) | (f & 0xff)) +#define SDHCI_GET_CMD(c) ((c>>8) & 0x3f) #define SDHCI_RESPONSE 0x10 diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 3a85e73a38a9..bcb793ec7374 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -172,6 +172,7 @@ struct mmc_host { #define MMC_CAP_1_2V_DDR (1 << 12) /* can support */ /* DDR mode at 1.2V */ #define MMC_CAP_POWER_OFF_CARD (1 << 13) /* Can power off after boot */ +#define MMC_CAP_BUS_WIDTH_TEST (1 << 14) /* CMD14/CMD19 bus width ok */ mmc_pm_flag_t pm_caps; /* supported pm features */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 956fbd877692..612301f85d14 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -40,7 +40,9 @@ #define MMC_READ_DAT_UNTIL_STOP 11 /* adtc [31:0] dadr R1 */ #define MMC_STOP_TRANSMISSION 12 /* ac R1b */ #define MMC_SEND_STATUS 13 /* ac [31:16] RCA R1 */ +#define MMC_BUS_TEST_R 14 /* adtc R1 */ #define MMC_GO_INACTIVE_STATE 15 /* ac [31:16] RCA */ +#define MMC_BUS_TEST_W 19 /* adtc R1 */ #define MMC_SPI_READ_OCR 58 /* spi spi_R3 */ #define MMC_SPI_CRC_ON_OFF 59 /* spi [0:0] flag spi_R1 */ -- cgit v1.2.3-71-gd317 From 30652aa36b58d57fcc1a0acce51e391bbb6edf5e Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sat, 1 Jan 2011 18:37:32 -0600 Subject: mmc: sdhci: add quirk for max len ADMA descriptors Some controllers misparse segment length 0 as being 0, not 65536. Add a quirk to deal with it. 
Signed-off-by: Olof Johansson Reviewed-by: Chris Ball Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 10 +++++++--- include/linux/mmc/sdhci.h | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index d5febe584b05..9e15f41f87be 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1949,10 +1949,14 @@ int sdhci_add_host(struct sdhci_host *host) * of bytes. When doing hardware scatter/gather, each entry cannot * be larger than 64 KiB though. */ - if (host->flags & SDHCI_USE_ADMA) - mmc->max_seg_size = 65536; - else + if (host->flags & SDHCI_USE_ADMA) { + if (host->quirks & SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC) + mmc->max_seg_size = 65535; + else + mmc->max_seg_size = 65536; + } else { mmc->max_seg_size = mmc->max_req_size; + } /* * Maximum block size. This varies from controller to controller and diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 0d953f513d81..83bd9f76709a 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -83,6 +83,8 @@ struct sdhci_host { #define SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 (1<<28) /* Controller doesn't have HISPD bit field in HI-SPEED SD card */ #define SDHCI_QUIRK_NO_HISPD_BIT (1<<29) +/* Controller treats ADMA descriptors with length 0000h incorrectly */ +#define SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC (1<<30) int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ -- cgit v1.2.3-71-gd317 From f95f3850f7a9e1d49ebc5b6e72e7cc3ec3685b0b Mon Sep 17 00:00:00 2001 From: Will Newton Date: Sun, 2 Jan 2011 01:11:59 -0500 Subject: mmc: dw_mmc: Add Synopsys DesignWare mmc host driver. This adds the mmc host driver for the Synopsys DesignWare mmc host controller, found in a number of embedded SoC designs. Signed-off-by: Will Newton Reviewed-by: Matt Fleming Reviewed-by: Chris Ball Signed-off-by: Chris Ball --- drivers/mmc/host/Kconfig | 16 + drivers/mmc/host/Makefile | 1 + drivers/mmc/host/dw_mmc.c | 1796 ++++++++++++++++++++++++++++++++++++++++++++ drivers/mmc/host/dw_mmc.h | 168 +++++ include/linux/mmc/dw_mmc.h | 217 ++++++ 5 files changed, 2198 insertions(+) create mode 100644 drivers/mmc/host/dw_mmc.c create mode 100644 drivers/mmc/host/dw_mmc.h create mode 100644 include/linux/mmc/dw_mmc.h (limited to 'include/linux') diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 9f47d38dcc7f..5ec02a536c5d 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -479,6 +479,22 @@ config SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND help If you say yes here SD-Cards may work on the EZkit. +config MMC_DW + tristate "Synopsys DesignWare Memory Card Interface" + depends on ARM + help + This selects support for the Synopsys DesignWare Mobile Storage IP + block, this provides host support for SD and MMC interfaces, in both + PIO and external DMA modes. + +config MMC_DW_IDMAC + bool "Internal DMAC interface" + depends on MMC_DW + help + This selects support for the internal DMAC block within the Synopsys + Designware Mobile Storage IP block. This disables the external DMA + interface. 
+ config MMC_SH_MMCIF tristate "SuperH Internal MMCIF support" depends on MMC_BLOCK && (SUPERH || ARCH_SHMOBILE) diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 6d1ff9e27368..e834fb223e9a 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_MMC_TMIO) += tmio_mmc.o obj-$(CONFIG_MMC_CB710) += cb710-mmc.o obj-$(CONFIG_MMC_VIA_SDMMC) += via-sdmmc.o obj-$(CONFIG_SDH_BFIN) += bfin_sdh.o +obj-$(CONFIG_MMC_DW) += dw_mmc.o obj-$(CONFIG_MMC_SH_MMCIF) += sh_mmcif.o obj-$(CONFIG_MMC_JZ4740) += jz4740_mmc.o obj-$(CONFIG_MMC_USHC) += ushc.o diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c new file mode 100644 index 000000000000..2fcc82577c1b --- /dev/null +++ b/drivers/mmc/host/dw_mmc.c @@ -0,0 +1,1796 @@ +/* + * Synopsys DesignWare Multimedia Card Interface driver + * (Based on NXP driver for lpc 31xx) + * + * Copyright (C) 2009 NXP Semiconductors + * Copyright (C) 2009, 2010 Imagination Technologies Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dw_mmc.h" + +/* Common flag combinations */ +#define DW_MCI_DATA_ERROR_FLAGS (SDMMC_INT_DTO | SDMMC_INT_DCRC | \ + SDMMC_INT_HTO | SDMMC_INT_SBE | \ + SDMMC_INT_EBE) +#define DW_MCI_CMD_ERROR_FLAGS (SDMMC_INT_RTO | SDMMC_INT_RCRC | \ + SDMMC_INT_RESP_ERR) +#define DW_MCI_ERROR_FLAGS (DW_MCI_DATA_ERROR_FLAGS | \ + DW_MCI_CMD_ERROR_FLAGS | SDMMC_INT_HLE) +#define DW_MCI_SEND_STATUS 1 +#define DW_MCI_RECV_STATUS 2 +#define DW_MCI_DMA_THRESHOLD 16 + +#ifdef CONFIG_MMC_DW_IDMAC +struct idmac_desc { + u32 des0; /* Control Descriptor */ +#define IDMAC_DES0_DIC BIT(1) +#define IDMAC_DES0_LD BIT(2) +#define IDMAC_DES0_FD BIT(3) +#define IDMAC_DES0_CH BIT(4) +#define IDMAC_DES0_ER BIT(5) +#define IDMAC_DES0_CES BIT(30) +#define IDMAC_DES0_OWN BIT(31) + + u32 des1; /* Buffer sizes */ +#define IDMAC_SET_BUFFER1_SIZE(d, s) \ + ((d)->des1 = ((d)->des1 & 0x03ffc000) | ((s) & 0x3fff)) + + u32 des2; /* buffer 1 physical address */ + + u32 des3; /* buffer 2 physical address */ +}; +#endif /* CONFIG_MMC_DW_IDMAC */ + +/** + * struct dw_mci_slot - MMC slot state + * @mmc: The mmc_host representing this slot. + * @host: The MMC controller this slot is using. + * @ctype: Card type for this slot. + * @mrq: mmc_request currently being processed or waiting to be + * processed, or NULL when the slot is idle. + * @queue_node: List node for placing this node in the @queue list of + * &struct dw_mci. + * @clock: Clock rate configured by set_ios(). Protected by host->lock. + * @flags: Random state bits associated with the slot. + * @id: Number of this slot. + * @last_detect_state: Most recently observed card detect state. 
+ */ +struct dw_mci_slot { + struct mmc_host *mmc; + struct dw_mci *host; + + u32 ctype; + + struct mmc_request *mrq; + struct list_head queue_node; + + unsigned int clock; + unsigned long flags; +#define DW_MMC_CARD_PRESENT 0 +#define DW_MMC_CARD_NEED_INIT 1 + int id; + int last_detect_state; +}; + +#if defined(CONFIG_DEBUG_FS) +static int dw_mci_req_show(struct seq_file *s, void *v) +{ + struct dw_mci_slot *slot = s->private; + struct mmc_request *mrq; + struct mmc_command *cmd; + struct mmc_command *stop; + struct mmc_data *data; + + /* Make sure we get a consistent snapshot */ + spin_lock_bh(&slot->host->lock); + mrq = slot->mrq; + + if (mrq) { + cmd = mrq->cmd; + data = mrq->data; + stop = mrq->stop; + + if (cmd) + seq_printf(s, + "CMD%u(0x%x) flg %x rsp %x %x %x %x err %d\n", + cmd->opcode, cmd->arg, cmd->flags, + cmd->resp[0], cmd->resp[1], cmd->resp[2], + cmd->resp[2], cmd->error); + if (data) + seq_printf(s, "DATA %u / %u * %u flg %x err %d\n", + data->bytes_xfered, data->blocks, + data->blksz, data->flags, data->error); + if (stop) + seq_printf(s, + "CMD%u(0x%x) flg %x rsp %x %x %x %x err %d\n", + stop->opcode, stop->arg, stop->flags, + stop->resp[0], stop->resp[1], stop->resp[2], + stop->resp[2], stop->error); + } + + spin_unlock_bh(&slot->host->lock); + + return 0; +} + +static int dw_mci_req_open(struct inode *inode, struct file *file) +{ + return single_open(file, dw_mci_req_show, inode->i_private); +} + +static const struct file_operations dw_mci_req_fops = { + .owner = THIS_MODULE, + .open = dw_mci_req_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int dw_mci_regs_show(struct seq_file *s, void *v) +{ + seq_printf(s, "STATUS:\t0x%08x\n", SDMMC_STATUS); + seq_printf(s, "RINTSTS:\t0x%08x\n", SDMMC_RINTSTS); + seq_printf(s, "CMD:\t0x%08x\n", SDMMC_CMD); + seq_printf(s, "CTRL:\t0x%08x\n", SDMMC_CTRL); + seq_printf(s, "INTMASK:\t0x%08x\n", SDMMC_INTMASK); + seq_printf(s, "CLKENA:\t0x%08x\n", SDMMC_CLKENA); + + return 0; +} + +static int dw_mci_regs_open(struct inode *inode, struct file *file) +{ + return single_open(file, dw_mci_regs_show, inode->i_private); +} + +static const struct file_operations dw_mci_regs_fops = { + .owner = THIS_MODULE, + .open = dw_mci_regs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void dw_mci_init_debugfs(struct dw_mci_slot *slot) +{ + struct mmc_host *mmc = slot->mmc; + struct dw_mci *host = slot->host; + struct dentry *root; + struct dentry *node; + + root = mmc->debugfs_root; + if (!root) + return; + + node = debugfs_create_file("regs", S_IRUSR, root, host, + &dw_mci_regs_fops); + if (!node) + goto err; + + node = debugfs_create_file("req", S_IRUSR, root, slot, + &dw_mci_req_fops); + if (!node) + goto err; + + node = debugfs_create_u32("state", S_IRUSR, root, (u32 *)&host->state); + if (!node) + goto err; + + node = debugfs_create_x32("pending_events", S_IRUSR, root, + (u32 *)&host->pending_events); + if (!node) + goto err; + + node = debugfs_create_x32("completed_events", S_IRUSR, root, + (u32 *)&host->completed_events); + if (!node) + goto err; + + return; + +err: + dev_err(&mmc->class_dev, "failed to initialize debugfs for slot\n"); +} +#endif /* defined(CONFIG_DEBUG_FS) */ + +static void dw_mci_set_timeout(struct dw_mci *host) +{ + /* timeout (maximum) */ + mci_writel(host, TMOUT, 0xffffffff); +} + +static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd) +{ + struct mmc_data *data; + u32 cmdr; + cmd->error = -EINPROGRESS; + + 
cmdr = cmd->opcode; + + if (cmdr == MMC_STOP_TRANSMISSION) + cmdr |= SDMMC_CMD_STOP; + else + cmdr |= SDMMC_CMD_PRV_DAT_WAIT; + + if (cmd->flags & MMC_RSP_PRESENT) { + /* We expect a response, so set this bit */ + cmdr |= SDMMC_CMD_RESP_EXP; + if (cmd->flags & MMC_RSP_136) + cmdr |= SDMMC_CMD_RESP_LONG; + } + + if (cmd->flags & MMC_RSP_CRC) + cmdr |= SDMMC_CMD_RESP_CRC; + + data = cmd->data; + if (data) { + cmdr |= SDMMC_CMD_DAT_EXP; + if (data->flags & MMC_DATA_STREAM) + cmdr |= SDMMC_CMD_STRM_MODE; + if (data->flags & MMC_DATA_WRITE) + cmdr |= SDMMC_CMD_DAT_WR; + } + + return cmdr; +} + +static void dw_mci_start_command(struct dw_mci *host, + struct mmc_command *cmd, u32 cmd_flags) +{ + host->cmd = cmd; + dev_vdbg(&host->pdev->dev, + "start command: ARGR=0x%08x CMDR=0x%08x\n", + cmd->arg, cmd_flags); + + mci_writel(host, CMDARG, cmd->arg); + wmb(); + + mci_writel(host, CMD, cmd_flags | SDMMC_CMD_START); +} + +static void send_stop_cmd(struct dw_mci *host, struct mmc_data *data) +{ + dw_mci_start_command(host, data->stop, host->stop_cmdr); +} + +/* DMA interface functions */ +static void dw_mci_stop_dma(struct dw_mci *host) +{ + if (host->use_dma) { + host->dma_ops->stop(host); + host->dma_ops->cleanup(host); + } else { + /* Data transfer was stopped by the interrupt handler */ + set_bit(EVENT_XFER_COMPLETE, &host->pending_events); + } +} + +#ifdef CONFIG_MMC_DW_IDMAC +static void dw_mci_dma_cleanup(struct dw_mci *host) +{ + struct mmc_data *data = host->data; + + if (data) + dma_unmap_sg(&host->pdev->dev, data->sg, data->sg_len, + ((data->flags & MMC_DATA_WRITE) + ? DMA_TO_DEVICE : DMA_FROM_DEVICE)); +} + +static void dw_mci_idmac_stop_dma(struct dw_mci *host) +{ + u32 temp; + + /* Disable and reset the IDMAC interface */ + temp = mci_readl(host, CTRL); + temp &= ~SDMMC_CTRL_USE_IDMAC; + temp |= SDMMC_CTRL_DMA_RESET; + mci_writel(host, CTRL, temp); + + /* Stop the IDMAC running */ + temp = mci_readl(host, BMOD); + temp &= ~SDMMC_IDMAC_ENABLE; + mci_writel(host, BMOD, temp); +} + +static void dw_mci_idmac_complete_dma(struct dw_mci *host) +{ + struct mmc_data *data = host->data; + + dev_vdbg(&host->pdev->dev, "DMA complete\n"); + + host->dma_ops->cleanup(host); + + /* + * If the card was removed, data will be NULL. No point in trying to + * send the stop command or waiting for NBUSY in this case. 
+ */ + if (data) { + set_bit(EVENT_XFER_COMPLETE, &host->pending_events); + tasklet_schedule(&host->tasklet); + } +} + +static void dw_mci_translate_sglist(struct dw_mci *host, struct mmc_data *data, + unsigned int sg_len) +{ + int i; + struct idmac_desc *desc = host->sg_cpu; + + for (i = 0; i < sg_len; i++, desc++) { + unsigned int length = sg_dma_len(&data->sg[i]); + u32 mem_addr = sg_dma_address(&data->sg[i]); + + /* Set the OWN bit and disable interrupts for this descriptor */ + desc->des0 = IDMAC_DES0_OWN | IDMAC_DES0_DIC | IDMAC_DES0_CH; + + /* Buffer length */ + IDMAC_SET_BUFFER1_SIZE(desc, length); + + /* Physical address to DMA to/from */ + desc->des2 = mem_addr; + } + + /* Set first descriptor */ + desc = host->sg_cpu; + desc->des0 |= IDMAC_DES0_FD; + + /* Set last descriptor */ + desc = host->sg_cpu + (i - 1) * sizeof(struct idmac_desc); + desc->des0 &= ~(IDMAC_DES0_CH | IDMAC_DES0_DIC); + desc->des0 |= IDMAC_DES0_LD; + + wmb(); +} + +static void dw_mci_idmac_start_dma(struct dw_mci *host, unsigned int sg_len) +{ + u32 temp; + + dw_mci_translate_sglist(host, host->data, sg_len); + + /* Select IDMAC interface */ + temp = mci_readl(host, CTRL); + temp |= SDMMC_CTRL_USE_IDMAC; + mci_writel(host, CTRL, temp); + + wmb(); + + /* Enable the IDMAC */ + temp = mci_readl(host, BMOD); + temp |= SDMMC_IDMAC_ENABLE; + mci_writel(host, BMOD, temp); + + /* Start it running */ + mci_writel(host, PLDMND, 1); +} + +static int dw_mci_idmac_init(struct dw_mci *host) +{ + struct idmac_desc *p; + int i; + + /* Number of descriptors in the ring buffer */ + host->ring_size = PAGE_SIZE / sizeof(struct idmac_desc); + + /* Forward link the descriptor list */ + for (i = 0, p = host->sg_cpu; i < host->ring_size - 1; i++, p++) + p->des3 = host->sg_dma + (sizeof(struct idmac_desc) * (i + 1)); + + /* Set the last descriptor as the end-of-ring descriptor */ + p->des3 = host->sg_dma; + p->des0 = IDMAC_DES0_ER; + + /* Mask out interrupts - get Tx & Rx complete only */ + mci_writel(host, IDINTEN, SDMMC_IDMAC_INT_NI | SDMMC_IDMAC_INT_RI | + SDMMC_IDMAC_INT_TI); + + /* Set the descriptor base address */ + mci_writel(host, DBADDR, host->sg_dma); + return 0; +} + +static struct dw_mci_dma_ops dw_mci_idmac_ops = { + .init = dw_mci_idmac_init, + .start = dw_mci_idmac_start_dma, + .stop = dw_mci_idmac_stop_dma, + .complete = dw_mci_idmac_complete_dma, + .cleanup = dw_mci_dma_cleanup, +}; +#endif /* CONFIG_MMC_DW_IDMAC */ + +static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) +{ + struct scatterlist *sg; + unsigned int i, direction, sg_len; + u32 temp; + + /* If we don't have a channel, we can't do DMA */ + if (!host->use_dma) + return -ENODEV; + + /* + * We don't do DMA on "complex" transfers, i.e. with + * non-word-aligned buffers or lengths. Also, we don't bother + * with all the DMA setup overhead for short transfers. 
+ */ + if (data->blocks * data->blksz < DW_MCI_DMA_THRESHOLD) + return -EINVAL; + if (data->blksz & 3) + return -EINVAL; + + for_each_sg(data->sg, sg, data->sg_len, i) { + if (sg->offset & 3 || sg->length & 3) + return -EINVAL; + } + + if (data->flags & MMC_DATA_READ) + direction = DMA_FROM_DEVICE; + else + direction = DMA_TO_DEVICE; + + sg_len = dma_map_sg(&host->pdev->dev, data->sg, data->sg_len, + direction); + + dev_vdbg(&host->pdev->dev, + "sd sg_cpu: %#lx sg_dma: %#lx sg_len: %d\n", + (unsigned long)host->sg_cpu, (unsigned long)host->sg_dma, + sg_len); + + /* Enable the DMA interface */ + temp = mci_readl(host, CTRL); + temp |= SDMMC_CTRL_DMA_ENABLE; + mci_writel(host, CTRL, temp); + + /* Disable RX/TX IRQs, let DMA handle it */ + temp = mci_readl(host, INTMASK); + temp &= ~(SDMMC_INT_RXDR | SDMMC_INT_TXDR); + mci_writel(host, INTMASK, temp); + + host->dma_ops->start(host, sg_len); + + return 0; +} + +static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data) +{ + u32 temp; + + data->error = -EINPROGRESS; + + WARN_ON(host->data); + host->sg = NULL; + host->data = data; + + if (dw_mci_submit_data_dma(host, data)) { + host->sg = data->sg; + host->pio_offset = 0; + if (data->flags & MMC_DATA_READ) + host->dir_status = DW_MCI_RECV_STATUS; + else + host->dir_status = DW_MCI_SEND_STATUS; + + temp = mci_readl(host, INTMASK); + temp |= SDMMC_INT_TXDR | SDMMC_INT_RXDR; + mci_writel(host, INTMASK, temp); + + temp = mci_readl(host, CTRL); + temp &= ~SDMMC_CTRL_DMA_ENABLE; + mci_writel(host, CTRL, temp); + } +} + +static void mci_send_cmd(struct dw_mci_slot *slot, u32 cmd, u32 arg) +{ + struct dw_mci *host = slot->host; + unsigned long timeout = jiffies + msecs_to_jiffies(500); + unsigned int cmd_status = 0; + + mci_writel(host, CMDARG, arg); + wmb(); + mci_writel(host, CMD, SDMMC_CMD_START | cmd); + + while (time_before(jiffies, timeout)) { + cmd_status = mci_readl(host, CMD); + if (!(cmd_status & SDMMC_CMD_START)) + return; + } + dev_err(&slot->mmc->class_dev, + "Timeout sending command (cmd %#x arg %#x status %#x)\n", + cmd, arg, cmd_status); +} + +static void dw_mci_setup_bus(struct dw_mci_slot *slot) +{ + struct dw_mci *host = slot->host; + u32 div; + + if (slot->clock != host->current_speed) { + if (host->bus_hz % slot->clock) + /* + * move the + 1 after the divide to prevent + * over-clocking the card. + */ + div = ((host->bus_hz / slot->clock) >> 1) + 1; + else + div = (host->bus_hz / slot->clock) >> 1; + + dev_info(&slot->mmc->class_dev, + "Bus speed (slot %d) = %dHz (slot req %dHz, actual %dHZ" + " div = %d)\n", slot->id, host->bus_hz, slot->clock, + div ? 
((host->bus_hz / div) >> 1) : host->bus_hz, div); + + /* disable clock */ + mci_writel(host, CLKENA, 0); + mci_writel(host, CLKSRC, 0); + + /* inform CIU */ + mci_send_cmd(slot, + SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); + + /* set clock to desired speed */ + mci_writel(host, CLKDIV, div); + + /* inform CIU */ + mci_send_cmd(slot, + SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); + + /* enable clock */ + mci_writel(host, CLKENA, SDMMC_CLKEN_ENABLE); + + /* inform CIU */ + mci_send_cmd(slot, + SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); + + host->current_speed = slot->clock; + } + + /* Set the current slot bus width */ + mci_writel(host, CTYPE, slot->ctype); +} + +static void dw_mci_start_request(struct dw_mci *host, + struct dw_mci_slot *slot) +{ + struct mmc_request *mrq; + struct mmc_command *cmd; + struct mmc_data *data; + u32 cmdflags; + + mrq = slot->mrq; + if (host->pdata->select_slot) + host->pdata->select_slot(slot->id); + + /* Slot specific timing and width adjustment */ + dw_mci_setup_bus(slot); + + host->cur_slot = slot; + host->mrq = mrq; + + host->pending_events = 0; + host->completed_events = 0; + host->data_status = 0; + + data = mrq->data; + if (data) { + dw_mci_set_timeout(host); + mci_writel(host, BYTCNT, data->blksz*data->blocks); + mci_writel(host, BLKSIZ, data->blksz); + } + + cmd = mrq->cmd; + cmdflags = dw_mci_prepare_command(slot->mmc, cmd); + + /* this is the first command, send the initialization clock */ + if (test_and_clear_bit(DW_MMC_CARD_NEED_INIT, &slot->flags)) + cmdflags |= SDMMC_CMD_INIT; + + if (data) { + dw_mci_submit_data(host, data); + wmb(); + } + + dw_mci_start_command(host, cmd, cmdflags); + + if (mrq->stop) + host->stop_cmdr = dw_mci_prepare_command(slot->mmc, mrq->stop); +} + +static void dw_mci_queue_request(struct dw_mci *host, struct dw_mci_slot *slot, + struct mmc_request *mrq) +{ + dev_vdbg(&slot->mmc->class_dev, "queue request: state=%d\n", + host->state); + + spin_lock_bh(&host->lock); + slot->mrq = mrq; + + if (host->state == STATE_IDLE) { + host->state = STATE_SENDING_CMD; + dw_mci_start_request(host, slot); + } else { + list_add_tail(&slot->queue_node, &host->queue); + } + + spin_unlock_bh(&host->lock); +} + +static void dw_mci_request(struct mmc_host *mmc, struct mmc_request *mrq) +{ + struct dw_mci_slot *slot = mmc_priv(mmc); + struct dw_mci *host = slot->host; + + WARN_ON(slot->mrq); + + if (!test_bit(DW_MMC_CARD_PRESENT, &slot->flags)) { + mrq->cmd->error = -ENOMEDIUM; + mmc_request_done(mmc, mrq); + return; + } + + /* We don't support multiple blocks of weird lengths. */ + dw_mci_queue_request(host, slot, mrq); +} + +static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct dw_mci_slot *slot = mmc_priv(mmc); + + /* set default 1 bit mode */ + slot->ctype = SDMMC_CTYPE_1BIT; + + switch (ios->bus_width) { + case MMC_BUS_WIDTH_1: + slot->ctype = SDMMC_CTYPE_1BIT; + break; + case MMC_BUS_WIDTH_4: + slot->ctype = SDMMC_CTYPE_4BIT; + break; + } + + if (ios->clock) { + /* + * Use mirror of ios->clock to prevent race with mmc + * core ios update when finding the minimum. 
+ */ + slot->clock = ios->clock; + } + + switch (ios->power_mode) { + case MMC_POWER_UP: + set_bit(DW_MMC_CARD_NEED_INIT, &slot->flags); + break; + default: + break; + } +} + +static int dw_mci_get_ro(struct mmc_host *mmc) +{ + int read_only; + struct dw_mci_slot *slot = mmc_priv(mmc); + struct dw_mci_board *brd = slot->host->pdata; + + /* Use platform get_ro function, else try on board write protect */ + if (brd->get_ro) + read_only = brd->get_ro(slot->id); + else + read_only = + mci_readl(slot->host, WRTPRT) & (1 << slot->id) ? 1 : 0; + + dev_dbg(&mmc->class_dev, "card is %s\n", + read_only ? "read-only" : "read-write"); + + return read_only; +} + +static int dw_mci_get_cd(struct mmc_host *mmc) +{ + int present; + struct dw_mci_slot *slot = mmc_priv(mmc); + struct dw_mci_board *brd = slot->host->pdata; + + /* Use platform get_cd function, else try onboard card detect */ + if (brd->get_cd) + present = !brd->get_cd(slot->id); + else + present = (mci_readl(slot->host, CDETECT) & (1 << slot->id)) + == 0 ? 1 : 0; + + if (present) + dev_dbg(&mmc->class_dev, "card is present\n"); + else + dev_dbg(&mmc->class_dev, "card is not present\n"); + + return present; +} + +static const struct mmc_host_ops dw_mci_ops = { + .request = dw_mci_request, + .set_ios = dw_mci_set_ios, + .get_ro = dw_mci_get_ro, + .get_cd = dw_mci_get_cd, +}; + +static void dw_mci_request_end(struct dw_mci *host, struct mmc_request *mrq) + __releases(&host->lock) + __acquires(&host->lock) +{ + struct dw_mci_slot *slot; + struct mmc_host *prev_mmc = host->cur_slot->mmc; + + WARN_ON(host->cmd || host->data); + + host->cur_slot->mrq = NULL; + host->mrq = NULL; + if (!list_empty(&host->queue)) { + slot = list_entry(host->queue.next, + struct dw_mci_slot, queue_node); + list_del(&slot->queue_node); + dev_vdbg(&host->pdev->dev, "list not empty: %s is next\n", + mmc_hostname(slot->mmc)); + host->state = STATE_SENDING_CMD; + dw_mci_start_request(host, slot); + } else { + dev_vdbg(&host->pdev->dev, "list empty\n"); + host->state = STATE_IDLE; + } + + spin_unlock(&host->lock); + mmc_request_done(prev_mmc, mrq); + spin_lock(&host->lock); +} + +static void dw_mci_command_complete(struct dw_mci *host, struct mmc_command *cmd) +{ + u32 status = host->cmd_status; + + host->cmd_status = 0; + + /* Read the response from the card (up to 16 bytes) */ + if (cmd->flags & MMC_RSP_PRESENT) { + if (cmd->flags & MMC_RSP_136) { + cmd->resp[3] = mci_readl(host, RESP0); + cmd->resp[2] = mci_readl(host, RESP1); + cmd->resp[1] = mci_readl(host, RESP2); + cmd->resp[0] = mci_readl(host, RESP3); + } else { + cmd->resp[0] = mci_readl(host, RESP0); + cmd->resp[1] = 0; + cmd->resp[2] = 0; + cmd->resp[3] = 0; + } + } + + if (status & SDMMC_INT_RTO) + cmd->error = -ETIMEDOUT; + else if ((cmd->flags & MMC_RSP_CRC) && (status & SDMMC_INT_RCRC)) + cmd->error = -EILSEQ; + else if (status & SDMMC_INT_RESP_ERR) + cmd->error = -EIO; + else + cmd->error = 0; + + if (cmd->error) { + /* newer ip versions need a delay between retries */ + if (host->quirks & DW_MCI_QUIRK_RETRY_DELAY) + mdelay(20); + + if (cmd->data) { + host->data = NULL; + dw_mci_stop_dma(host); + } + } +} + +static void dw_mci_tasklet_func(unsigned long priv) +{ + struct dw_mci *host = (struct dw_mci *)priv; + struct mmc_data *data; + struct mmc_command *cmd; + enum dw_mci_state state; + enum dw_mci_state prev_state; + u32 status; + + spin_lock(&host->lock); + + state = host->state; + data = host->data; + + do { + prev_state = state; + + switch (state) { + case STATE_IDLE: + break; + + case STATE_SENDING_CMD: 
+ if (!test_and_clear_bit(EVENT_CMD_COMPLETE, + &host->pending_events)) + break; + + cmd = host->cmd; + host->cmd = NULL; + set_bit(EVENT_CMD_COMPLETE, &host->completed_events); + dw_mci_command_complete(host, host->mrq->cmd); + if (!host->mrq->data || cmd->error) { + dw_mci_request_end(host, host->mrq); + goto unlock; + } + + prev_state = state = STATE_SENDING_DATA; + /* fall through */ + + case STATE_SENDING_DATA: + if (test_and_clear_bit(EVENT_DATA_ERROR, + &host->pending_events)) { + dw_mci_stop_dma(host); + if (data->stop) + send_stop_cmd(host, data); + state = STATE_DATA_ERROR; + break; + } + + if (!test_and_clear_bit(EVENT_XFER_COMPLETE, + &host->pending_events)) + break; + + set_bit(EVENT_XFER_COMPLETE, &host->completed_events); + prev_state = state = STATE_DATA_BUSY; + /* fall through */ + + case STATE_DATA_BUSY: + if (!test_and_clear_bit(EVENT_DATA_COMPLETE, + &host->pending_events)) + break; + + host->data = NULL; + set_bit(EVENT_DATA_COMPLETE, &host->completed_events); + status = host->data_status; + + if (status & DW_MCI_DATA_ERROR_FLAGS) { + if (status & SDMMC_INT_DTO) { + dev_err(&host->pdev->dev, + "data timeout error\n"); + data->error = -ETIMEDOUT; + } else if (status & SDMMC_INT_DCRC) { + dev_err(&host->pdev->dev, + "data CRC error\n"); + data->error = -EILSEQ; + } else { + dev_err(&host->pdev->dev, + "data FIFO error " + "(status=%08x)\n", + status); + data->error = -EIO; + } + } else { + data->bytes_xfered = data->blocks * data->blksz; + data->error = 0; + } + + if (!data->stop) { + dw_mci_request_end(host, host->mrq); + goto unlock; + } + + prev_state = state = STATE_SENDING_STOP; + if (!data->error) + send_stop_cmd(host, data); + /* fall through */ + + case STATE_SENDING_STOP: + if (!test_and_clear_bit(EVENT_CMD_COMPLETE, + &host->pending_events)) + break; + + host->cmd = NULL; + dw_mci_command_complete(host, host->mrq->stop); + dw_mci_request_end(host, host->mrq); + goto unlock; + + case STATE_DATA_ERROR: + if (!test_and_clear_bit(EVENT_XFER_COMPLETE, + &host->pending_events)) + break; + + state = STATE_DATA_BUSY; + break; + } + } while (state != prev_state); + + host->state = state; +unlock: + spin_unlock(&host->lock); + +} + +static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt) +{ + u16 *pdata = (u16 *)buf; + + WARN_ON(cnt % 2 != 0); + + cnt = cnt >> 1; + while (cnt > 0) { + mci_writew(host, DATA, *pdata++); + cnt--; + } +} + +static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt) +{ + u16 *pdata = (u16 *)buf; + + WARN_ON(cnt % 2 != 0); + + cnt = cnt >> 1; + while (cnt > 0) { + *pdata++ = mci_readw(host, DATA); + cnt--; + } +} + +static void dw_mci_push_data32(struct dw_mci *host, void *buf, int cnt) +{ + u32 *pdata = (u32 *)buf; + + WARN_ON(cnt % 4 != 0); + WARN_ON((unsigned long)pdata & 0x3); + + cnt = cnt >> 2; + while (cnt > 0) { + mci_writel(host, DATA, *pdata++); + cnt--; + } +} + +static void dw_mci_pull_data32(struct dw_mci *host, void *buf, int cnt) +{ + u32 *pdata = (u32 *)buf; + + WARN_ON(cnt % 4 != 0); + WARN_ON((unsigned long)pdata & 0x3); + + cnt = cnt >> 2; + while (cnt > 0) { + *pdata++ = mci_readl(host, DATA); + cnt--; + } +} + +static void dw_mci_push_data64(struct dw_mci *host, void *buf, int cnt) +{ + u64 *pdata = (u64 *)buf; + + WARN_ON(cnt % 8 != 0); + + cnt = cnt >> 3; + while (cnt > 0) { + mci_writeq(host, DATA, *pdata++); + cnt--; + } +} + +static void dw_mci_pull_data64(struct dw_mci *host, void *buf, int cnt) +{ + u64 *pdata = (u64 *)buf; + + WARN_ON(cnt % 8 != 0); + + cnt = cnt >> 3; + while (cnt > 0) 
{ + *pdata++ = mci_readq(host, DATA); + cnt--; + } +} + +static void dw_mci_read_data_pio(struct dw_mci *host) +{ + struct scatterlist *sg = host->sg; + void *buf = sg_virt(sg); + unsigned int offset = host->pio_offset; + struct mmc_data *data = host->data; + int shift = host->data_shift; + u32 status; + unsigned int nbytes = 0, len, old_len, count = 0; + + do { + len = SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift; + if (count == 0) + old_len = len; + + if (offset + len <= sg->length) { + host->pull_data(host, (void *)(buf + offset), len); + + offset += len; + nbytes += len; + + if (offset == sg->length) { + flush_dcache_page(sg_page(sg)); + host->sg = sg = sg_next(sg); + if (!sg) + goto done; + + offset = 0; + buf = sg_virt(sg); + } + } else { + unsigned int remaining = sg->length - offset; + host->pull_data(host, (void *)(buf + offset), + remaining); + nbytes += remaining; + + flush_dcache_page(sg_page(sg)); + host->sg = sg = sg_next(sg); + if (!sg) + goto done; + + offset = len - remaining; + buf = sg_virt(sg); + host->pull_data(host, buf, offset); + nbytes += offset; + } + + status = mci_readl(host, MINTSTS); + mci_writel(host, RINTSTS, SDMMC_INT_RXDR); + if (status & DW_MCI_DATA_ERROR_FLAGS) { + host->data_status = status; + data->bytes_xfered += nbytes; + smp_wmb(); + + set_bit(EVENT_DATA_ERROR, &host->pending_events); + + tasklet_schedule(&host->tasklet); + return; + } + count++; + } while (status & SDMMC_INT_RXDR); /*if the RXDR is ready read again*/ + len = SDMMC_GET_FCNT(mci_readl(host, STATUS)); + host->pio_offset = offset; + data->bytes_xfered += nbytes; + return; + +done: + data->bytes_xfered += nbytes; + smp_wmb(); + set_bit(EVENT_XFER_COMPLETE, &host->pending_events); +} + +static void dw_mci_write_data_pio(struct dw_mci *host) +{ + struct scatterlist *sg = host->sg; + void *buf = sg_virt(sg); + unsigned int offset = host->pio_offset; + struct mmc_data *data = host->data; + int shift = host->data_shift; + u32 status; + unsigned int nbytes = 0, len; + + do { + len = SDMMC_FIFO_SZ - + (SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift); + if (offset + len <= sg->length) { + host->push_data(host, (void *)(buf + offset), len); + + offset += len; + nbytes += len; + if (offset == sg->length) { + host->sg = sg = sg_next(sg); + if (!sg) + goto done; + + offset = 0; + buf = sg_virt(sg); + } + } else { + unsigned int remaining = sg->length - offset; + + host->push_data(host, (void *)(buf + offset), + remaining); + nbytes += remaining; + + host->sg = sg = sg_next(sg); + if (!sg) + goto done; + + offset = len - remaining; + buf = sg_virt(sg); + host->push_data(host, (void *)buf, offset); + nbytes += offset; + } + + status = mci_readl(host, MINTSTS); + mci_writel(host, RINTSTS, SDMMC_INT_TXDR); + if (status & DW_MCI_DATA_ERROR_FLAGS) { + host->data_status = status; + data->bytes_xfered += nbytes; + + smp_wmb(); + + set_bit(EVENT_DATA_ERROR, &host->pending_events); + + tasklet_schedule(&host->tasklet); + return; + } + } while (status & SDMMC_INT_TXDR); /* if TXDR write again */ + + host->pio_offset = offset; + data->bytes_xfered += nbytes; + + return; + +done: + data->bytes_xfered += nbytes; + smp_wmb(); + set_bit(EVENT_XFER_COMPLETE, &host->pending_events); +} + +static void dw_mci_cmd_interrupt(struct dw_mci *host, u32 status) +{ + if (!host->cmd_status) + host->cmd_status = status; + + smp_wmb(); + + set_bit(EVENT_CMD_COMPLETE, &host->pending_events); + tasklet_schedule(&host->tasklet); +} + +static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) +{ + struct dw_mci *host = 
dev_id; + u32 status, pending; + unsigned int pass_count = 0; + + do { + status = mci_readl(host, RINTSTS); + pending = mci_readl(host, MINTSTS); /* read-only mask reg */ + + /* + * DTO fix - version 2.10a and below, and only if internal DMA + * is configured. + */ + if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO) { + if (!pending && + ((mci_readl(host, STATUS) >> 17) & 0x1fff)) + pending |= SDMMC_INT_DATA_OVER; + } + + if (!pending) + break; + + if (pending & DW_MCI_CMD_ERROR_FLAGS) { + mci_writel(host, RINTSTS, DW_MCI_CMD_ERROR_FLAGS); + host->cmd_status = status; + smp_wmb(); + set_bit(EVENT_CMD_COMPLETE, &host->pending_events); + tasklet_schedule(&host->tasklet); + } + + if (pending & DW_MCI_DATA_ERROR_FLAGS) { + /* if there is an error report DATA_ERROR */ + mci_writel(host, RINTSTS, DW_MCI_DATA_ERROR_FLAGS); + host->data_status = status; + smp_wmb(); + set_bit(EVENT_DATA_ERROR, &host->pending_events); + tasklet_schedule(&host->tasklet); + } + + if (pending & SDMMC_INT_DATA_OVER) { + mci_writel(host, RINTSTS, SDMMC_INT_DATA_OVER); + if (!host->data_status) + host->data_status = status; + smp_wmb(); + if (host->dir_status == DW_MCI_RECV_STATUS) { + if (host->sg != NULL) + dw_mci_read_data_pio(host); + } + set_bit(EVENT_DATA_COMPLETE, &host->pending_events); + tasklet_schedule(&host->tasklet); + } + + if (pending & SDMMC_INT_RXDR) { + mci_writel(host, RINTSTS, SDMMC_INT_RXDR); + if (host->sg) + dw_mci_read_data_pio(host); + } + + if (pending & SDMMC_INT_TXDR) { + mci_writel(host, RINTSTS, SDMMC_INT_TXDR); + if (host->sg) + dw_mci_write_data_pio(host); + } + + if (pending & SDMMC_INT_CMD_DONE) { + mci_writel(host, RINTSTS, SDMMC_INT_CMD_DONE); + dw_mci_cmd_interrupt(host, status); + } + + if (pending & SDMMC_INT_CD) { + mci_writel(host, RINTSTS, SDMMC_INT_CD); + tasklet_schedule(&host->card_tasklet); + } + + } while (pass_count++ < 5); + +#ifdef CONFIG_MMC_DW_IDMAC + /* Handle DMA interrupts */ + pending = mci_readl(host, IDSTS); + if (pending & (SDMMC_IDMAC_INT_TI | SDMMC_IDMAC_INT_RI)) { + mci_writel(host, IDSTS, SDMMC_IDMAC_INT_TI | SDMMC_IDMAC_INT_RI); + mci_writel(host, IDSTS, SDMMC_IDMAC_INT_NI); + set_bit(EVENT_DATA_COMPLETE, &host->pending_events); + host->dma_ops->complete(host); + } +#endif + + return IRQ_HANDLED; +} + +static void dw_mci_tasklet_card(unsigned long data) +{ + struct dw_mci *host = (struct dw_mci *)data; + int i; + + for (i = 0; i < host->num_slots; i++) { + struct dw_mci_slot *slot = host->slot[i]; + struct mmc_host *mmc = slot->mmc; + struct mmc_request *mrq; + int present; + u32 ctrl; + + present = dw_mci_get_cd(mmc); + while (present != slot->last_detect_state) { + spin_lock(&host->lock); + + dev_dbg(&slot->mmc->class_dev, "card %s\n", + present ? 
"inserted" : "removed"); + + /* Card change detected */ + slot->last_detect_state = present; + + /* Power up slot */ + if (present != 0) { + if (host->pdata->setpower) + host->pdata->setpower(slot->id, + mmc->ocr_avail); + + set_bit(DW_MMC_CARD_PRESENT, &slot->flags); + } + + /* Clean up queue if present */ + mrq = slot->mrq; + if (mrq) { + if (mrq == host->mrq) { + host->data = NULL; + host->cmd = NULL; + + switch (host->state) { + case STATE_IDLE: + break; + case STATE_SENDING_CMD: + mrq->cmd->error = -ENOMEDIUM; + if (!mrq->data) + break; + /* fall through */ + case STATE_SENDING_DATA: + mrq->data->error = -ENOMEDIUM; + dw_mci_stop_dma(host); + break; + case STATE_DATA_BUSY: + case STATE_DATA_ERROR: + if (mrq->data->error == -EINPROGRESS) + mrq->data->error = -ENOMEDIUM; + if (!mrq->stop) + break; + /* fall through */ + case STATE_SENDING_STOP: + mrq->stop->error = -ENOMEDIUM; + break; + } + + dw_mci_request_end(host, mrq); + } else { + list_del(&slot->queue_node); + mrq->cmd->error = -ENOMEDIUM; + if (mrq->data) + mrq->data->error = -ENOMEDIUM; + if (mrq->stop) + mrq->stop->error = -ENOMEDIUM; + + spin_unlock(&host->lock); + mmc_request_done(slot->mmc, mrq); + spin_lock(&host->lock); + } + } + + /* Power down slot */ + if (present == 0) { + if (host->pdata->setpower) + host->pdata->setpower(slot->id, 0); + clear_bit(DW_MMC_CARD_PRESENT, &slot->flags); + + /* + * Clear down the FIFO - doing so generates a + * block interrupt, hence setting the + * scatter-gather pointer to NULL. + */ + host->sg = NULL; + + ctrl = mci_readl(host, CTRL); + ctrl |= SDMMC_CTRL_FIFO_RESET; + mci_writel(host, CTRL, ctrl); + +#ifdef CONFIG_MMC_DW_IDMAC + ctrl = mci_readl(host, BMOD); + ctrl |= 0x01; /* Software reset of DMA */ + mci_writel(host, BMOD, ctrl); +#endif + + } + + spin_unlock(&host->lock); + present = dw_mci_get_cd(mmc); + } + + mmc_detect_change(slot->mmc, + msecs_to_jiffies(host->pdata->detect_delay_ms)); + } +} + +static int __init dw_mci_init_slot(struct dw_mci *host, unsigned int id) +{ + struct mmc_host *mmc; + struct dw_mci_slot *slot; + + mmc = mmc_alloc_host(sizeof(struct dw_mci_slot), &host->pdev->dev); + if (!mmc) + return -ENOMEM; + + slot = mmc_priv(mmc); + slot->id = id; + slot->mmc = mmc; + slot->host = host; + + mmc->ops = &dw_mci_ops; + mmc->f_min = DIV_ROUND_UP(host->bus_hz, 510); + mmc->f_max = host->bus_hz; + + if (host->pdata->get_ocr) + mmc->ocr_avail = host->pdata->get_ocr(id); + else + mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; + + /* + * Start with slot power disabled, it will be enabled when a card + * is detected. + */ + if (host->pdata->setpower) + host->pdata->setpower(id, 0); + + mmc->caps = 0; + if (host->pdata->get_bus_wd) + if (host->pdata->get_bus_wd(slot->id) >= 4) + mmc->caps |= MMC_CAP_4_BIT_DATA; + + if (host->pdata->quirks & DW_MCI_QUIRK_HIGHSPEED) + mmc->caps |= MMC_CAP_SD_HIGHSPEED; + +#ifdef CONFIG_MMC_DW_IDMAC + mmc->max_segs = host->ring_size; + mmc->max_blk_size = 65536; + mmc->max_blk_count = host->ring_size; + mmc->max_seg_size = 0x1000; + mmc->max_req_size = mmc->max_seg_size * mmc->max_blk_count; +#else + if (host->pdata->blk_settings) { + mmc->max_segs = host->pdata->blk_settings->max_segs; + mmc->max_blk_size = host->pdata->blk_settings->max_blk_size; + mmc->max_blk_count = host->pdata->blk_settings->max_blk_count; + mmc->max_req_size = host->pdata->blk_settings->max_req_size; + mmc->max_seg_size = host->pdata->blk_settings->max_seg_size; + } else { + /* Useful defaults if platform data is unset. 
*/ + mmc->max_segs = 64; + mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */ + mmc->max_blk_count = 512; + mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; + mmc->max_seg_size = mmc->max_req_size; + } +#endif /* CONFIG_MMC_DW_IDMAC */ + + if (dw_mci_get_cd(mmc)) + set_bit(DW_MMC_CARD_PRESENT, &slot->flags); + else + clear_bit(DW_MMC_CARD_PRESENT, &slot->flags); + + host->slot[id] = slot; + mmc_add_host(mmc); + +#if defined(CONFIG_DEBUG_FS) + dw_mci_init_debugfs(slot); +#endif + + /* Card initially undetected */ + slot->last_detect_state = 0; + + return 0; +} + +static void dw_mci_cleanup_slot(struct dw_mci_slot *slot, unsigned int id) +{ + /* Shutdown detect IRQ */ + if (slot->host->pdata->exit) + slot->host->pdata->exit(id); + + /* Debugfs stuff is cleaned up by mmc core */ + mmc_remove_host(slot->mmc); + slot->host->slot[id] = NULL; + mmc_free_host(slot->mmc); +} + +static void dw_mci_init_dma(struct dw_mci *host) +{ + /* Alloc memory for sg translation */ + host->sg_cpu = dma_alloc_coherent(&host->pdev->dev, PAGE_SIZE, + &host->sg_dma, GFP_KERNEL); + if (!host->sg_cpu) { + dev_err(&host->pdev->dev, "%s: could not alloc DMA memory\n", + __func__); + goto no_dma; + } + + /* Determine which DMA interface to use */ +#ifdef CONFIG_MMC_DW_IDMAC + host->dma_ops = &dw_mci_idmac_ops; + dev_info(&host->pdev->dev, "Using internal DMA controller.\n"); +#endif + + if (!host->dma_ops) + goto no_dma; + + if (host->dma_ops->init) { + if (host->dma_ops->init(host)) { + dev_err(&host->pdev->dev, "%s: Unable to initialize " + "DMA Controller.\n", __func__); + goto no_dma; + } + } else { + dev_err(&host->pdev->dev, "DMA initialization not found.\n"); + goto no_dma; + } + + host->use_dma = 1; + return; + +no_dma: + dev_info(&host->pdev->dev, "Using PIO mode.\n"); + host->use_dma = 0; + return; +} + +static bool mci_wait_reset(struct device *dev, struct dw_mci *host) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(500); + unsigned int ctrl; + + mci_writel(host, CTRL, (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | + SDMMC_CTRL_DMA_RESET)); + + /* wait till resets clear */ + do { + ctrl = mci_readl(host, CTRL); + if (!(ctrl & (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | + SDMMC_CTRL_DMA_RESET))) + return true; + } while (time_before(jiffies, timeout)); + + dev_err(dev, "Timeout resetting block (ctrl %#x)\n", ctrl); + + return false; +} + +static int dw_mci_probe(struct platform_device *pdev) +{ + struct dw_mci *host; + struct resource *regs; + struct dw_mci_board *pdata; + int irq, ret, i, width; + u32 fifo_size; + + regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!regs) + return -ENXIO; + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + host = kzalloc(sizeof(struct dw_mci), GFP_KERNEL); + if (!host) + return -ENOMEM; + + host->pdev = pdev; + host->pdata = pdata = pdev->dev.platform_data; + if (!pdata || !pdata->init) { + dev_err(&pdev->dev, + "Platform data must supply init function\n"); + ret = -ENODEV; + goto err_freehost; + } + + if (!pdata->select_slot && pdata->num_slots > 1) { + dev_err(&pdev->dev, + "Platform data must supply select_slot function\n"); + ret = -ENODEV; + goto err_freehost; + } + + if (!pdata->bus_hz) { + dev_err(&pdev->dev, + "Platform data must supply bus speed\n"); + ret = -ENODEV; + goto err_freehost; + } + + host->bus_hz = pdata->bus_hz; + host->quirks = pdata->quirks; + + spin_lock_init(&host->lock); + INIT_LIST_HEAD(&host->queue); + + ret = -ENOMEM; + host->regs = ioremap(regs->start, regs->end - regs->start + 1); + if (!host->regs) 
+ goto err_freehost; + + host->dma_ops = pdata->dma_ops; + dw_mci_init_dma(host); + + /* + * Get the host data width - this assumes that HCON has been set with + * the correct values. + */ + i = (mci_readl(host, HCON) >> 7) & 0x7; + if (!i) { + host->push_data = dw_mci_push_data16; + host->pull_data = dw_mci_pull_data16; + width = 16; + host->data_shift = 1; + } else if (i == 2) { + host->push_data = dw_mci_push_data64; + host->pull_data = dw_mci_pull_data64; + width = 64; + host->data_shift = 3; + } else { + /* Check for a reserved value, and warn if it is */ + WARN((i != 1), + "HCON reports a reserved host data width!\n" + "Defaulting to 32-bit access.\n"); + host->push_data = dw_mci_push_data32; + host->pull_data = dw_mci_pull_data32; + width = 32; + host->data_shift = 2; + } + + /* Reset all blocks */ + if (!mci_wait_reset(&pdev->dev, host)) { + ret = -ENODEV; + goto err_dmaunmap; + } + + /* Clear the interrupts for the host controller */ + mci_writel(host, RINTSTS, 0xFFFFFFFF); + mci_writel(host, INTMASK, 0); /* disable all mmc interrupt first */ + + /* Put in max timeout */ + mci_writel(host, TMOUT, 0xFFFFFFFF); + + /* + * FIFO threshold settings RxMark = fifo_size / 2 - 1, + * Tx Mark = fifo_size / 2 DMA Size = 8 + */ + fifo_size = mci_readl(host, FIFOTH); + fifo_size = (fifo_size >> 16) & 0x7ff; + mci_writel(host, FIFOTH, ((0x2 << 28) | ((fifo_size/2 - 1) << 16) | + ((fifo_size/2) << 0))); + + /* disable clock to CIU */ + mci_writel(host, CLKENA, 0); + mci_writel(host, CLKSRC, 0); + + tasklet_init(&host->tasklet, dw_mci_tasklet_func, (unsigned long)host); + tasklet_init(&host->card_tasklet, + dw_mci_tasklet_card, (unsigned long)host); + + ret = request_irq(irq, dw_mci_interrupt, 0, "dw-mci", host); + if (ret) + goto err_dmaunmap; + + platform_set_drvdata(pdev, host); + + if (host->pdata->num_slots) + host->num_slots = host->pdata->num_slots; + else + host->num_slots = ((mci_readl(host, HCON) >> 1) & 0x1F) + 1; + + /* We need at least one slot to succeed */ + for (i = 0; i < host->num_slots; i++) { + ret = dw_mci_init_slot(host, i); + if (ret) { + ret = -ENODEV; + goto err_init_slot; + } + } + + /* + * Enable interrupts for command done, data over, data empty, card det, + * receive ready and error such as transmit, receive timeout, crc error + */ + mci_writel(host, RINTSTS, 0xFFFFFFFF); + mci_writel(host, INTMASK, SDMMC_INT_CMD_DONE | SDMMC_INT_DATA_OVER | + SDMMC_INT_TXDR | SDMMC_INT_RXDR | + DW_MCI_ERROR_FLAGS | SDMMC_INT_CD); + mci_writel(host, CTRL, SDMMC_CTRL_INT_ENABLE); /* Enable mci interrupt */ + + dev_info(&pdev->dev, "DW MMC controller at irq %d, " + "%d bit host data width\n", irq, width); + if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO) + dev_info(&pdev->dev, "Internal DMAC interrupt fix enabled.\n"); + + return 0; + +err_init_slot: + /* De-init any initialized slots */ + while (i > 0) { + if (host->slot[i]) + dw_mci_cleanup_slot(host->slot[i], i); + i--; + } + free_irq(irq, host); + +err_dmaunmap: + if (host->use_dma && host->dma_ops->exit) + host->dma_ops->exit(host); + dma_free_coherent(&host->pdev->dev, PAGE_SIZE, + host->sg_cpu, host->sg_dma); + iounmap(host->regs); + +err_freehost: + kfree(host); + return ret; +} + +static int __exit dw_mci_remove(struct platform_device *pdev) +{ + struct dw_mci *host = platform_get_drvdata(pdev); + int i; + + mci_writel(host, RINTSTS, 0xFFFFFFFF); + mci_writel(host, INTMASK, 0); /* disable all mmc interrupt first */ + + platform_set_drvdata(pdev, NULL); + + for (i = 0; i < host->num_slots; i++) { + dev_dbg(&pdev->dev, "remove slot 
%d\n", i); + if (host->slot[i]) + dw_mci_cleanup_slot(host->slot[i], i); + } + + /* disable clock to CIU */ + mci_writel(host, CLKENA, 0); + mci_writel(host, CLKSRC, 0); + + free_irq(platform_get_irq(pdev, 0), host); + dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma); + + if (host->use_dma && host->dma_ops->exit) + host->dma_ops->exit(host); + + iounmap(host->regs); + + kfree(host); + return 0; +} + +#ifdef CONFIG_PM +/* + * TODO: we should probably disable the clock to the card in the suspend path. + */ +static int dw_mci_suspend(struct platform_device *pdev, pm_message_t mesg) +{ + int i, ret; + struct dw_mci *host = platform_get_drvdata(pdev); + + for (i = 0; i < host->num_slots; i++) { + struct dw_mci_slot *slot = host->slot[i]; + if (!slot) + continue; + ret = mmc_suspend_host(slot->mmc); + if (ret < 0) { + while (--i >= 0) { + slot = host->slot[i]; + if (slot) + mmc_resume_host(host->slot[i]->mmc); + } + return ret; + } + } + + return 0; +} + +static int dw_mci_resume(struct platform_device *pdev) +{ + int i, ret; + struct dw_mci *host = platform_get_drvdata(pdev); + + for (i = 0; i < host->num_slots; i++) { + struct dw_mci_slot *slot = host->slot[i]; + if (!slot) + continue; + ret = mmc_resume_host(host->slot[i]->mmc); + if (ret < 0) + return ret; + } + + return 0; +} +#else +#define dw_mci_suspend NULL +#define dw_mci_resume NULL +#endif /* CONFIG_PM */ + +static struct platform_driver dw_mci_driver = { + .remove = __exit_p(dw_mci_remove), + .suspend = dw_mci_suspend, + .resume = dw_mci_resume, + .driver = { + .name = "dw_mmc", + }, +}; + +static int __init dw_mci_init(void) +{ + return platform_driver_probe(&dw_mci_driver, dw_mci_probe); +} + +static void __exit dw_mci_exit(void) +{ + platform_driver_unregister(&dw_mci_driver); +} + +module_init(dw_mci_init); +module_exit(dw_mci_exit); + +MODULE_DESCRIPTION("DW Multimedia Card Interface driver"); +MODULE_AUTHOR("NXP Semiconductor VietNam"); +MODULE_AUTHOR("Imagination Technologies Ltd"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h new file mode 100644 index 000000000000..5dd55a75233d --- /dev/null +++ b/drivers/mmc/host/dw_mmc.h @@ -0,0 +1,168 @@ +/* + * Synopsys DesignWare Multimedia Card Interface driver + * (Based on NXP driver for lpc 31xx) + * + * Copyright (C) 2009 NXP Semiconductors + * Copyright (C) 2009, 2010 Imagination Technologies Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef _DW_MMC_H_ +#define _DW_MMC_H_ + +#define SDMMC_CTRL 0x000 +#define SDMMC_PWREN 0x004 +#define SDMMC_CLKDIV 0x008 +#define SDMMC_CLKSRC 0x00c +#define SDMMC_CLKENA 0x010 +#define SDMMC_TMOUT 0x014 +#define SDMMC_CTYPE 0x018 +#define SDMMC_BLKSIZ 0x01c +#define SDMMC_BYTCNT 0x020 +#define SDMMC_INTMASK 0x024 +#define SDMMC_CMDARG 0x028 +#define SDMMC_CMD 0x02c +#define SDMMC_RESP0 0x030 +#define SDMMC_RESP1 0x034 +#define SDMMC_RESP2 0x038 +#define SDMMC_RESP3 0x03c +#define SDMMC_MINTSTS 0x040 +#define SDMMC_RINTSTS 0x044 +#define SDMMC_STATUS 0x048 +#define SDMMC_FIFOTH 0x04c +#define SDMMC_CDETECT 0x050 +#define SDMMC_WRTPRT 0x054 +#define SDMMC_GPIO 0x058 +#define SDMMC_TCBCNT 0x05c +#define SDMMC_TBBCNT 0x060 +#define SDMMC_DEBNCE 0x064 +#define SDMMC_USRID 0x068 +#define SDMMC_VERID 0x06c +#define SDMMC_HCON 0x070 +#define SDMMC_BMOD 0x080 +#define SDMMC_PLDMND 0x084 +#define SDMMC_DBADDR 0x088 +#define SDMMC_IDSTS 0x08c +#define SDMMC_IDINTEN 0x090 +#define SDMMC_DSCADDR 0x094 +#define SDMMC_BUFADDR 0x098 +#define SDMMC_DATA 0x100 +#define SDMMC_DATA_ADR 0x100 + +/* shift bit field */ +#define _SBF(f, v) ((v) << (f)) + +/* Control register defines */ +#define SDMMC_CTRL_USE_IDMAC BIT(25) +#define SDMMC_CTRL_CEATA_INT_EN BIT(11) +#define SDMMC_CTRL_SEND_AS_CCSD BIT(10) +#define SDMMC_CTRL_SEND_CCSD BIT(9) +#define SDMMC_CTRL_ABRT_READ_DATA BIT(8) +#define SDMMC_CTRL_SEND_IRQ_RESP BIT(7) +#define SDMMC_CTRL_READ_WAIT BIT(6) +#define SDMMC_CTRL_DMA_ENABLE BIT(5) +#define SDMMC_CTRL_INT_ENABLE BIT(4) +#define SDMMC_CTRL_DMA_RESET BIT(2) +#define SDMMC_CTRL_FIFO_RESET BIT(1) +#define SDMMC_CTRL_RESET BIT(0) +/* Clock Enable register defines */ +#define SDMMC_CLKEN_LOW_PWR BIT(16) +#define SDMMC_CLKEN_ENABLE BIT(0) +/* time-out register defines */ +#define SDMMC_TMOUT_DATA(n) _SBF(8, (n)) +#define SDMMC_TMOUT_DATA_MSK 0xFFFFFF00 +#define SDMMC_TMOUT_RESP(n) ((n) & 0xFF) +#define SDMMC_TMOUT_RESP_MSK 0xFF +/* card-type register defines */ +#define SDMMC_CTYPE_8BIT BIT(16) +#define SDMMC_CTYPE_4BIT BIT(0) +#define SDMMC_CTYPE_1BIT 0 +/* Interrupt status & mask register defines */ +#define SDMMC_INT_SDIO BIT(16) +#define SDMMC_INT_EBE BIT(15) +#define SDMMC_INT_ACD BIT(14) +#define SDMMC_INT_SBE BIT(13) +#define SDMMC_INT_HLE BIT(12) +#define SDMMC_INT_FRUN BIT(11) +#define SDMMC_INT_HTO BIT(10) +#define SDMMC_INT_DTO BIT(9) +#define SDMMC_INT_RTO BIT(8) +#define SDMMC_INT_DCRC BIT(7) +#define SDMMC_INT_RCRC BIT(6) +#define SDMMC_INT_RXDR BIT(5) +#define SDMMC_INT_TXDR BIT(4) +#define SDMMC_INT_DATA_OVER BIT(3) +#define SDMMC_INT_CMD_DONE BIT(2) +#define SDMMC_INT_RESP_ERR BIT(1) +#define SDMMC_INT_CD BIT(0) +#define SDMMC_INT_ERROR 0xbfc2 +/* Command register defines */ +#define SDMMC_CMD_START BIT(31) +#define SDMMC_CMD_CCS_EXP BIT(23) +#define SDMMC_CMD_CEATA_RD BIT(22) +#define SDMMC_CMD_UPD_CLK BIT(21) +#define SDMMC_CMD_INIT BIT(15) +#define SDMMC_CMD_STOP BIT(14) +#define SDMMC_CMD_PRV_DAT_WAIT BIT(13) +#define SDMMC_CMD_SEND_STOP BIT(12) +#define SDMMC_CMD_STRM_MODE BIT(11) +#define SDMMC_CMD_DAT_WR BIT(10) +#define SDMMC_CMD_DAT_EXP BIT(9) +#define SDMMC_CMD_RESP_CRC BIT(8) +#define SDMMC_CMD_RESP_LONG BIT(7) +#define SDMMC_CMD_RESP_EXP BIT(6) +#define SDMMC_CMD_INDX(n) ((n) & 0x1F) +/* Status register defines */ +#define SDMMC_GET_FCNT(x) (((x)>>17) & 0x1FF) +#define SDMMC_FIFO_SZ 32 +/* Internal DMAC interrupt defines */ +#define SDMMC_IDMAC_INT_AI BIT(9) +#define SDMMC_IDMAC_INT_NI BIT(8) +#define SDMMC_IDMAC_INT_CES BIT(5) +#define SDMMC_IDMAC_INT_DU BIT(4) 
+#define SDMMC_IDMAC_INT_FBE BIT(2) +#define SDMMC_IDMAC_INT_RI BIT(1) +#define SDMMC_IDMAC_INT_TI BIT(0) +/* Internal DMAC bus mode bits */ +#define SDMMC_IDMAC_ENABLE BIT(7) +#define SDMMC_IDMAC_FB BIT(1) +#define SDMMC_IDMAC_SWRESET BIT(0) + +/* Register access macros */ +#define mci_readl(dev, reg) \ + __raw_readl(dev->regs + SDMMC_##reg) +#define mci_writel(dev, reg, value) \ + __raw_writel((value), dev->regs + SDMMC_##reg) + +/* 16-bit FIFO access macros */ +#define mci_readw(dev, reg) \ + __raw_readw(dev->regs + SDMMC_##reg) +#define mci_writew(dev, reg, value) \ + __raw_writew((value), dev->regs + SDMMC_##reg) + +/* 64-bit FIFO access macros */ +#ifdef readq +#define mci_readq(dev, reg) \ + __raw_readq(dev->regs + SDMMC_##reg) +#define mci_writeq(dev, reg, value) \ + __raw_writeq((value), dev->regs + SDMMC_##reg) +#else +/* + * Dummy readq implementation for architectures that don't define it. + * + * We would assume that none of these architectures would configure + * the IP block with a 64bit FIFO width, so this code will never be + * executed on those machines. Defining these macros here keeps the + * rest of the code free from ifdefs. + */ +#define mci_readq(dev, reg) \ + (*(volatile u64 __force *)(dev->regs + SDMMC_##reg)) +#define mci_writeq(dev, reg, value) \ + (*(volatile u64 __force *)(dev->regs + SDMMC_##reg) = value) +#endif + +#endif /* _DW_MMC_H_ */ diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h new file mode 100644 index 000000000000..16b0261763ed --- /dev/null +++ b/include/linux/mmc/dw_mmc.h @@ -0,0 +1,217 @@ +/* + * Synopsys DesignWare Multimedia Card Interface driver + * (Based on NXP driver for lpc 31xx) + * + * Copyright (C) 2009 NXP Semiconductors + * Copyright (C) 2009, 2010 Imagination Technologies Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _LINUX_MMC_DW_MMC_H_ +#define _LINUX_MMC_DW_MMC_H_ + +#define MAX_MCI_SLOTS 2 + +enum dw_mci_state { + STATE_IDLE = 0, + STATE_SENDING_CMD, + STATE_SENDING_DATA, + STATE_DATA_BUSY, + STATE_SENDING_STOP, + STATE_DATA_ERROR, +}; + +enum { + EVENT_CMD_COMPLETE = 0, + EVENT_XFER_COMPLETE, + EVENT_DATA_COMPLETE, + EVENT_DATA_ERROR, + EVENT_XFER_ERROR +}; + +struct mmc_data; + +/** + * struct dw_mci - MMC controller state shared between all slots + * @lock: Spinlock protecting the queue and associated data. + * @regs: Pointer to MMIO registers. + * @sg: Scatterlist entry currently being processed by PIO code, if any. + * @pio_offset: Offset into the current scatterlist entry. + * @cur_slot: The slot which is currently using the controller. + * @mrq: The request currently being processed on @cur_slot, + * or NULL if the controller is idle. + * @cmd: The command currently being sent to the card, or NULL. + * @data: The data currently being transferred, or NULL if no data + * transfer is in progress. + * @use_dma: Whether DMA channel is initialized or not. + * @sg_dma: Bus address of DMA buffer. + * @sg_cpu: Virtual address of DMA buffer. + * @dma_ops: Pointer to platform-specific DMA callbacks. + * @cmd_status: Snapshot of SR taken upon completion of the current + * command. Only valid when EVENT_CMD_COMPLETE is pending. + * @data_status: Snapshot of SR taken upon completion of the current + * data transfer. 
Only valid when EVENT_DATA_COMPLETE or + * EVENT_DATA_ERROR is pending. + * @stop_cmdr: Value to be loaded into CMDR when the stop command is + * to be sent. + * @dir_status: Direction of current transfer. + * @tasklet: Tasklet running the request state machine. + * @card_tasklet: Tasklet handling card detect. + * @pending_events: Bitmask of events flagged by the interrupt handler + * to be processed by the tasklet. + * @completed_events: Bitmask of events which the state machine has + * processed. + * @state: Tasklet state. + * @queue: List of slots waiting for access to the controller. + * @bus_hz: The rate of @mck in Hz. This forms the basis for MMC bus + * rate and timeout calculations. + * @current_speed: Configured rate of the controller. + * @num_slots: Number of slots available. + * @pdev: Platform device associated with the MMC controller. + * @pdata: Platform data associated with the MMC controller. + * @slot: Slots sharing this MMC controller. + * @data_shift: log2 of FIFO item size. + * @push_data: Pointer to FIFO push function. + * @pull_data: Pointer to FIFO pull function. + * @quirks: Set of quirks that apply to specific versions of the IP. + * + * Locking + * ======= + * + * @lock is a softirq-safe spinlock protecting @queue as well as + * @cur_slot, @mrq and @state. These must always be updated + * at the same time while holding @lock. + * + * The @mrq field of struct dw_mci_slot is also protected by @lock, + * and must always be written at the same time as the slot is added to + * @queue. + * + * @pending_events and @completed_events are accessed using atomic bit + * operations, so they don't need any locking. + * + * None of the fields touched by the interrupt handler need any + * locking. However, ordering is important: Before EVENT_DATA_ERROR or + * EVENT_DATA_COMPLETE is set in @pending_events, all data-related + * interrupts must be disabled and @data_status updated with a + * snapshot of SR. Similarly, before EVENT_CMD_COMPLETE is set, the + * CMDRDY interupt must be disabled and @cmd_status updated with a + * snapshot of SR, and before EVENT_XFER_COMPLETE can be set, the + * bytes_xfered field of @data must be written. This is ensured by + * using barriers. 
+ */ +struct dw_mci { + spinlock_t lock; + void __iomem *regs; + + struct scatterlist *sg; + unsigned int pio_offset; + + struct dw_mci_slot *cur_slot; + struct mmc_request *mrq; + struct mmc_command *cmd; + struct mmc_data *data; + + /* DMA interface members*/ + int use_dma; + + dma_addr_t sg_dma; + void *sg_cpu; + struct dw_mci_dma_ops *dma_ops; +#ifdef CONFIG_MMC_DW_IDMAC + unsigned int ring_size; +#else + struct dw_mci_dma_data *dma_data; +#endif + u32 cmd_status; + u32 data_status; + u32 stop_cmdr; + u32 dir_status; + struct tasklet_struct tasklet; + struct tasklet_struct card_tasklet; + unsigned long pending_events; + unsigned long completed_events; + enum dw_mci_state state; + struct list_head queue; + + u32 bus_hz; + u32 current_speed; + u32 num_slots; + struct platform_device *pdev; + struct dw_mci_board *pdata; + struct dw_mci_slot *slot[MAX_MCI_SLOTS]; + + /* FIFO push and pull */ + int data_shift; + void (*push_data)(struct dw_mci *host, void *buf, int cnt); + void (*pull_data)(struct dw_mci *host, void *buf, int cnt); + + /* Workaround flags */ + u32 quirks; +}; + +/* DMA ops for Internal/External DMAC interface */ +struct dw_mci_dma_ops { + /* DMA Ops */ + int (*init)(struct dw_mci *host); + void (*start)(struct dw_mci *host, unsigned int sg_len); + void (*complete)(struct dw_mci *host); + void (*stop)(struct dw_mci *host); + void (*cleanup)(struct dw_mci *host); + void (*exit)(struct dw_mci *host); +}; + +/* IP Quirks/flags. */ +/* No special quirks or flags to cater for */ +#define DW_MCI_QUIRK_NONE 0 +/* DTO fix for command transmission with IDMAC configured */ +#define DW_MCI_QUIRK_IDMAC_DTO 1 +/* delay needed between retries on some 2.11a implementations */ +#define DW_MCI_QUIRK_RETRY_DELAY 2 +/* High Speed Capable - Supports HS cards (upto 50MHz) */ +#define DW_MCI_QUIRK_HIGHSPEED 4 + + +struct dma_pdata; + +struct block_settings { + unsigned short max_segs; /* see blk_queue_max_segments */ + unsigned int max_blk_size; /* maximum size of one mmc block */ + unsigned int max_blk_count; /* maximum number of blocks in one req*/ + unsigned int max_req_size; /* maximum number of bytes in one req*/ + unsigned int max_seg_size; /* see blk_queue_max_segment_size */ +}; + +/* Board platform data */ +struct dw_mci_board { + u32 num_slots; + + u32 quirks; /* Workaround / Quirk flags */ + unsigned int bus_hz; /* Bus speed */ + + /* delay in mS before detecting cards after interrupt */ + u32 detect_delay_ms; + + int (*init)(u32 slot_id, irq_handler_t , void *); + int (*get_ro)(u32 slot_id); + int (*get_cd)(u32 slot_id); + int (*get_ocr)(u32 slot_id); + int (*get_bus_wd)(u32 slot_id); + /* + * Enable power to selected slot and set voltage to desired level. + * Voltage levels are specified using MMC_VDD_xxx defines defined + * in linux/mmc/host.h file. + */ + void (*setpower)(u32 slot_id, u32 volt); + void (*exit)(u32 slot_id); + void (*select_slot)(u32 slot_id); + + struct dw_mci_dma_ops *dma_ops; + struct dma_pdata *data; + struct block_settings *blk_settings; +}; + +#endif /* _LINUX_MMC_DW_MMC_H_ */ -- cgit v1.2.3-71-gd317 From 93173054f2979de41b1912b19f0b57edfb35fcdc Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Wed, 22 Dec 2010 12:02:15 +0100 Subject: mmc: tmio_mmc: implement a bounce buffer for unaligned DMA For example, with SDIO WLAN cards, some transfers happen with buffers at odd addresses, whereas the SH-Mobile DMA engine requires even addresses for SDHI. 
This patch extends the tmio driver with a bounce buffer, that is used for single entry scatter-gather lists both for sending and receiving. If we ever encounter unaligned transfers with multi-element sg lists, this patch will have to be extended. For now it just falls back to PIO in this and other unsupported cases. Signed-off-by: Guennadi Liakhovetski Acked-by: Samuel Ortiz Signed-off-by: Chris Ball --- drivers/mmc/host/tmio_mmc.c | 89 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/mfd/tmio.h | 1 + 2 files changed, 84 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index e04c032abb1c..595b7b3f160d 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -111,6 +111,8 @@ sd_ctrl_write32((host), CTL_STATUS, ~(i)); \ } while (0) +/* This is arbitrary, just noone needed any higher alignment yet */ +#define MAX_ALIGN 4 struct tmio_mmc_host { void __iomem *ctl; @@ -127,6 +129,7 @@ struct tmio_mmc_host { /* pio related stuff */ struct scatterlist *sg_ptr; + struct scatterlist *sg_orig; unsigned int sg_len; unsigned int sg_off; @@ -139,9 +142,13 @@ struct tmio_mmc_host { struct tasklet_struct dma_issue; #ifdef CONFIG_TMIO_MMC_DMA unsigned int dma_sglen; + u8 bounce_buf[PAGE_CACHE_SIZE] __attribute__((aligned(MAX_ALIGN))); + struct scatterlist bounce_sg; #endif }; +static void tmio_check_bounce_buffer(struct tmio_mmc_host *host); + static u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr) { return readw(host->ctl + (addr << host->bus_shift)); @@ -180,6 +187,7 @@ static void tmio_mmc_init_sg(struct tmio_mmc_host *host, struct mmc_data *data) { host->sg_len = data->sg_len; host->sg_ptr = data->sg; + host->sg_orig = data->sg; host->sg_off = 0; } @@ -438,6 +446,8 @@ static void tmio_mmc_do_data_irq(struct tmio_mmc_host *host) if (data->flags & MMC_DATA_READ) { if (!host->chan_rx) disable_mmc_irqs(host, TMIO_MASK_READOP); + else + tmio_check_bounce_buffer(host); dev_dbg(&host->pdev->dev, "Complete Rx request %p\n", host->mrq); } else { @@ -529,8 +539,7 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host, if (!host->chan_rx) enable_mmc_irqs(host, TMIO_MASK_READOP); } else { - struct dma_chan *chan = host->chan_tx; - if (!chan) + if (!host->chan_tx) enable_mmc_irqs(host, TMIO_MASK_WRITEOP); else tasklet_schedule(&host->dma_issue); @@ -612,6 +621,16 @@ out: } #ifdef CONFIG_TMIO_MMC_DMA +static void tmio_check_bounce_buffer(struct tmio_mmc_host *host) +{ + if (host->sg_ptr == &host->bounce_sg) { + unsigned long flags; + void *sg_vaddr = tmio_mmc_kmap_atomic(host->sg_orig, &flags); + memcpy(sg_vaddr, host->bounce_buf, host->bounce_sg.length); + tmio_mmc_kunmap_atomic(sg_vaddr, &flags); + } +} + static void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) { #if defined(CONFIG_SUPERH) || defined(CONFIG_ARCH_SHMOBILE) @@ -634,11 +653,35 @@ static void tmio_dma_complete(void *arg) static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) { - struct scatterlist *sg = host->sg_ptr; + struct scatterlist *sg = host->sg_ptr, *sg_tmp; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_rx; + struct mfd_cell *cell = host->pdev->dev.platform_data; + struct tmio_mmc_data *pdata = cell->driver_data; dma_cookie_t cookie; - int ret; + int ret, i; + bool aligned = true, multiple = true; + unsigned int align = (1 << pdata->dma->alignment_shift) - 1; + + for_each_sg(sg, sg_tmp, host->sg_len, i) { + if (sg_tmp->offset & align) + aligned = false; + if 
(sg_tmp->length & align) { + multiple = false; + break; + } + } + + if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE || + align >= MAX_ALIGN)) || !multiple) + goto pio; + + /* The only sg element can be unaligned, use our bounce buffer then */ + if (!aligned) { + sg_init_one(&host->bounce_sg, host->bounce_buf, sg->length); + host->sg_ptr = &host->bounce_sg; + sg = host->sg_ptr; + } ret = dma_map_sg(&host->pdev->dev, sg, host->sg_len, DMA_FROM_DEVICE); if (ret > 0) { @@ -661,6 +704,7 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", __func__, host->sg_len, ret, cookie, host->mrq); +pio: if (!desc) { /* DMA failed, fall back to PIO */ if (ret >= 0) @@ -684,11 +728,39 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) { - struct scatterlist *sg = host->sg_ptr; + struct scatterlist *sg = host->sg_ptr, *sg_tmp; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_tx; + struct mfd_cell *cell = host->pdev->dev.platform_data; + struct tmio_mmc_data *pdata = cell->driver_data; dma_cookie_t cookie; - int ret; + int ret, i; + bool aligned = true, multiple = true; + unsigned int align = (1 << pdata->dma->alignment_shift) - 1; + + for_each_sg(sg, sg_tmp, host->sg_len, i) { + if (sg_tmp->offset & align) + aligned = false; + if (sg_tmp->length & align) { + multiple = false; + break; + } + } + + if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE || + align >= MAX_ALIGN)) || !multiple) + goto pio; + + /* The only sg element can be unaligned, use our bounce buffer then */ + if (!aligned) { + unsigned long flags; + void *sg_vaddr = tmio_mmc_kmap_atomic(sg, &flags); + sg_init_one(&host->bounce_sg, host->bounce_buf, sg->length); + memcpy(host->bounce_buf, sg_vaddr, host->bounce_sg.length); + tmio_mmc_kunmap_atomic(sg_vaddr, &flags); + host->sg_ptr = &host->bounce_sg; + sg = host->sg_ptr; + } ret = dma_map_sg(&host->pdev->dev, sg, host->sg_len, DMA_TO_DEVICE); if (ret > 0) { @@ -709,6 +781,7 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", __func__, host->sg_len, ret, cookie, host->mrq); +pio: if (!desc) { /* DMA failed, fall back to PIO */ if (ret >= 0) @@ -822,6 +895,10 @@ static void tmio_mmc_release_dma(struct tmio_mmc_host *host) } } #else +static void tmio_check_bounce_buffer(struct tmio_mmc_host *host) +{ +} + static void tmio_mmc_start_dma(struct tmio_mmc_host *host, struct mmc_data *data) { diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 085f041197dc..dbfc053b1f95 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -66,6 +66,7 @@ void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state); struct tmio_mmc_dma { void *chan_priv_tx; void *chan_priv_rx; + int alignment_shift; }; /* -- cgit v1.2.3-71-gd317 From 845ecd20239c28e97e766ff54078a58be19f3a91 Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Tue, 28 Dec 2010 23:22:31 +0100 Subject: mmc: tmio_mmc: implement SDIO IRQ support This patch implements SDIO IRQ support for mfds which announce the TMIO_MMC_SDIO_IRQ flag for tmio_mmc. If MMC_CAP_SDIO_IRQ is also set SDIO IRQ signalling is activated. Tested with a b43-based wireless SDIO card and sh_mobile_sdhi. 
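A minimal sketch of how a platform opts in to this, not taken from the series itself: the .flags bit is the one the driver checks (pdata->flags & TMIO_MMC_SDIO_IRQ in the hunks below), while the .capabilities field and the example values are assumptions added for illustration.

#include <linux/mfd/tmio.h>
#include <linux/mmc/host.h>

/*
 * Illustrative only: board/MFD platform data advertising SDIO IRQ support.
 * .flags is what tmio_mmc tests; routing MMC_CAP_SDIO_IRQ into mmc->caps
 * via .capabilities is an assumption, not something shown in this patch.
 */
static struct tmio_mmc_data example_sdhi_mmc_data = {
        .flags          = TMIO_MMC_SDIO_IRQ,    /* controller can raise SDIO IRQs */
        .capabilities   = MMC_CAP_SDIO_IRQ,     /* let the MMC core use them */
};

In practice the sh_mobile_sdhi glue mentioned in the test note above would be the natural place to announce such a flag.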
Signed-off-by: Arnd Hannemann Signed-off-by: Chris Ball --- drivers/mmc/host/tmio_mmc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/tmio.h | 4 +++ 2 files changed, 83 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index f442c8205b0a..81bed310ddcd 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -53,6 +53,8 @@ #define CTL_SD_ERROR_DETAIL_STATUS 0x2c #define CTL_SD_DATA_PORT 0x30 #define CTL_TRANSACTION_CTL 0x34 +#define CTL_SDIO_STATUS 0x36 +#define CTL_SDIO_IRQ_MASK 0x38 #define CTL_RESET_SD 0xe0 #define CTL_SDIO_REGS 0x100 #define CTL_CLK_AND_WAIT_CTL 0x138 @@ -81,6 +83,12 @@ #define TMIO_STAT_CMD_BUSY 0x40000000 #define TMIO_STAT_ILL_ACCESS 0x80000000 +/* Definitions for values the CTRL_SDIO_STATUS register can take. */ +#define TMIO_SDIO_STAT_IOIRQ 0x0001 +#define TMIO_SDIO_STAT_EXPUB52 0x4000 +#define TMIO_SDIO_STAT_EXWT 0x8000 +#define TMIO_SDIO_MASK_ALL 0xc007 + /* Define some IRQ masks */ /* This is the mask used at reset by the chip */ #define TMIO_MASK_ALL 0x837f031d @@ -122,6 +130,7 @@ struct tmio_mmc_host { struct mmc_data *data; struct mmc_host *mmc; int irq; + unsigned int sdio_irq_enabled; /* Callbacks for clock / power control */ void (*set_pwr)(struct platform_device *host, int state); @@ -249,6 +258,22 @@ void pr_debug_status(u32 status) #define pr_debug_status(s) do { } while (0) #endif +static void tmio_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable) +{ + struct tmio_mmc_host *host = mmc_priv(mmc); + + if (enable) { + host->sdio_irq_enabled = 1; + sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); + sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, + (TMIO_SDIO_MASK_ALL & ~TMIO_SDIO_STAT_IOIRQ)); + } else { + sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, TMIO_SDIO_MASK_ALL); + sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0000); + host->sdio_irq_enabled = 0; + } +} + static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock) { u32 clk = 0, clock; @@ -268,8 +293,23 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock) static void tmio_mmc_clk_stop(struct tmio_mmc_host *host) { + struct mfd_cell *cell = host->pdev->dev.platform_data; + struct tmio_mmc_data *pdata = cell->driver_data; + + /* + * Testing on sh-mobile showed that SDIO IRQs are unmasked when + * CTL_CLK_AND_WAIT_CTL gets written, so we have to disable the + * device IRQ here and restore the SDIO IRQ mask before + * re-enabling the device IRQ. 
+ */ + if (pdata->flags & TMIO_MMC_SDIO_IRQ) + disable_irq(host->irq); sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0000); msleep(10); + if (pdata->flags & TMIO_MMC_SDIO_IRQ) { + tmio_mmc_enable_sdio_irq(host->mmc, host->sdio_irq_enabled); + enable_irq(host->irq); + } sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, ~0x0100 & sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); msleep(10); @@ -277,11 +317,21 @@ static void tmio_mmc_clk_stop(struct tmio_mmc_host *host) static void tmio_mmc_clk_start(struct tmio_mmc_host *host) { + struct mfd_cell *cell = host->pdev->dev.platform_data; + struct tmio_mmc_data *pdata = cell->driver_data; + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, 0x0100 | sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); msleep(10); + /* see comment in tmio_mmc_clk_stop above */ + if (pdata->flags & TMIO_MMC_SDIO_IRQ) + disable_irq(host->irq); sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0100); msleep(10); + if (pdata->flags & TMIO_MMC_SDIO_IRQ) { + tmio_mmc_enable_sdio_irq(host->mmc, host->sdio_irq_enabled); + enable_irq(host->irq); + } } static void reset(struct tmio_mmc_host *host) @@ -554,7 +604,10 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host, static irqreturn_t tmio_mmc_irq(int irq, void *devid) { struct tmio_mmc_host *host = devid; + struct mfd_cell *cell = host->pdev->dev.platform_data; + struct tmio_mmc_data *pdata = cell->driver_data; unsigned int ireg, irq_mask, status; + unsigned int sdio_ireg, sdio_irq_mask, sdio_status; pr_debug("MMC IRQ begin\n"); @@ -562,6 +615,29 @@ static irqreturn_t tmio_mmc_irq(int irq, void *devid) irq_mask = sd_ctrl_read32(host, CTL_IRQ_MASK); ireg = status & TMIO_MASK_IRQ & ~irq_mask; + sdio_ireg = 0; + if (!ireg && pdata->flags & TMIO_MMC_SDIO_IRQ) { + sdio_status = sd_ctrl_read16(host, CTL_SDIO_STATUS); + sdio_irq_mask = sd_ctrl_read16(host, CTL_SDIO_IRQ_MASK); + sdio_ireg = sdio_status & TMIO_SDIO_MASK_ALL & ~sdio_irq_mask; + + sd_ctrl_write16(host, CTL_SDIO_STATUS, sdio_status & ~TMIO_SDIO_MASK_ALL); + + if (sdio_ireg && !host->sdio_irq_enabled) { + pr_warning("tmio_mmc: Spurious SDIO IRQ, disabling! 0x%04x 0x%04x 0x%04x\n", + sdio_status, sdio_irq_mask, sdio_ireg); + tmio_mmc_enable_sdio_irq(host->mmc, 0); + goto out; + } + + if (host->mmc->caps & MMC_CAP_SDIO_IRQ && + sdio_ireg & TMIO_SDIO_STAT_IOIRQ) + mmc_signal_sdio_irq(host->mmc); + + if (sdio_ireg) + goto out; + } + pr_debug_status(status); pr_debug_status(ireg); @@ -1047,6 +1123,7 @@ static const struct mmc_host_ops tmio_mmc_ops = { .set_ios = tmio_mmc_set_ios, .get_ro = tmio_mmc_get_ro, .get_cd = tmio_mmc_get_cd, + .enable_sdio_irq = tmio_mmc_enable_sdio_irq, }; #ifdef CONFIG_PM @@ -1162,6 +1239,8 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) goto cell_disable; disable_mmc_irqs(host, TMIO_MASK_ALL); + if (pdata->flags & TMIO_MMC_SDIO_IRQ) + tmio_mmc_enable_sdio_irq(mmc, 0); ret = request_irq(host->irq, tmio_mmc_irq, IRQF_DISABLED | IRQF_TRIGGER_FALLING, dev_name(&dev->dev), host); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index dbfc053b1f95..8e70310ee945 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -57,6 +57,10 @@ * is configured in 4-bit mode. */ #define TMIO_MMC_BLKSZ_2BYTES (1 << 1) +/* + * Some controllers can support SDIO IRQ signalling. 
+ */ +#define TMIO_MMC_SDIO_IRQ (1 << 2) int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); -- cgit v1.2.3-71-gd317 From 6099469805c24af14250e182bb9ca082b8a6b716 Mon Sep 17 00:00:00 2001 From: Roland Stigge Date: Sun, 9 Jan 2011 09:31:39 -0500 Subject: hwmon: Support for Dallas Semiconductor DS620 Driver for Dallas Semiconductor DS620 temperature sensor and thermostat Signed-off-by: Roland Stigge Signed-off-by: Guenter Roeck --- Documentation/hwmon/ds620 | 34 +++++ drivers/hwmon/Kconfig | 10 ++ drivers/hwmon/Makefile | 1 + drivers/hwmon/ds620.c | 337 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/i2c/ds620.h | 21 +++ 5 files changed, 403 insertions(+) create mode 100644 Documentation/hwmon/ds620 create mode 100644 drivers/hwmon/ds620.c create mode 100644 include/linux/i2c/ds620.h (limited to 'include/linux') diff --git a/Documentation/hwmon/ds620 b/Documentation/hwmon/ds620 new file mode 100644 index 000000000000..1fbe3cd916cc --- /dev/null +++ b/Documentation/hwmon/ds620 @@ -0,0 +1,34 @@ +Kernel driver ds620 +=================== + +Supported chips: + * Dallas Semiconductor DS620 + Prefix: 'ds620' + Datasheet: Publicly available at the Dallas Semiconductor website + http://www.dalsemi.com/ + +Authors: + Roland Stigge + based on ds1621.c by + Christian W. Zuckschwerdt + +Description +----------- + +The DS620 is a (one instance) digital thermometer and thermostat. It has both +high and low temperature limits which can be user defined (i.e. programmed +into non-volatile on-chip registers). Temperature range is -55 degree Celsius +to +125. Between 0 and 70 degree Celsius, accuracy is 0.5 Kelvin. The value +returned via sysfs displays post decimal positions. + +The thermostat function works as follows: When configured via platform_data +(struct ds620_platform_data) .pomode == 0 (default), the thermostat output pin +PO is always low. If .pomode == 1, the thermostat is in PO_LOW mode. I.e., the +output pin PO becomes active when the temperature falls below temp1_min and +stays active until the temperature goes above temp1_max. + +Likewise, with .pomode == 2, the thermostat is in PO_HIGH mode. I.e., the PO +output pin becomes active when the temperature goes above temp1_max and stays +active until the temperature falls below temp1_min. + +The PO output pin of the DS620 operates active-low. diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 0f09567257fd..bdc13d28b1ea 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -274,6 +274,16 @@ config SENSORS_ATXP1 This driver can also be built as a module. If so, the module will be called atxp1. +config SENSORS_DS620 + tristate "Dallas Semiconductor DS620" + depends on I2C + help + If you say yes here you get support for Dallas Semiconductor + DS620 sensor chip. + + This driver can also be built as a module. If so, the module + will be called ds620. 
+ config SENSORS_DS1621 tristate "Dallas Semiconductor DS1621 and DS1625" depends on I2C diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 20e74086e681..dde02d99c238 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_SENSORS_ATXP1) += atxp1.o obj-$(CONFIG_SENSORS_CORETEMP) += coretemp.o obj-$(CONFIG_SENSORS_PKGTEMP) += pkgtemp.o obj-$(CONFIG_SENSORS_DME1737) += dme1737.o +obj-$(CONFIG_SENSORS_DS620) += ds620.o obj-$(CONFIG_SENSORS_DS1621) += ds1621.o obj-$(CONFIG_SENSORS_EMC1403) += emc1403.o obj-$(CONFIG_SENSORS_EMC2103) += emc2103.o diff --git a/drivers/hwmon/ds620.c b/drivers/hwmon/ds620.c new file mode 100644 index 000000000000..257957c69d92 --- /dev/null +++ b/drivers/hwmon/ds620.c @@ -0,0 +1,337 @@ +/* + * ds620.c - Support for temperature sensor and thermostat DS620 + * + * Copyright (C) 2010, 2011 Roland Stigge + * + * based on ds1621.c by Christian W. Zuckschwerdt + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Many DS620 constants specified below + * 15 14 13 12 11 10 09 08 + * |Done|NVB |THF |TLF |R1 |R0 |AUTOC|1SHOT| + * + * 07 06 05 04 03 02 01 00 + * |PO2 |PO1 |A2 |A1 |A0 | | | | + */ +#define DS620_REG_CONFIG_DONE 0x8000 +#define DS620_REG_CONFIG_NVB 0x4000 +#define DS620_REG_CONFIG_THF 0x2000 +#define DS620_REG_CONFIG_TLF 0x1000 +#define DS620_REG_CONFIG_R1 0x0800 +#define DS620_REG_CONFIG_R0 0x0400 +#define DS620_REG_CONFIG_AUTOC 0x0200 +#define DS620_REG_CONFIG_1SHOT 0x0100 +#define DS620_REG_CONFIG_PO2 0x0080 +#define DS620_REG_CONFIG_PO1 0x0040 +#define DS620_REG_CONFIG_A2 0x0020 +#define DS620_REG_CONFIG_A1 0x0010 +#define DS620_REG_CONFIG_A0 0x0008 + +/* The DS620 registers */ +static const u8 DS620_REG_TEMP[3] = { + 0xAA, /* input, word, RO */ + 0xA2, /* min, word, RW */ + 0xA0, /* max, word, RW */ +}; + +#define DS620_REG_CONF 0xAC /* word, RW */ +#define DS620_COM_START 0x51 /* no data */ +#define DS620_COM_STOP 0x22 /* no data */ + +/* Each client has this additional data */ +struct ds620_data { + struct device *hwmon_dev; + struct mutex update_lock; + char valid; /* !=0 if following fields are valid */ + unsigned long last_updated; /* In jiffies */ + + u16 temp[3]; /* Register values, word */ +}; + +/* + * Temperature registers are word-sized. + * DS620 uses a high-byte first convention, which is exactly opposite to + * the SMBus standard. 
+ */ +static int ds620_read_temp(struct i2c_client *client, u8 reg) +{ + int ret; + + ret = i2c_smbus_read_word_data(client, reg); + if (ret < 0) + return ret; + return swab16(ret); +} + +static int ds620_write_temp(struct i2c_client *client, u8 reg, u16 value) +{ + return i2c_smbus_write_word_data(client, reg, swab16(value)); +} + +static void ds620_init_client(struct i2c_client *client) +{ + struct ds620_platform_data *ds620_info = client->dev.platform_data; + u16 conf, new_conf; + + new_conf = conf = + swab16(i2c_smbus_read_word_data(client, DS620_REG_CONF)); + + /* switch to continuous conversion mode */ + new_conf &= ~DS620_REG_CONFIG_1SHOT; + /* already high at power-on, but don't trust the BIOS! */ + new_conf |= DS620_REG_CONFIG_PO2; + /* thermostat mode according to platform data */ + if (ds620_info && ds620_info->pomode == 1) + new_conf &= ~DS620_REG_CONFIG_PO1; /* PO_LOW */ + else if (ds620_info && ds620_info->pomode == 2) + new_conf |= DS620_REG_CONFIG_PO1; /* PO_HIGH */ + else + new_conf &= ~DS620_REG_CONFIG_PO2; /* always low */ + /* with highest precision */ + new_conf |= DS620_REG_CONFIG_R1 | DS620_REG_CONFIG_R0; + + if (conf != new_conf) + i2c_smbus_write_word_data(client, DS620_REG_CONF, + swab16(new_conf)); + + /* start conversion */ + i2c_smbus_write_byte(client, DS620_COM_START); +} + +static struct ds620_data *ds620_update_client(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ds620_data *data = i2c_get_clientdata(client); + struct ds620_data *ret = data; + + mutex_lock(&data->update_lock); + + if (time_after(jiffies, data->last_updated + HZ + HZ / 2) + || !data->valid) { + int i; + int res; + + dev_dbg(&client->dev, "Starting ds620 update\n"); + + for (i = 0; i < ARRAY_SIZE(data->temp); i++) { + res = ds620_read_temp(client, + DS620_REG_TEMP[i]); + if (res < 0) { + ret = ERR_PTR(res); + goto abort; + } + + data->temp[i] = res; + } + + data->last_updated = jiffies; + data->valid = 1; + } +abort: + mutex_unlock(&data->update_lock); + + return ret; +} + +static ssize_t show_temp(struct device *dev, struct device_attribute *da, + char *buf) +{ + struct sensor_device_attribute *attr = to_sensor_dev_attr(da); + struct ds620_data *data = ds620_update_client(dev); + + if (IS_ERR(data)) + return PTR_ERR(data); + + return sprintf(buf, "%d\n", ((data->temp[attr->index] / 8) * 625) / 10); +} + +static ssize_t set_temp(struct device *dev, struct device_attribute *da, + const char *buf, size_t count) +{ + int res; + long val; + + struct sensor_device_attribute *attr = to_sensor_dev_attr(da); + struct i2c_client *client = to_i2c_client(dev); + struct ds620_data *data = i2c_get_clientdata(client); + + res = strict_strtol(buf, 10, &val); + + if (res) + return res; + + val = (val * 10 / 625) * 8; + + mutex_lock(&data->update_lock); + data->temp[attr->index] = val; + ds620_write_temp(client, DS620_REG_TEMP[attr->index], + data->temp[attr->index]); + mutex_unlock(&data->update_lock); + return count; +} + +static ssize_t show_alarm(struct device *dev, struct device_attribute *da, + char *buf) +{ + struct sensor_device_attribute *attr = to_sensor_dev_attr(da); + struct ds620_data *data = ds620_update_client(dev); + struct i2c_client *client = to_i2c_client(dev); + u16 conf, new_conf; + int res; + + if (IS_ERR(data)) + return PTR_ERR(data); + + /* reset alarms if necessary */ + res = i2c_smbus_read_word_data(client, DS620_REG_CONF); + if (res < 0) + return res; + + conf = swab16(res); + new_conf = conf; + new_conf &= ~attr->index; + if (conf != new_conf) { 
+ res = i2c_smbus_write_word_data(client, DS620_REG_CONF, + swab16(new_conf)); + if (res < 0) + return res; + } + + return sprintf(buf, "%d\n", !!(conf & attr->index)); +} + +static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, show_temp, NULL, 0); +static SENSOR_DEVICE_ATTR(temp1_min, S_IWUSR | S_IRUGO, show_temp, set_temp, 1); +static SENSOR_DEVICE_ATTR(temp1_max, S_IWUSR | S_IRUGO, show_temp, set_temp, 2); +static SENSOR_DEVICE_ATTR(temp1_min_alarm, S_IRUGO, show_alarm, NULL, + DS620_REG_CONFIG_TLF); +static SENSOR_DEVICE_ATTR(temp1_max_alarm, S_IRUGO, show_alarm, NULL, + DS620_REG_CONFIG_THF); + +static struct attribute *ds620_attributes[] = { + &sensor_dev_attr_temp1_input.dev_attr.attr, + &sensor_dev_attr_temp1_min.dev_attr.attr, + &sensor_dev_attr_temp1_max.dev_attr.attr, + &sensor_dev_attr_temp1_min_alarm.dev_attr.attr, + &sensor_dev_attr_temp1_max_alarm.dev_attr.attr, + NULL +}; + +static const struct attribute_group ds620_group = { + .attrs = ds620_attributes, +}; + +static int ds620_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ds620_data *data; + int err; + + data = kzalloc(sizeof(struct ds620_data), GFP_KERNEL); + if (!data) { + err = -ENOMEM; + goto exit; + } + + i2c_set_clientdata(client, data); + mutex_init(&data->update_lock); + + /* Initialize the DS620 chip */ + ds620_init_client(client); + + /* Register sysfs hooks */ + err = sysfs_create_group(&client->dev.kobj, &ds620_group); + if (err) + goto exit_free; + + data->hwmon_dev = hwmon_device_register(&client->dev); + if (IS_ERR(data->hwmon_dev)) { + err = PTR_ERR(data->hwmon_dev); + goto exit_remove_files; + } + + dev_info(&client->dev, "temperature sensor found\n"); + + return 0; + +exit_remove_files: + sysfs_remove_group(&client->dev.kobj, &ds620_group); +exit_free: + kfree(data); +exit: + return err; +} + +static int ds620_remove(struct i2c_client *client) +{ + struct ds620_data *data = i2c_get_clientdata(client); + + hwmon_device_unregister(data->hwmon_dev); + sysfs_remove_group(&client->dev.kobj, &ds620_group); + + kfree(data); + + return 0; +} + +static const struct i2c_device_id ds620_id[] = { + {"ds620", 0}, + {} +}; + +MODULE_DEVICE_TABLE(i2c, ds620_id); + +/* This is the driver that will be inserted */ +static struct i2c_driver ds620_driver = { + .class = I2C_CLASS_HWMON, + .driver = { + .name = "ds620", + }, + .probe = ds620_probe, + .remove = ds620_remove, + .id_table = ds620_id, +}; + +static int __init ds620_init(void) +{ + return i2c_add_driver(&ds620_driver); +} + +static void __exit ds620_exit(void) +{ + i2c_del_driver(&ds620_driver); +} + +MODULE_AUTHOR("Roland Stigge "); +MODULE_DESCRIPTION("DS620 driver"); +MODULE_LICENSE("GPL"); + +module_init(ds620_init); +module_exit(ds620_exit); diff --git a/include/linux/i2c/ds620.h b/include/linux/i2c/ds620.h new file mode 100644 index 000000000000..736bb87ac0fc --- /dev/null +++ b/include/linux/i2c/ds620.h @@ -0,0 +1,21 @@ +#ifndef _LINUX_DS620_H +#define _LINUX_DS620_H + +#include +#include + +/* platform data for the DS620 temperature sensor and thermostat */ + +struct ds620_platform_data { + /* + * Thermostat output pin PO mode: + * 0 = always low (default) + * 1 = PO_LOW + * 2 = PO_HIGH + * + * (see Documentation/hwmon/ds620) + */ + int pomode; +}; + +#endif /* _LINUX_DS620_H */ -- cgit v1.2.3-71-gd317 From 2a8c0c68487a68441e701f493f43fd547d87c8df Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 8 Jan 2011 19:37:20 -0800 Subject: fs: fix dcache.h kernel-doc notation Fix new kernel-doc notation warning in dcache.h: 
Warning(include/linux/dcache.h:316): Excess function parameter 'Returns' description in '__d_rcu_to_refcount' Signed-off-by: Randy Dunlap Cc: Nick Piggin Signed-off-by: Linus Torvalds --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bd07758943e0..59fcd24b1468 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -307,7 +307,7 @@ extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok * @dentry: dentry to take a ref on * @seq: seqcount to verify against - * @Returns: 0 on failure, else 1. + * Returns: 0 on failure, else 1. * * __d_rcu_to_refcount operates on a dentry,seq pair that was returned * by __d_lookup_rcu, to get a reference on an rcu-walk dentry. -- cgit v1.2.3-71-gd317 From 175881db8916a5f5cdf920d32214caef588870fd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 8 Jan 2011 19:38:02 -0800 Subject: hrtimer.h: fix kernel-doc warning Fix new kernel-doc notation warning in hrtimer.h: Warning(include/linux/hrtimer.h:150): Excess struct/union/enum/typedef member 'first' description in 'hrtimer_clock_base' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- include/linux/hrtimer.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 330586ffffbb..f376ddc64c4d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -131,7 +131,6 @@ struct hrtimer_sleeper { * @index: clock type index for per_cpu support when moving a * timer to a base on another cpu. * @active: red black tree root node for the active timers - * @first: pointer to the timer node which expires first * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock * @softirq_time: the time when running the hrtimer queue in the softirq -- cgit v1.2.3-71-gd317 From 0dc1488527a3c01383a50e5df7187219567586a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 8 Jan 2011 19:40:33 -0800 Subject: pipe_fs_i.h: fix kernel-doc warning Fix kernel-doc notation warnings in pipe_fs_i.h: Warning(include/linux/pipe_fs_i.h:58): No description found for parameter 'buffers' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index bb27d7ec2fb9..77257c92155a 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -30,6 +30,7 @@ struct pipe_buffer { * struct pipe_inode_info - a linux kernel pipe * @wait: reader/writer wait point in case of empty/full pipe * @nrbufs: the number of non-empty pipe buffers in this pipe + * @buffers: total number of buffers (should be a power of 2) * @curbuf: the current pipe buffer entry * @tmp_page: cached released page * @readers: number of current readers of this pipe -- cgit v1.2.3-71-gd317 From 37721e1b0cf98cb65895f234d8c500d270546529 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Jan 2011 08:17:10 +0200 Subject: headers: path.h redux Remove path.h from sched.h and other files. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- include/linux/audit.h | 1 + include/linux/dcookies.h | 2 +- include/linux/sched.h | 1 - security/apparmor/include/file.h | 3 +-- security/selinux/include/avc.h | 1 - 5 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8b5c0620abf9..359df0487690 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -372,6 +372,7 @@ struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; +struct path; struct linux_binprm; struct mq_attr; struct mqstat; diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h index 24c806f12a6c..5ac3bdd5cee6 100644 --- a/include/linux/dcookies.h +++ b/include/linux/dcookies.h @@ -13,10 +13,10 @@ #ifdef CONFIG_PROFILING #include -#include #include struct dcookie_user; +struct path; /** * dcookie_register - register a user of dcookies diff --git a/include/linux/sched.h b/include/linux/sched.h index 341acbbc434a..c118a7f203aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -70,7 +70,6 @@ struct sched_param { #include #include #include -#include #include #include #include diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index be36feabb16a..ab8c6d87f758 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -15,12 +15,11 @@ #ifndef __AA_FILE_H #define __AA_FILE_H -#include - #include "domain.h" #include "match.h" struct aa_profile; +struct path; /* * We use MAY_EXEC, MAY_WRITE, MAY_READ, MAY_APPEND and the following flags diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index e94e82f73818..5615081b73ec 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "flask.h" #include "av_permissions.h" -- cgit v1.2.3-71-gd317 From 57cc7215b70856dc6bae8e55b00ecd7b1d7429b1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Jan 2011 08:18:25 +0200 Subject: headers: kobject.h redux Remove kobject.h from files which don't need it, notably, sched.h and fs.h. 
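Both header-redux commits rely on the same C idiom: when a header only ever refers to a type through pointers, a forward declaration is enough and the #include can go. A minimal sketch with made-up file and function names:

/* example_logger.h: illustrative only, not a file from this series */
#ifndef _EXAMPLE_LOGGER_H
#define _EXAMPLE_LOGGER_H

struct path;    /* incomplete type is fine for pointer parameters */

int example_log_path(const struct path *p, unsigned int flags);

#endif /* _EXAMPLE_LOGGER_H */

Only the .c files that actually dereference struct path include the real header, which is what the audit.h and dcookies.h hunks above do with the bare "struct path;" line; the usual payoff is that touching path.h or kobject.h no longer forces a rebuild of everything that includes sched.h or fs.h.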
Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- arch/arm/plat-s3c24xx/cpu-freq.c | 1 - arch/powerpc/platforms/pseries/eeh_sysfs.c | 1 - drivers/char/snsc.h | 1 - drivers/net/bonding/bonding.h | 1 - drivers/pci/hotplug/acpiphp.h | 1 - drivers/pci/hotplug/rpaphp_slot.c | 1 - drivers/s390/char/tape_class.h | 1 - drivers/sh/clk/core.c | 1 - drivers/usb/musb/musb_debugfs.c | 1 - fs/gfs2/incore.h | 1 + fs/nilfs2/super.c | 1 - fs/sysfs/inode.c | 1 + fs/sysfs/sysfs.h | 1 + include/linux/firmware-map.h | 1 - include/linux/fs.h | 2 +- include/linux/sched.h | 1 - include/linux/sunrpc/cache.h | 1 + security/apparmor/include/match.h | 1 + 18 files changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/plat-s3c24xx/cpu-freq.c b/arch/arm/plat-s3c24xx/cpu-freq.c index 1ecc15bfe9d4..25a8fc7f512e 100644 --- a/arch/arm/plat-s3c24xx/cpu-freq.c +++ b/arch/arm/plat-s3c24xx/cpu-freq.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c index 15e13b568904..23982c7892d2 100644 --- a/arch/powerpc/platforms/pseries/eeh_sysfs.c +++ b/arch/powerpc/platforms/pseries/eeh_sysfs.c @@ -25,7 +25,6 @@ #include #include #include -#include /** * EEH_SHOW_ATTR -- create sysfs entry for eeh statistic diff --git a/drivers/char/snsc.h b/drivers/char/snsc.h index 4be62eda9fbc..e8c52c882b21 100644 --- a/drivers/char/snsc.h +++ b/drivers/char/snsc.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 4da384cc7603..31fe980e4e28 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include "bond_3ad.h" diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h index bab52047baa8..7722108e78df 100644 --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@ -36,7 +36,6 @@ #define _ACPIPHP_H #include -#include #include #include diff --git a/drivers/pci/hotplug/rpaphp_slot.c b/drivers/pci/hotplug/rpaphp_slot.c index 2ea9cf1a8d02..b283bbea6d24 100644 --- a/drivers/pci/hotplug/rpaphp_slot.c +++ b/drivers/pci/hotplug/rpaphp_slot.c @@ -24,7 +24,6 @@ */ #include #include -#include #include #include #include diff --git a/drivers/s390/char/tape_class.h b/drivers/s390/char/tape_class.h index 707b7f48c232..9e32780c317f 100644 --- a/drivers/s390/char/tape_class.h +++ b/drivers/s390/char/tape_class.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/drivers/sh/clk/core.c b/drivers/sh/clk/core.c index 3f5e387ed564..5f63c3b83828 100644 --- a/drivers/sh/clk/core.c +++ b/drivers/sh/clk/core.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/musb/musb_debugfs.c b/drivers/usb/musb/musb_debugfs.c index 9e8639d4e862..b0176e4569e0 100644 --- a/drivers/usb/musb/musb_debugfs.c +++ b/drivers/usb/musb/musb_debugfs.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8d3d2b4a0a7d..a79790c06275 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -11,6 +11,7 @@ #define __INCORE_DOT_H__ #include +#include #include #include #include diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6ea32d9b1b9d..70dfdd532b83 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ 
-47,7 +47,6 @@ #include #include #include -#include #include #include #include "nilfs.h" diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 30ac27345586..0a12eb89cd32 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include "sysfs.h" diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ffaaa816bfba..3d28af31d863 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -9,6 +9,7 @@ */ #include +#include #include struct sysfs_open_dirent; diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index c6dcc1dfe781..43fe52fcef0f 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h @@ -17,7 +17,6 @@ #define _LINUX_FIRMWARE_MAP_H #include -#include /* * provide a dummy interface if CONFIG_FIRMWARE_MEMMAP is disabled diff --git a/include/linux/fs.h b/include/linux/fs.h index baf3e556ff0e..f84d9928bdb1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -382,7 +382,6 @@ struct inodes_stat_t { #include #include #include -#include #include #include #include @@ -402,6 +401,7 @@ struct hd_geometry; struct iovec; struct nameidata; struct kiocb; +struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; diff --git a/include/linux/sched.h b/include/linux/sched.h index c118a7f203aa..abc527aa8550 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -87,7 +87,6 @@ struct sched_param { #include #include #include -#include #include #include diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 6950c981882d..78aa104250b7 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -13,6 +13,7 @@ #ifndef _LINUX_SUNRPC_CACHE_H_ #define _LINUX_SUNRPC_CACHE_H_ +#include #include #include #include diff --git a/security/apparmor/include/match.h b/security/apparmor/include/match.h index 734a6d35112c..19ba16e8aacd 100644 --- a/security/apparmor/include/match.h +++ b/security/apparmor/include/match.h @@ -15,6 +15,7 @@ #ifndef __AA_MATCH_H #define __AA_MATCH_H +#include #include #define DFA_NOMATCH 0 -- cgit v1.2.3-71-gd317 From 1a7d946993aaf2a79e9c65abbe169a108e351bcb Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Sat, 8 Jan 2011 18:47:29 +0900 Subject: sony-laptop: support new hotkeys on the P, Z and EC series Add new mappings for assist, VAIO, zoom and eject buttons present on refurbished P, Z and EC models. 
Reported-by: Gyorgy Jeney Reported-by: Stephan Mueller Cc: Dmitry Torokhov Cc: Matthew Garrett Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 10 ++++++++++ include/linux/sonypi.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 68edc97f1926..b4a95bb2f232 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -235,6 +235,7 @@ static int sony_laptop_input_index[] = { 57, /* 70 SONYPI_EVENT_VOLUME_DEC_PRESSED */ -1, /* 71 SONYPI_EVENT_BRIGHTNESS_PRESSED */ 58, /* 72 SONYPI_EVENT_MEDIA_PRESSED */ + 59, /* 72 SONYPI_EVENT_VENDOR_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -297,6 +298,7 @@ static int sony_laptop_input_keycode_map[] = { KEY_VOLUMEUP, /* 56 SONYPI_EVENT_VOLUME_INC_PRESSED */ KEY_VOLUMEDOWN, /* 57 SONYPI_EVENT_VOLUME_DEC_PRESSED */ KEY_MEDIA, /* 58 SONYPI_EVENT_MEDIA_PRESSED */ + KEY_VENDOR, /* 59 SONYPI_EVENT_VENDOR_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -894,10 +896,18 @@ static struct sony_nc_event sony_100_events[] = { { 0x0A, SONYPI_EVENT_FNKEY_RELEASED }, { 0x8C, SONYPI_EVENT_FNKEY_F12 }, { 0x0C, SONYPI_EVENT_FNKEY_RELEASED }, + { 0x9d, SONYPI_EVENT_ZOOM_PRESSED }, + { 0x1d, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0x9f, SONYPI_EVENT_CD_EJECT_PRESSED }, { 0x1f, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0xa1, SONYPI_EVENT_MEDIA_PRESSED }, { 0x21, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0xa4, SONYPI_EVENT_CD_EJECT_PRESSED }, + { 0x24, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0xa5, SONYPI_EVENT_VENDOR_PRESSED }, + { 0x25, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0xa6, SONYPI_EVENT_HELP_PRESSED }, + { 0x26, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0, 0 }, }; diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index 4f95c1aac2fd..0e6dc3891942 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -112,6 +112,7 @@ #define SONYPI_EVENT_VOLUME_DEC_PRESSED 70 #define SONYPI_EVENT_BRIGHTNESS_PRESSED 71 #define SONYPI_EVENT_MEDIA_PRESSED 72 +#define SONYPI_EVENT_VENDOR_PRESSED 73 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3-71-gd317 From 8aefcd557d26d0023a36f9ec5afbf55e59f8f26b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Jan 2011 12:29:43 -0500 Subject: ext4: dynamically allocate the jbd2_inode in ext4_inode_info as necessary Replace the jbd2_inode structure (which is 48 bytes) with a pointer and only allocate the jbd2_inode when it is needed --- that is, when the file system has a journal present and the inode has been opened for writing. This allows us to further slim down the ext4_inode_info structure. 
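The mechanically interesting part is attaching that pointer exactly once on the first open for writing. The fs/ext4/file.c hunk below allocates before taking i_lock (a GFP_KERNEL allocation may sleep, so it cannot happen under the spinlock), publishes the pointer under the lock, and frees the spare copy if another opener won the race. A condensed sketch of that idiom, using the jbd2_alloc_inode()/jbd2_free_inode() helpers called by the hunk below; the function and parameter names here are illustrative, not the actual ext4 symbols:

#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/jbd2.h>
#include <linux/spinlock.h>

/* Illustrative helper, not an ext4 symbol: attach a jbd2_inode at most once. */
static int example_attach_jinode(struct inode *inode, struct jbd2_inode **slot)
{
        struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);

        spin_lock(&inode->i_lock);
        if (!*slot) {
                if (!jinode) {
                        spin_unlock(&inode->i_lock);
                        return -ENOMEM;         /* allocation failed, nothing attached */
                }
                jbd2_journal_init_jbd_inode(jinode, inode);
                *slot = jinode;                 /* published while holding i_lock */
                jinode = NULL;
        }
        spin_unlock(&inode->i_lock);
        if (jinode)                             /* lost the race, free the spare */
                jbd2_free_inode(jinode);
        return 0;
}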
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_jbd2.h | 2 +- fs/ext4/file.c | 22 ++++++++++++++++++++++ fs/ext4/inode.c | 17 ++++++++++++----- fs/ext4/super.c | 16 +++++++--------- fs/jbd2/journal.c | 20 +++++++++++++------- include/linux/jbd2.h | 20 ++++++++++++++++++-- 7 files changed, 74 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2fb531cfd48b..32b7daa41a42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -811,7 +811,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; - struct jbd2_inode jinode; + struct jbd2_inode *jinode; struct ext4_ext_cache i_cached_extent; /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b0bd792c58c5..d8b992e658c1 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal) static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); return 0; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5a5c55ddceef..bb003dc9ffff 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_mark_super_dirty(sb); } } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0801ee6a173e..2693fcda30d8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -55,10 +55,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); - return jbd2_journal_begin_ordered_truncate( - EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode, - new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. 
+ */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); @@ -4054,7 +4061,7 @@ int ext4_block_truncate_page(handle_t *handle, if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); } else { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f5960d673e4e..1cd4326c530b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -818,12 +818,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - /* - * Note: We can be called before EXT4_SB(sb)->s_journal is set, - * therefore it can be null here. Don't check it, just initialize - * jinode. - */ - jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -832,6 +826,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif + ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; @@ -900,9 +895,12 @@ void ext4_clear_inode(struct inode *inode) end_writeback(inode); dquot_drop(inode); ext4_discard_preallocations(inode); - if (EXT4_JOURNAL(inode)) - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } } static inline void ext4_show_quota_options(struct seq_file *seq, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2447bd86f801..9e4686900f18 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -94,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2286,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) #endif -struct kmem_cache *jbd2_handle_cache; +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; static int __init journal_init_handle_cache(void) { - jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", - sizeof(handle_t), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if (jbd2_handle_cache == NULL) { - printk(KERN_EMERG "JBD: failed to create handle cache\n"); + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (jbd2_inode_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create inode cache\n"); + kmem_cache_destroy(jbd2_handle_cache); return -ENOMEM; } return 0; @@ -2306,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void) { if (jbd2_handle_cache) kmem_cache_destroy(jbd2_handle_cache); + if (jbd2_inode_cache) + 
kmem_cache_destroy(jbd2_inode_cache); + } /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2ae86aa21fce..27e79c27ba08 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -94,7 +94,7 @@ extern void jbd2_free(void *ptr, size_t size); * * This is an opaque datatype. **/ -typedef struct handle_s handle_t; /* Atomic operation type */ +typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** @@ -416,7 +416,7 @@ struct jbd2_revoke_table_s; * in so it can be fixed later. */ -struct handle_s +struct jbd2_journal_handle { /* Which compound transaction is this update a part of? */ transaction_t *h_transaction; @@ -1158,6 +1158,22 @@ static inline void jbd2_free_handle(handle_t *handle) kmem_cache_free(jbd2_handle_cache, handle); } +/* + * jbd2_inode management (optional, for those file systems that want to use + * dynamically allocated jbd2_inode structures) + */ +extern struct kmem_cache *jbd2_inode_cache; + +static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags) +{ + return kmem_cache_alloc(jbd2_inode_cache, gfp_flags); +} + +static inline void jbd2_free_inode(struct jbd2_inode *jinode) +{ + kmem_cache_free(jbd2_inode_cache, jinode); +} + /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); -- cgit v1.2.3-71-gd317 From b853b96b1dbdc05fc8eae141a595366d8172962b Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 22 Nov 2010 12:29:17 +0100 Subject: ext3: Add batched discard support for ext3 Walk through allocation groups and trim all free extents. It can be invoked through FITRIM ioctl on the file system. The main idea is to provide a way to trim the whole file system if needed, since some SSD's may suffer from performance loss after the whole device was filled (it does not mean that fs is full!). It search for free extents in allocation groups specified by Byte range start -> start+len. When the free extent is within this range, blocks are marked as used and then trimmed. Afterwards these blocks are marked as free in per-group bitmap. [JK: Fixed up error handling and trimming of a single group] Signed-off-by: Lukas Czerner Reviewed-by: Jan Kara Reviewed-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/ext3_fs.h | 1 + 2 files changed, 267 insertions(+) (limited to 'include/linux') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index b3db22649426..045995c8ce5a 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * balloc.c contains the blocks allocation and deallocation routines @@ -39,6 +40,21 @@ #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + /** * ext3_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) return ext3_bg_num_gdb_meta(sb,group); } + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. 
group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. + */ +ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) +{ + handle_t *handle; + ext3_grpblk_t next, free_blocks, bit, freed, count = 0; + ext3_fsblk_t discard_block; + struct ext3_sb_info *sbi; + struct buffer_head *gdp_bh, *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int err = 0, ret = 0; + + /* + * We will update one block bitmap, and one group descriptor + */ + handle = ext3_journal_start_sb(sb, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bitmap_bh = read_block_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto err_out; + + gdp = ext3_get_group_desc(sb, group, &gdp_bh); + if (!gdp) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto err_out; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + sbi = EXT3_SB(sb); + + /* Walk through the whole group */ + while (start < max) { + start = bitmap_search_next_usable_block(start, bitmap_bh, max); + if (start < 0) + break; + next = start; + + /* + * Allocate contiguous free extents by setting bits in the + * block bitmap + */ + while (next < max + && claim_block(sb_bgl_lock(sbi, group), + next, bitmap_bh)) { + next++; + } + + /* We did not claim any blocks */ + if (next == start) + continue; + + discard_block = (ext3_fsblk_t)start + + ext3_group_first_block_no(sb, group); + + /* Update counters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, start - next); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + + /* Do not issue a TRIM on extents smaller than minblocks */ + if ((next - start) < minblocks) + goto free_extent; + + /* Send the TRIM command down to the device */ + err = sb_issue_discard(sb, discard_block, next - start, + GFP_NOFS, 0); + count += (next - start); +free_extent: + freed = 0; + + /* + * Clear bits in the bitmap + */ + for (bit = start; bit < next; bit++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), + bit, bitmap_bh->b_data)) { + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + (unsigned long)bit); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + freed++; + } + } + + /* Update couters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, freed); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + + start = next; + if (err < 0) { + if (err != -EOPNOTSUPP) + ext3_warning(sb, __func__, "Discard command " + "returned error %d\n", err); + break; + } + + if 
(fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + cond_resched(); + + /* No more suitable extents */ + if ((free_blocks - count) < minblocks) + break; + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + ret = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = ret; + + /* And the group descriptor block */ + BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!err) + err = ret; + + ext3_debug("trimmed %d blocks in the group %d\n", + count, group); + +err_out: + if (err) + count = err; + ext3_journal_stop(handle); + brelse(bitmap_bh); + + return count; +} + +/** + * ext3_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @start: First Byte to trim + * @len: number of Bytes to trim from start + * @minlen: minimum extent length in Bytes + * + * ext3_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext3_trim_all_free function + * is invoked to trim all free space. + */ +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + ext3_grpblk_t last_block, first_block, free_blocks; + unsigned long first_group, last_group; + unsigned long group, ngroups; + struct ext3_group_desc *gdp; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + uint64_t start, len, minlen, trimmed; + ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; + + if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + if (start >= max_blks) + goto out; + if (start < le32_to_cpu(es->s_first_data_block)) { + len -= le32_to_cpu(es->s_first_data_block) - start; + start = le32_to_cpu(es->s_first_data_block); + } + if (start + len > max_blks) + len = max_blks - start; + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* Determine first and last group to examine based on start and len */ + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, + &first_group, &first_block); + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? 
ngroups - 1 : last_group; + last_block = EXT3_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + gdp = ext3_get_group_desc(sb, group, NULL); + if (!gdp) + break; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks < minlen) + continue; + + if (len >= EXT3_BLOCKS_PER_GROUP(sb)) + len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); + else + last_block = first_block + len; + + ret = ext3_trim_all_free(sb, group, first_block, + last_block, minlen); + if (ret < 0) + break; + + trimmed += ret; + first_block = 0; + } + + if (ret >= 0) + ret = 0; + +out: + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 6ce1bca01724..a443965946bb 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -856,6 +856,7 @@ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); extern void ext3_init_block_alloc_info(struct inode *); extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); +extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range); /* dir.c */ extern int ext3_check_dir_entry(const char *, struct inode *, -- cgit v1.2.3-71-gd317 From 055adcbd7da75868697e767adc4f3272f6cae76c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 23 Nov 2010 18:49:54 -0800 Subject: quota: Use %pV and __attribute__((format (printf in __quota_error and fix fallout Use %pV in __quota_error so a single printk can not be interleaved with other logging messages. Add __attribute__((format (printf, 3, 4))) so format and arguments can be verified by compiler. Make sure printk formats and arguments match. Block # needed a pointer dereference. Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/quota/dquot.c | 18 +++++++++++------- fs/quota/quota_tree.c | 9 +++++---- include/linux/quotaops.h | 5 +++-- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0fed41e6efcd..84becd3e4772 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); void __quota_error(struct super_block *sb, const char *func, - const char *fmt, ...) + const char *fmt, ...) 
{ - va_list args; - if (printk_ratelimit()) { + va_list args; + struct va_format vaf; + va_start(args, fmt); - printk(KERN_ERR "Quota error (device %s): %s: ", - sb->s_id, func); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "Quota error (device %s): %s: %pV\n", + sb->s_id, func, &vaf); + va_end(args); } } diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 9e48874eabcc..e41c1becf096 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { - quota_error(dquot->dq_sb, "Can't read quota data " - "block %u", blk); + quota_error(dquot->dq_sb, "Can't read quota data block %u", + *blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = write_blk(info, *blk, buf); if (ret < 0) - quota_error(dquot->dq_sb, "Can't write quota " - "tree block %u", blk); + quota_error(dquot->dq_sb, + "Can't write quota tree block %u", + *blk); } } out_buf: diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index d1a9193960f1..223b14cd129c 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -31,8 +31,9 @@ static inline bool is_quota_modification(struct inode *inode, struct iattr *ia) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) -extern void __quota_error(struct super_block *sb, const char *func, - const char *fmt, ...); +extern __attribute__((format (printf, 3, 4))) +void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...); /* * declaration of quota_function calls in kernel. -- cgit v1.2.3-71-gd317 From a4ae3094869f18e26ece25ad175bbe4cd740e60b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 7 Dec 2010 11:55:27 -0600 Subject: ext3: speed up file creates by optimizing rec_len functions The addition of 64k block capability in the rec_len_from_disk and rec_len_to_disk functions added a bit of math overhead which slows down file create workloads needlessly when the architecture cannot even support 64k blocks, thanks to page size limits. Similar changes already exist in the ext4 codebase. The directory entry checking can also be optimized a bit by sprinkling in some unlikely() conditions to move the error handling out of line. bonnie++ sequential file creates on a 512MB ramdisk speeds up from about 77,000/s to about 82,000/s, about a 6% improvement. 
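Since the on-disk rec_len field is a __le16, a 64KiB directory block whose single entry spans the whole block cannot store 65536 directly; the value EXT3_MAX_REC_LEN (65535) is written as a marker instead, and decoding it back costs a comparison on every directory entry. On architectures whose page cache cannot hold a 64KiB block that case can never occur, which is what the #if guards in the hunk below exploit. A quick userspace round-trip check of the encoding rule, as a sketch with the le16 conversion omitted (not the kernel helpers themselves):

#include <assert.h>
#include <stdint.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* 65535: same value as EXT3_MAX_REC_LEN */

static unsigned int rec_len_from_disk(uint16_t dlen)
{
	return dlen == MAX_REC_LEN ? 1U << 16 : dlen;
}

static uint16_t rec_len_to_disk(unsigned int len)
{
	return len == (1U << 16) ? MAX_REC_LEN : (uint16_t)len;
}

int main(void)
{
	/* 64KiB block covered by a single entry survives the round trip */
	assert(rec_len_from_disk(rec_len_to_disk(1U << 16)) == 1U << 16);
	/* the common 4KiB case is a plain pass-through */
	assert(rec_len_from_disk(rec_len_to_disk(4096)) == 4096);
	return 0;
}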
Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/dir.c | 15 ++++++++------- include/linux/ext3_fs.h | 9 +++++++++ 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index e2e72c367cf6..34f0a072b935 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, const char * error_msg = NULL; const int rlen = ext3_rec_len_from_disk(de->rec_len); - if (rlen < EXT3_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; - if (error_msg != NULL) + if (unlikely(error_msg != NULL)) ext3_error (dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, (unsigned long) le32_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; } diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index a443965946bb..65990ef612f5 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -724,21 +724,30 @@ struct ext3_dir_entry_2 { ~EXT3_DIR_ROUND) #define EXT3_MAX_REC_LEN ((1<<16)-1) +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ static inline unsigned ext3_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); +#if (PAGE_CACHE_SIZE >= 65536) if (len == EXT3_MAX_REC_LEN) return 1 << 16; +#endif return len; } static inline __le16 ext3_rec_len_to_disk(unsigned len) { +#if (PAGE_CACHE_SIZE >= 65536) if (len == (1 << 16)) return cpu_to_le16(EXT3_MAX_REC_LEN); else if (len > (1 << 16)) BUG(); +#endif return cpu_to_le16(len); } -- cgit v1.2.3-71-gd317 From d96336b05d718b03ff03c94c0dc0cc283a29d534 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Mon, 27 Dec 2010 13:46:38 -0800 Subject: ext2: Resolve 'dereferencing pointer to incomplete type' when enabling EXT2_XATTR_DEBUG When I enable EXT2_XATTR_DEBUG in fs/ext2/xattr.c I get a build error stating the following: CC fs/ext2/xattr.o fs/ext2/xattr.c: In function 'ext2_xattr_cache_insert': fs/ext2/xattr.c:841: error: dereferencing pointer to incomplete type fs/ext2/xattr.c:846: error: dereferencing pointer to incomplete type make[2]: *** [fs/ext2/xattr.o] Error 1 make[1]: *** [fs/ext2] Error 2 make: *** [fs] Error 2 These lines reference ext2_xattr_cache->c_entry_count which is defined in struct mb_cache. struct mb_cache is currently only defined in fs/mbcache.c. Moving struct mb_cache definition to include/linux/mbcache.h to resolve the issue. 
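The underlying C rule is that a forward declaration ("struct mb_cache;") lets code pass pointers around but not dereference them, which is exactly what the XATTR_DEBUG code does with c_entry_count. A contrived sketch of the fix, using hypothetical names rather than the mbcache structures:

/* shared header (after the move): the full definition is visible to every user */
struct cache {
	int entry_count;		/* the field the debug code wants to read */
};

/* debug helper in another file that includes the header */
static int cache_entries(const struct cache *c)
{
	/*
	 * With only "struct cache;" in scope this line fails to build with
	 * "dereferencing pointer to incomplete type"; with the definition
	 * above it compiles.
	 */
	return c->entry_count;
}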
Signed-off-by: Josh Hunt Signed-off-by: Jan Kara --- fs/mbcache.c | 12 ------------ include/linux/mbcache.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/mbcache.c b/fs/mbcache.c index 93444747237b..a25444ab2baf 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first); EXPORT_SYMBOL(mb_cache_entry_find_next); #endif -struct mb_cache { - struct list_head c_cache_list; - const char *c_name; - atomic_t c_entry_count; - int c_max_entries; - int c_bucket_bits; - struct kmem_cache *c_entry_cache; - struct list_head *c_block_hash; - struct list_head *c_index_hash; -}; - - /* * Global data: list of all mbcache's, lru list, and a spinlock for * accessing cache data structures on SMP machines. The lru list is diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 54cbbac1e71d..5525d370701d 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -18,6 +18,17 @@ struct mb_cache_entry { } e_index; }; +struct mb_cache { + struct list_head c_cache_list; + const char *c_name; + atomic_t c_entry_count; + int c_max_entries; + int c_bucket_bits; + struct kmem_cache *c_entry_cache; + struct list_head *c_block_hash; + struct list_head *c_index_hash; +}; + /* Functions on caches */ struct mb_cache *mb_cache_create(const char *, int); -- cgit v1.2.3-71-gd317 From 9d084a3d5dffd076a9a006164ea0dbd9c495f2b0 Mon Sep 17 00:00:00 2001 From: Fabien Marteau Date: Mon, 10 Jan 2011 11:01:13 -0800 Subject: Input: add Austria Microsystem AS5011 joystick driver This is driver for EasyPoint AS5011 2 axis joystick chip. This chip is plugged on an I2C bus. Tested on ARM processor (i.MX27). Signed-off-by: Fabien Marteau Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/Kconfig | 10 ++ drivers/input/joystick/Makefile | 1 + drivers/input/joystick/as5011.c | 367 ++++++++++++++++++++++++++++++++++++++++ include/linux/input/as5011.h | 20 +++ 4 files changed, 398 insertions(+) create mode 100644 drivers/input/joystick/as5011.c create mode 100644 include/linux/input/as5011.h (limited to 'include/linux') diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig index 5b596165b571..56eb471b5576 100644 --- a/drivers/input/joystick/Kconfig +++ b/drivers/input/joystick/Kconfig @@ -255,6 +255,16 @@ config JOYSTICK_AMIGA To compile this driver as a module, choose M here: the module will be called amijoy. +config JOYSTICK_AS5011 + tristate "Austria Microsystem AS5011 joystick" + depends on I2C + help + Say Y here if you have an AS5011 digital joystick connected to your + system. + + To compile this driver as a module, choose M here: the + module will be called as5011. 
+ config JOYSTICK_JOYDUMP tristate "Gameport data dumper" select GAMEPORT diff --git a/drivers/input/joystick/Makefile b/drivers/input/joystick/Makefile index f3a8cbe2abb6..92dc0de9dfed 100644 --- a/drivers/input/joystick/Makefile +++ b/drivers/input/joystick/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_JOYSTICK_A3D) += a3d.o obj-$(CONFIG_JOYSTICK_ADI) += adi.o obj-$(CONFIG_JOYSTICK_AMIGA) += amijoy.o +obj-$(CONFIG_JOYSTICK_AS5011) += as5011.o obj-$(CONFIG_JOYSTICK_ANALOG) += analog.o obj-$(CONFIG_JOYSTICK_COBRA) += cobra.o obj-$(CONFIG_JOYSTICK_DB9) += db9.o diff --git a/drivers/input/joystick/as5011.c b/drivers/input/joystick/as5011.c new file mode 100644 index 000000000000..f6732b57ca07 --- /dev/null +++ b/drivers/input/joystick/as5011.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2010, 2011 Fabien Marteau + * Sponsored by ARMadeus Systems + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Driver for Austria Microsystems joysticks AS5011 + * + * TODO: + * - Power on the chip when open() and power down when close() + * - Manage power mode + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_DESC "Driver for Austria Microsystems AS5011 joystick" +#define MODULE_DEVICE_ALIAS "as5011" + +MODULE_AUTHOR("Fabien Marteau "); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); + +/* registers */ +#define AS5011_CTRL1 0x76 +#define AS5011_CTRL2 0x75 +#define AS5011_XP 0x43 +#define AS5011_XN 0x44 +#define AS5011_YP 0x53 +#define AS5011_YN 0x54 +#define AS5011_X_REG 0x41 +#define AS5011_Y_REG 0x42 +#define AS5011_X_RES_INT 0x51 +#define AS5011_Y_RES_INT 0x52 + +/* CTRL1 bits */ +#define AS5011_CTRL1_LP_PULSED 0x80 +#define AS5011_CTRL1_LP_ACTIVE 0x40 +#define AS5011_CTRL1_LP_CONTINUE 0x20 +#define AS5011_CTRL1_INT_WUP_EN 0x10 +#define AS5011_CTRL1_INT_ACT_EN 0x08 +#define AS5011_CTRL1_EXT_CLK_EN 0x04 +#define AS5011_CTRL1_SOFT_RST 0x02 +#define AS5011_CTRL1_DATA_VALID 0x01 + +/* CTRL2 bits */ +#define AS5011_CTRL2_EXT_SAMPLE_EN 0x08 +#define AS5011_CTRL2_RC_BIAS_ON 0x04 +#define AS5011_CTRL2_INV_SPINNING 0x02 + +#define AS5011_MAX_AXIS 80 +#define AS5011_MIN_AXIS (-80) +#define AS5011_FUZZ 8 +#define AS5011_FLAT 40 + +struct as5011_device { + struct input_dev *input_dev; + struct i2c_client *i2c_client; + unsigned int button_gpio; + unsigned int button_irq; + unsigned int axis_irq; +}; + +static int as5011_i2c_write(struct i2c_client *client, + uint8_t aregaddr, + uint8_t avalue) +{ + uint8_t data[2] = { aregaddr, avalue }; + struct i2c_msg msg = { + client->addr, I2C_M_IGNORE_NAK, 2, (uint8_t *)data + }; + int error; + + error = i2c_transfer(client->adapter, &msg, 1); + return error < 0 ? 
error : 0; +} + +static int as5011_i2c_read(struct i2c_client *client, + uint8_t aregaddr, signed char *value) +{ + uint8_t data[2] = { aregaddr }; + struct i2c_msg msg_set[2] = { + { client->addr, I2C_M_REV_DIR_ADDR, 1, (uint8_t *)data }, + { client->addr, I2C_M_RD | I2C_M_NOSTART, 1, (uint8_t *)data } + }; + int error; + + error = i2c_transfer(client->adapter, msg_set, 2); + if (error < 0) + return error; + + *value = data[0] & 0x80 ? -1 * (1 + ~data[0]) : data[0]; + return 0; +} + +static irqreturn_t as5011_button_interrupt(int irq, void *dev_id) +{ + struct as5011_device *as5011 = dev_id; + int val = gpio_get_value_cansleep(as5011->button_gpio); + + input_report_key(as5011->input_dev, BTN_JOYSTICK, !val); + input_sync(as5011->input_dev); + + return IRQ_HANDLED; +} + +static irqreturn_t as5011_axis_interrupt(int irq, void *dev_id) +{ + struct as5011_device *as5011 = dev_id; + int error; + signed char x, y; + + error = as5011_i2c_read(as5011->i2c_client, AS5011_X_RES_INT, &x); + if (error < 0) + goto out; + + error = as5011_i2c_read(as5011->i2c_client, AS5011_Y_RES_INT, &y); + if (error < 0) + goto out; + + input_report_abs(as5011->input_dev, ABS_X, x); + input_report_abs(as5011->input_dev, ABS_Y, y); + input_sync(as5011->input_dev); + +out: + return IRQ_HANDLED; +} + +static int __devinit as5011_configure_chip(struct as5011_device *as5011, + const struct as5011_platform_data *plat_dat) +{ + struct i2c_client *client = as5011->i2c_client; + int error; + signed char value; + + /* chip soft reset */ + error = as5011_i2c_write(client, AS5011_CTRL1, + AS5011_CTRL1_SOFT_RST); + if (error < 0) { + dev_err(&client->dev, "Soft reset failed\n"); + return error; + } + + mdelay(10); + + error = as5011_i2c_write(client, AS5011_CTRL1, + AS5011_CTRL1_LP_PULSED | + AS5011_CTRL1_LP_ACTIVE | + AS5011_CTRL1_INT_ACT_EN); + if (error < 0) { + dev_err(&client->dev, "Power config failed\n"); + return error; + } + + error = as5011_i2c_write(client, AS5011_CTRL2, + AS5011_CTRL2_INV_SPINNING); + if (error < 0) { + dev_err(&client->dev, "Can't invert spinning\n"); + return error; + } + + /* write threshold */ + error = as5011_i2c_write(client, AS5011_XP, plat_dat->xp); + if (error < 0) { + dev_err(&client->dev, "Can't write threshold\n"); + return error; + } + + error = as5011_i2c_write(client, AS5011_XN, plat_dat->xn); + if (error < 0) { + dev_err(&client->dev, "Can't write threshold\n"); + return error; + } + + error = as5011_i2c_write(client, AS5011_YP, plat_dat->yp); + if (error < 0) { + dev_err(&client->dev, "Can't write threshold\n"); + return error; + } + + error = as5011_i2c_write(client, AS5011_YN, plat_dat->yn); + if (error < 0) { + dev_err(&client->dev, "Can't write threshold\n"); + return error; + } + + /* to free irq gpio in chip */ + error = as5011_i2c_read(client, AS5011_X_RES_INT, &value); + if (error < 0) { + dev_err(&client->dev, "Can't read i2c X resolution value\n"); + return error; + } + + return 0; +} + +static int __devinit as5011_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + const struct as5011_platform_data *plat_data; + struct as5011_device *as5011; + struct input_dev *input_dev; + int irq; + int error; + + plat_data = client->dev.platform_data; + if (!plat_data) + return -EINVAL; + + if (!plat_data->axis_irq) { + dev_err(&client->dev, "No axis IRQ?\n"); + return -EINVAL; + } + + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_PROTOCOL_MANGLING)) { + dev_err(&client->dev, + "need i2c bus that supports protocol mangling\n"); + return -ENODEV; + } + + 
as5011 = kmalloc(sizeof(struct as5011_device), GFP_KERNEL); + input_dev = input_allocate_device(); + if (!as5011 || !input_dev) { + dev_err(&client->dev, + "Can't allocate memory for device structure\n"); + error = -ENOMEM; + goto err_free_mem; + } + + as5011->i2c_client = client; + as5011->input_dev = input_dev; + as5011->button_gpio = plat_data->button_gpio; + as5011->axis_irq = plat_data->axis_irq; + + input_dev->name = "Austria Microsystem as5011 joystick"; + input_dev->id.bustype = BUS_I2C; + input_dev->dev.parent = &client->dev; + + __set_bit(EV_KEY, input_dev->evbit); + __set_bit(EV_ABS, input_dev->evbit); + __set_bit(BTN_JOYSTICK, input_dev->keybit); + + input_set_abs_params(input_dev, ABS_X, + AS5011_MIN_AXIS, AS5011_MAX_AXIS, AS5011_FUZZ, AS5011_FLAT); + input_set_abs_params(as5011->input_dev, ABS_Y, + AS5011_MIN_AXIS, AS5011_MAX_AXIS, AS5011_FUZZ, AS5011_FLAT); + + error = gpio_request(as5011->button_gpio, "AS5011 button"); + if (error < 0) { + dev_err(&client->dev, "Failed to request button gpio\n"); + goto err_free_mem; + } + + irq = gpio_to_irq(as5011->button_gpio); + if (irq < 0) { + dev_err(&client->dev, + "Failed to get irq number for button gpio\n"); + goto err_free_button_gpio; + } + + as5011->button_irq = irq; + + error = request_threaded_irq(as5011->button_irq, + NULL, as5011_button_interrupt, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + "as5011_button", as5011); + if (error < 0) { + dev_err(&client->dev, + "Can't allocate button irq %d\n", as5011->button_irq); + goto err_free_button_gpio; + } + + error = as5011_configure_chip(as5011, plat_data); + if (error) + goto err_free_button_irq; + + error = request_threaded_irq(as5011->axis_irq, NULL, + as5011_axis_interrupt, + plat_data->axis_irqflags, + "as5011_joystick", as5011); + if (error) { + dev_err(&client->dev, + "Can't allocate axis irq %d\n", plat_data->axis_irq); + goto err_free_button_irq; + } + + error = input_register_device(as5011->input_dev); + if (error) { + dev_err(&client->dev, "Failed to register input device\n"); + goto err_free_axis_irq; + } + + i2c_set_clientdata(client, as5011); + + return 0; + +err_free_axis_irq: + free_irq(as5011->axis_irq, as5011); +err_free_button_irq: + free_irq(as5011->button_irq, as5011); +err_free_button_gpio: + gpio_free(as5011->button_gpio); +err_free_mem: + input_free_device(input_dev); + kfree(as5011); + + return error; +} + +static int __devexit as5011_remove(struct i2c_client *client) +{ + struct as5011_device *as5011 = i2c_get_clientdata(client); + + free_irq(as5011->axis_irq, as5011); + free_irq(as5011->button_irq, as5011); + gpio_free(as5011->button_gpio); + + input_unregister_device(as5011->input_dev); + kfree(as5011); + + return 0; +} + +static const struct i2c_device_id as5011_id[] = { + { MODULE_DEVICE_ALIAS, 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, as5011_id); + +static struct i2c_driver as5011_driver = { + .driver = { + .name = "as5011", + }, + .probe = as5011_probe, + .remove = __devexit_p(as5011_remove), + .id_table = as5011_id, +}; + +static int __init as5011_init(void) +{ + return i2c_add_driver(&as5011_driver); +} +module_init(as5011_init); + +static void __exit as5011_exit(void) +{ + i2c_del_driver(&as5011_driver); +} +module_exit(as5011_exit); diff --git a/include/linux/input/as5011.h b/include/linux/input/as5011.h new file mode 100644 index 000000000000..1affd0ddfa9d --- /dev/null +++ b/include/linux/input/as5011.h @@ -0,0 +1,20 @@ +#ifndef _AS5011_H +#define _AS5011_H + +/* + * Copyright (c) 2010, 2011 Fabien Marteau + * + * This program is free 
software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +struct as5011_platform_data { + unsigned int button_gpio; + unsigned int axis_irq; /* irq number */ + unsigned long axis_irqflags; + char xp, xn; /* threshold for x axis */ + char yp, yn; /* threshold for y axis */ +}; + +#endif /* _AS5011_H */ -- cgit v1.2.3-71-gd317 From 6650239a4b01077e80d5a4468562756d77afaa59 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 8 Jan 2011 17:45:38 -0500 Subject: NFS: Don't use vm_map_ram() in readdir vm_map_ram() is not available on NOMMU platforms, and causes trouble on incoherrent architectures such as ARM when we access the page data through both the direct and the virtual mapping. The alternative is to use the direct mapping to access page data for the case when we are not crossing a page boundary, but to copy the data into a linear scratch buffer when we are accessing data that spans page boundaries. Signed-off-by: Trond Myklebust Tested-by: Marc Kleine-Budde Cc: stable@kernel.org [2.6.37] --- fs/nfs/dir.c | 44 ++++++------- fs/nfs/nfs2xdr.c | 6 -- fs/nfs/nfs3xdr.c | 6 -- fs/nfs/nfs4xdr.c | 6 -- include/linux/sunrpc/xdr.h | 4 +- net/sunrpc/xdr.c | 155 ++++++++++++++++++++++++++++++++++++--------- 6 files changed, 148 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 996dd8989a91..0108cf4f3403 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include "delegation.h" @@ -459,25 +458,26 @@ out: /* Perform conversion from xdr to cache array */ static int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - void *xdr_page, struct page *page, unsigned int buflen) + struct page **xdr_pages, struct page *page, unsigned int buflen) { struct xdr_stream stream; - struct xdr_buf buf; - __be32 *ptr = xdr_page; + struct xdr_buf buf = { + .pages = xdr_pages, + .page_len = buflen, + .buflen = buflen, + .len = buflen, + }; + struct page *scratch; struct nfs_cache_array *array; unsigned int count = 0; int status; - buf.head->iov_base = xdr_page; - buf.head->iov_len = buflen; - buf.tail->iov_len = 0; - buf.page_base = 0; - buf.page_len = 0; - buf.buflen = buf.head->iov_len; - buf.len = buf.head->iov_len; - - xdr_init_decode(&stream, &buf, ptr); + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); do { status = xdr_decode(desc, entry, &stream); @@ -506,6 +506,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en } else status = PTR_ERR(array); } + + put_page(scratch); return status; } @@ -521,7 +523,6 @@ static void nfs_readdir_free_large_page(void *ptr, struct page **pages, unsigned int npages) { - vm_unmap_ram(ptr, npages); nfs_readdir_free_pagearray(pages, npages); } @@ -530,9 +531,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages, * to nfs_readdir_free_large_page */ static -void *nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_large_page(struct page **pages, unsigned int npages) { - void *ptr; unsigned int i; for (i = 0; i < npages; i++) { @@ -541,13 +541,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages) goto out_freepages; pages[i] = page; } + return 0; - ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL); 
- if (!IS_ERR_OR_NULL(ptr)) - return ptr; out_freepages: nfs_readdir_free_pagearray(pages, i); - return NULL; + return -ENOMEM; } static @@ -577,8 +575,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - pages_ptr = nfs_readdir_large_page(pages, array_size); - if (!pages_ptr) + status = nfs_readdir_large_page(pages, array_size); + if (status < 0) goto out_release_array; do { unsigned int pglen; @@ -587,7 +585,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (status < 0) break; pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); if (status < 0) { if (status == -ENOSPC) status = 0; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 5914a1911c95..b382a1b5e7e4 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -487,12 +487,6 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se entry->d_type = DT_UNKNOWN; - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - return p; out_overflow: diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index f6cc60f06dac..ba91236c6ee7 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -647,12 +647,6 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); } - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - return p; out_overflow: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9f1826b012e6..0662a9821df5 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6215,12 +6215,6 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - return p; out_overflow: diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 498ab93a81e4..7783c687c777 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -201,6 +201,8 @@ struct xdr_stream { __be32 *end; /* end of available buffer space */ struct kvec *iov; /* pointer to the current kvec */ + struct kvec scratch; /* Scratch buffer */ + struct page **page_ptr; /* pointer to the current page */ }; extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); @@ -208,7 +210,7 @@ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); -extern __be32 *xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes); +extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen); extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index cd9e841e7492..679cd674b81d 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -552,6 +552,74 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b } EXPORT_SYMBOL_GPL(xdr_write_pages); +static void xdr_set_iov(struct 
xdr_stream *xdr, struct kvec *iov, + __be32 *p, unsigned int len) +{ + if (len > iov->iov_len) + len = iov->iov_len; + if (p == NULL) + p = (__be32*)iov->iov_base; + xdr->p = p; + xdr->end = (__be32*)(iov->iov_base + len); + xdr->iov = iov; + xdr->page_ptr = NULL; +} + +static int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + unsigned int pgnr; + unsigned int maxlen; + unsigned int pgoff; + unsigned int pgend; + void *kaddr; + + maxlen = xdr->buf->page_len; + if (base >= maxlen) + return -EINVAL; + maxlen -= base; + if (len > maxlen) + len = maxlen; + + base += xdr->buf->page_base; + + pgnr = base >> PAGE_SHIFT; + xdr->page_ptr = &xdr->buf->pages[pgnr]; + kaddr = page_address(*xdr->page_ptr); + + pgoff = base & ~PAGE_MASK; + xdr->p = (__be32*)(kaddr + pgoff); + + pgend = pgoff + len; + if (pgend > PAGE_SIZE) + pgend = PAGE_SIZE; + xdr->end = (__be32*)(kaddr + pgend); + xdr->iov = NULL; + return 0; +} + +static void xdr_set_next_page(struct xdr_stream *xdr) +{ + unsigned int newbase; + + newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; + newbase -= xdr->buf->page_base; + + if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0) + xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len); +} + +static bool xdr_set_next_buffer(struct xdr_stream *xdr) +{ + if (xdr->page_ptr != NULL) + xdr_set_next_page(xdr); + else if (xdr->iov == xdr->buf->head) { + if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) + xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len); + } + return xdr->p != xdr->end; +} + /** * xdr_init_decode - Initialize an xdr_stream for decoding data. * @xdr: pointer to xdr_stream struct @@ -560,41 +628,67 @@ EXPORT_SYMBOL_GPL(xdr_write_pages); */ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) { - struct kvec *iov = buf->head; - unsigned int len = iov->iov_len; - - if (len > buf->len) - len = buf->len; xdr->buf = buf; - xdr->iov = iov; - xdr->p = p; - xdr->end = (__be32 *)((char *)iov->iov_base + len); + xdr->scratch.iov_base = NULL; + xdr->scratch.iov_len = 0; + if (buf->head[0].iov_len != 0) + xdr_set_iov(xdr, buf->head, p, buf->len); + else if (buf->page_len != 0) + xdr_set_page_base(xdr, 0, buf->len); } EXPORT_SYMBOL_GPL(xdr_init_decode); -/** - * xdr_inline_peek - Allow read-ahead in the XDR data stream - * @xdr: pointer to xdr_stream struct - * @nbytes: number of bytes of data to decode - * - * Check if the input buffer is long enough to enable us to decode - * 'nbytes' more bytes of data starting at the current position. - * If so return the current pointer without updating the current - * pointer position. - */ -__be32 * xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes) +static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { __be32 *p = xdr->p; __be32 *q = p + XDR_QUADLEN(nbytes); if (unlikely(q > xdr->end || q < p)) return NULL; + xdr->p = q; return p; } -EXPORT_SYMBOL_GPL(xdr_inline_peek); /** - * xdr_inline_decode - Retrieve non-page XDR data to decode + * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to an empty buffer + * @buflen: size of 'buf' + * + * The scratch buffer is used when decoding from an array of pages. + * If an xdr_inline_decode() call spans across page boundaries, then + * we copy the data into the scratch buffer in order to allow linear + * access. 
+ */ +void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) +{ + xdr->scratch.iov_base = buf; + xdr->scratch.iov_len = buflen; +} +EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); + +static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p; + void *cpdest = xdr->scratch.iov_base; + size_t cplen = (char *)xdr->end - (char *)xdr->p; + + if (nbytes > xdr->scratch.iov_len) + return NULL; + memcpy(cpdest, xdr->p, cplen); + cpdest += cplen; + nbytes -= cplen; + if (!xdr_set_next_buffer(xdr)) + return NULL; + p = __xdr_inline_decode(xdr, nbytes); + if (p == NULL) + return NULL; + memcpy(cpdest, p, nbytes); + return xdr->scratch.iov_base; +} + +/** + * xdr_inline_decode - Retrieve XDR data to decode * @xdr: pointer to xdr_stream struct * @nbytes: number of bytes of data to decode * @@ -605,13 +699,16 @@ EXPORT_SYMBOL_GPL(xdr_inline_peek); */ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { - __be32 *p = xdr->p; - __be32 *q = p + XDR_QUADLEN(nbytes); + __be32 *p; - if (unlikely(q > xdr->end || q < p)) + if (nbytes == 0) + return xdr->p; + if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) return NULL; - xdr->p = q; - return p; + p = __xdr_inline_decode(xdr, nbytes); + if (p != NULL) + return p; + return xdr_copy_to_scratch(xdr, nbytes); } EXPORT_SYMBOL_GPL(xdr_inline_decode); @@ -671,16 +768,12 @@ EXPORT_SYMBOL_GPL(xdr_read_pages); */ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len) { - char * kaddr = page_address(xdr->buf->pages[0]); xdr_read_pages(xdr, len); /* * Position current pointer at beginning of tail, and * set remaining message length. */ - if (len > PAGE_CACHE_SIZE - xdr->buf->page_base) - len = PAGE_CACHE_SIZE - xdr->buf->page_base; - xdr->p = (__be32 *)(kaddr + xdr->buf->page_base); - xdr->end = (__be32 *)((char *)xdr->p + len); + xdr_set_page_base(xdr, 0, len); } EXPORT_SYMBOL_GPL(xdr_enter_page); -- cgit v1.2.3-71-gd317 From 92ed1a76ca31774eb27de14b2215841367c68056 Mon Sep 17 00:00:00 2001 From: Peter Korsgaard Date: Mon, 10 Jan 2011 22:11:23 +0100 Subject: i2c: Add generic I2C multiplexer using GPIO API Add an i2c mux driver providing access to i2c bus segments using a hardware MUX sitting on a master bus and controlled through gpio pins. E.G. something like: ---------- ---------- Bus segment 1 - - - - - | | SCL/SDA | |-------------- | | | |------------| | | | | | Bus segment 2 | | | Linux | GPIO 1..N | MUX |--------------- Devices | |------------| | | | | | | | Bus segment M | | | |---------------| | ---------- ---------- - - - - - SCL/SDA of the master I2C bus is multiplexed to bus segment 1..M according to the settings of the GPIO pins 1..N. 
Signed-off-by: Peter Korsgaard Signed-off-by: Jean Delvare --- Documentation/i2c/muxes/gpio-i2cmux | 65 +++++++++++++ MAINTAINERS | 8 ++ drivers/i2c/muxes/Kconfig | 12 +++ drivers/i2c/muxes/Makefile | 1 + drivers/i2c/muxes/gpio-i2cmux.c | 184 ++++++++++++++++++++++++++++++++++++ include/linux/gpio-i2cmux.h | 38 ++++++++ 6 files changed, 308 insertions(+) create mode 100644 Documentation/i2c/muxes/gpio-i2cmux create mode 100644 drivers/i2c/muxes/gpio-i2cmux.c create mode 100644 include/linux/gpio-i2cmux.h (limited to 'include/linux') diff --git a/Documentation/i2c/muxes/gpio-i2cmux b/Documentation/i2c/muxes/gpio-i2cmux new file mode 100644 index 000000000000..811cd78d4cdc --- /dev/null +++ b/Documentation/i2c/muxes/gpio-i2cmux @@ -0,0 +1,65 @@ +Kernel driver gpio-i2cmux + +Author: Peter Korsgaard + +Description +----------- + +gpio-i2cmux is an i2c mux driver providing access to I2C bus segments +from a master I2C bus and a hardware MUX controlled through GPIO pins. + +E.G.: + + ---------- ---------- Bus segment 1 - - - - - + | | SCL/SDA | |-------------- | | + | |------------| | + | | | | Bus segment 2 | | + | Linux | GPIO 1..N | MUX |--------------- Devices + | |------------| | | | + | | | | Bus segment M + | | | |---------------| | + ---------- ---------- - - - - - + +SCL/SDA of the master I2C bus is multiplexed to bus segment 1..M +according to the settings of the GPIO pins 1..N. + +Usage +----- + +gpio-i2cmux uses the platform bus, so you need to provide a struct +platform_device with the platform_data pointing to a struct +gpio_i2cmux_platform_data with the I2C adapter number of the master +bus, the number of bus segments to create and the GPIO pins used +to control it. See include/linux/gpio-i2cmux.h for details. + +E.G. something like this for a MUX providing 4 bus segments +controlled through 3 GPIO pins: + +#include +#include + +static const unsigned myboard_gpiomux_gpios[] = { + AT91_PIN_PC26, AT91_PIN_PC25, AT91_PIN_PC24 +}; + +static const unsigned myboard_gpiomux_values[] = { + 0, 1, 2, 3 +}; + +static struct gpio_i2cmux_platform_data myboard_i2cmux_data = { + .parent = 1, + .base_nr = 2, /* optional */ + .values = myboard_gpiomux_values, + .n_values = ARRAY_SIZE(myboard_gpiomux_values), + .gpios = myboard_gpiomux_gpios, + .n_gpios = ARRAY_SIZE(myboard_gpiomux_gpios), + .idle = 4, /* optional */ +}; + +static struct platform_device myboard_i2cmux = { + .name = "gpio-i2cmux", + .id = 0, + .dev = { + .platform_data = &myboard_i2cmux_data, + }, +}; diff --git a/MAINTAINERS b/MAINTAINERS index 23d04363a195..23a4765ab8d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2608,6 +2608,14 @@ S: Supported F: drivers/i2c/busses/i2c-gpio.c F: include/linux/i2c-gpio.h +GENERIC GPIO I2C MULTIPLEXER DRIVER +M: Peter Korsgaard +L: linux-i2c@vger.kernel.org +S: Supported +F: drivers/i2c/muxes/gpio-i2cmux.c +F: include/linux/gpio-i2cmux.h +F: Documentation/i2c/muxes/gpio-i2cmux + GENERIC HDLC (WAN) DRIVERS M: Krzysztof Halasa W: http://www.kernel.org/pub/linux/utils/net/hdlc/ diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index 4d91d80bfd23..90b7a0163899 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -5,6 +5,18 @@ menu "Multiplexer I2C Chip support" depends on I2C_MUX +config I2C_MUX_GPIO + tristate "GPIO-based I2C multiplexer" + depends on GENERIC_GPIO + help + If you say yes to this option, support will be included for a + GPIO based I2C multiplexer. 
This driver provides access to + I2C busses connected through a MUX, which is controlled + through GPIO pins. + + This driver can also be built as a module. If so, the module + will be called gpio-i2cmux. + config I2C_MUX_PCA9541 tristate "NXP PCA9541 I2C Master Selector" depends on EXPERIMENTAL diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile index d743806d9b42..4640436ea61f 100644 --- a/drivers/i2c/muxes/Makefile +++ b/drivers/i2c/muxes/Makefile @@ -1,6 +1,7 @@ # # Makefile for multiplexer I2C chip drivers. +obj-$(CONFIG_I2C_MUX_GPIO) += gpio-i2cmux.o obj-$(CONFIG_I2C_MUX_PCA9541) += pca9541.o obj-$(CONFIG_I2C_MUX_PCA954x) += pca954x.o diff --git a/drivers/i2c/muxes/gpio-i2cmux.c b/drivers/i2c/muxes/gpio-i2cmux.c new file mode 100644 index 000000000000..7b6ce624cd6e --- /dev/null +++ b/drivers/i2c/muxes/gpio-i2cmux.c @@ -0,0 +1,184 @@ +/* + * I2C multiplexer using GPIO API + * + * Peter Korsgaard + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct gpiomux { + struct i2c_adapter *parent; + struct i2c_adapter **adap; /* child busses */ + struct gpio_i2cmux_platform_data data; +}; + +static void gpiomux_set(const struct gpiomux *mux, unsigned val) +{ + int i; + + for (i = 0; i < mux->data.n_gpios; i++) + gpio_set_value(mux->data.gpios[i], val & (1 << i)); +} + +static int gpiomux_select(struct i2c_adapter *adap, void *data, u32 chan) +{ + struct gpiomux *mux = data; + + gpiomux_set(mux, mux->data.values[chan]); + + return 0; +} + +static int gpiomux_deselect(struct i2c_adapter *adap, void *data, u32 chan) +{ + struct gpiomux *mux = data; + + gpiomux_set(mux, mux->data.idle); + + return 0; +} + +static int __devinit gpiomux_probe(struct platform_device *pdev) +{ + struct gpiomux *mux; + struct gpio_i2cmux_platform_data *pdata; + struct i2c_adapter *parent; + int (*deselect) (struct i2c_adapter *, void *, u32); + unsigned initial_state; + int i, ret; + + pdata = pdev->dev.platform_data; + if (!pdata) { + dev_err(&pdev->dev, "Missing platform data\n"); + return -ENODEV; + } + + parent = i2c_get_adapter(pdata->parent); + if (!parent) { + dev_err(&pdev->dev, "Parent adapter (%d) not found\n", + pdata->parent); + return -ENODEV; + } + + mux = kzalloc(sizeof(*mux), GFP_KERNEL); + if (!mux) { + ret = -ENOMEM; + goto alloc_failed; + } + + mux->parent = parent; + mux->data = *pdata; + mux->adap = kzalloc(sizeof(struct i2c_adapter *) * pdata->n_values, + GFP_KERNEL); + if (!mux->adap) { + ret = -ENOMEM; + goto alloc_failed2; + } + + if (pdata->idle != GPIO_I2CMUX_NO_IDLE) { + initial_state = pdata->idle; + deselect = gpiomux_deselect; + } else { + initial_state = pdata->values[0]; + deselect = NULL; + } + + for (i = 0; i < pdata->n_gpios; i++) { + ret = gpio_request(pdata->gpios[i], "gpio-i2cmux"); + if (ret) + goto err_request_gpio; + gpio_direction_output(pdata->gpios[i], + initial_state & (1 << i)); + } + + for (i = 0; i < pdata->n_values; i++) { + u32 nr = pdata->base_nr ? 
(pdata->base_nr + i) : 0; + + mux->adap[i] = i2c_add_mux_adapter(parent, mux, nr, i, + gpiomux_select, deselect); + if (!mux->adap[i]) { + ret = -ENODEV; + dev_err(&pdev->dev, "Failed to add adapter %d\n", i); + goto add_adapter_failed; + } + } + + dev_info(&pdev->dev, "%d port mux on %s adapter\n", + pdata->n_values, parent->name); + + platform_set_drvdata(pdev, mux); + + return 0; + +add_adapter_failed: + for (; i > 0; i--) + i2c_del_mux_adapter(mux->adap[i - 1]); + i = pdata->n_gpios; +err_request_gpio: + for (; i > 0; i--) + gpio_free(pdata->gpios[i - 1]); + kfree(mux->adap); +alloc_failed2: + kfree(mux); +alloc_failed: + i2c_put_adapter(parent); + + return ret; +} + +static int __devexit gpiomux_remove(struct platform_device *pdev) +{ + struct gpiomux *mux = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < mux->data.n_values; i++) + i2c_del_mux_adapter(mux->adap[i]); + + for (i = 0; i < mux->data.n_gpios; i++) + gpio_free(mux->data.gpios[i]); + + platform_set_drvdata(pdev, NULL); + i2c_put_adapter(mux->parent); + kfree(mux->adap); + kfree(mux); + + return 0; +} + +static struct platform_driver gpiomux_driver = { + .probe = gpiomux_probe, + .remove = __devexit_p(gpiomux_remove), + .driver = { + .owner = THIS_MODULE, + .name = "gpio-i2cmux", + }, +}; + +static int __init gpiomux_init(void) +{ + return platform_driver_register(&gpiomux_driver); +} + +static void __exit gpiomux_exit(void) +{ + platform_driver_unregister(&gpiomux_driver); +} + +module_init(gpiomux_init); +module_exit(gpiomux_exit); + +MODULE_DESCRIPTION("GPIO-based I2C multiplexer driver"); +MODULE_AUTHOR("Peter Korsgaard "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:gpio-i2cmux"); diff --git a/include/linux/gpio-i2cmux.h b/include/linux/gpio-i2cmux.h new file mode 100644 index 000000000000..4a333bb0bd0d --- /dev/null +++ b/include/linux/gpio-i2cmux.h @@ -0,0 +1,38 @@ +/* + * gpio-i2cmux interface to platform code + * + * Peter Korsgaard + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _LINUX_GPIO_I2CMUX_H +#define _LINUX_GPIO_I2CMUX_H + +/* MUX has no specific idle mode */ +#define GPIO_I2CMUX_NO_IDLE ((unsigned)-1) + +/** + * struct gpio_i2cmux_platform_data - Platform-dependent data for gpio-i2cmux + * @parent: Parent I2C bus adapter number + * @base_nr: Base I2C bus number to number adapters from or zero for dynamic + * @values: Array of bitmasks of GPIO settings (low/high) for each + * position + * @n_values: Number of multiplexer positions (busses to instantiate) + * @gpios: Array of GPIO numbers used to control MUX + * @n_gpios: Number of GPIOs used to control MUX + * @idle: Bitmask to write to MUX when idle or GPIO_I2CMUX_NO_IDLE if not used + */ +struct gpio_i2cmux_platform_data { + int parent; + int base_nr; + const unsigned *values; + int n_values; + const unsigned *gpios; + int n_gpios; + unsigned idle; +}; + +#endif /* _LINUX_GPIO_I2CMUX_H */ -- cgit v1.2.3-71-gd317 From 0cc43a1806f078f7fd414850d8f1f1761696e4af Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 10 Jan 2011 22:11:23 +0100 Subject: i2c: Constify i2c_client where possible Helper functions for I2C and SMBus transactions don't modify the i2c_client that is passed to them, so it can be marked const. 
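As a minimal illustration of what the constification enables (editorial sketch, not part of the patch; the helper name and register below are invented), a purely read-only driver helper can now take a const client pointer and still use the SMBus accessors:

#include <linux/i2c.h>

#define MYCHIP_REG_ID	0x00	/* hypothetical ID register */

/* Reads a register without ever modifying the client, so const is fine. */
static int mychip_read_id(const struct i2c_client *client)
{
	s32 id = i2c_smbus_read_byte_data(client, MYCHIP_REG_ID);

	return id < 0 ? id : (id & 0xff);
}

The rtc-ds1307 changes below apply the same idea to that driver's block-data function pointers.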
Signed-off-by: Jean Delvare --- drivers/i2c/i2c-core.c | 29 ++++++++++++++++------------- drivers/rtc/rtc-ds1307.c | 12 ++++++------ include/linux/i2c.h | 27 +++++++++++++++------------ 3 files changed, 37 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 6b4cc567645b..c7db6980e3a3 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1362,7 +1362,7 @@ EXPORT_SYMBOL(i2c_transfer); * * Returns negative errno, or else the number of bytes written. */ -int i2c_master_send(struct i2c_client *client, const char *buf, int count) +int i2c_master_send(const struct i2c_client *client, const char *buf, int count) { int ret; struct i2c_adapter *adap = client->adapter; @@ -1389,7 +1389,7 @@ EXPORT_SYMBOL(i2c_master_send); * * Returns negative errno, or else the number of bytes read. */ -int i2c_master_recv(struct i2c_client *client, char *buf, int count) +int i2c_master_recv(const struct i2c_client *client, char *buf, int count) { struct i2c_adapter *adap = client->adapter; struct i2c_msg msg; @@ -1679,7 +1679,7 @@ static int i2c_smbus_check_pec(u8 cpec, struct i2c_msg *msg) * This executes the SMBus "receive byte" protocol, returning negative errno * else the byte received from the device. */ -s32 i2c_smbus_read_byte(struct i2c_client *client) +s32 i2c_smbus_read_byte(const struct i2c_client *client) { union i2c_smbus_data data; int status; @@ -1699,7 +1699,7 @@ EXPORT_SYMBOL(i2c_smbus_read_byte); * This executes the SMBus "send byte" protocol, returning negative errno * else zero on success. */ -s32 i2c_smbus_write_byte(struct i2c_client *client, u8 value) +s32 i2c_smbus_write_byte(const struct i2c_client *client, u8 value) { return i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_WRITE, value, I2C_SMBUS_BYTE, NULL); @@ -1714,7 +1714,7 @@ EXPORT_SYMBOL(i2c_smbus_write_byte); * This executes the SMBus "read byte" protocol, returning negative errno * else a data byte received from the device. */ -s32 i2c_smbus_read_byte_data(struct i2c_client *client, u8 command) +s32 i2c_smbus_read_byte_data(const struct i2c_client *client, u8 command) { union i2c_smbus_data data; int status; @@ -1735,7 +1735,8 @@ EXPORT_SYMBOL(i2c_smbus_read_byte_data); * This executes the SMBus "write byte" protocol, returning negative errno * else zero on success. */ -s32 i2c_smbus_write_byte_data(struct i2c_client *client, u8 command, u8 value) +s32 i2c_smbus_write_byte_data(const struct i2c_client *client, u8 command, + u8 value) { union i2c_smbus_data data; data.byte = value; @@ -1753,7 +1754,7 @@ EXPORT_SYMBOL(i2c_smbus_write_byte_data); * This executes the SMBus "read word" protocol, returning negative errno * else a 16-bit unsigned "word" received from the device. */ -s32 i2c_smbus_read_word_data(struct i2c_client *client, u8 command) +s32 i2c_smbus_read_word_data(const struct i2c_client *client, u8 command) { union i2c_smbus_data data; int status; @@ -1774,7 +1775,8 @@ EXPORT_SYMBOL(i2c_smbus_read_word_data); * This executes the SMBus "write word" protocol, returning negative errno * else zero on success. 
*/ -s32 i2c_smbus_write_word_data(struct i2c_client *client, u8 command, u16 value) +s32 i2c_smbus_write_word_data(const struct i2c_client *client, u8 command, + u16 value) { union i2c_smbus_data data; data.word = value; @@ -1793,7 +1795,8 @@ EXPORT_SYMBOL(i2c_smbus_write_word_data); * This executes the SMBus "process call" protocol, returning negative errno * else a 16-bit unsigned "word" received from the device. */ -s32 i2c_smbus_process_call(struct i2c_client *client, u8 command, u16 value) +s32 i2c_smbus_process_call(const struct i2c_client *client, u8 command, + u16 value) { union i2c_smbus_data data; int status; @@ -1821,7 +1824,7 @@ EXPORT_SYMBOL(i2c_smbus_process_call); * support this; its emulation through I2C messaging relies on a specific * mechanism (I2C_M_RECV_LEN) which may not be implemented. */ -s32 i2c_smbus_read_block_data(struct i2c_client *client, u8 command, +s32 i2c_smbus_read_block_data(const struct i2c_client *client, u8 command, u8 *values) { union i2c_smbus_data data; @@ -1848,7 +1851,7 @@ EXPORT_SYMBOL(i2c_smbus_read_block_data); * This executes the SMBus "block write" protocol, returning negative errno * else zero on success. */ -s32 i2c_smbus_write_block_data(struct i2c_client *client, u8 command, +s32 i2c_smbus_write_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values) { union i2c_smbus_data data; @@ -1864,7 +1867,7 @@ s32 i2c_smbus_write_block_data(struct i2c_client *client, u8 command, EXPORT_SYMBOL(i2c_smbus_write_block_data); /* Returns the number of read bytes */ -s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, u8 command, +s32 i2c_smbus_read_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, u8 *values) { union i2c_smbus_data data; @@ -1884,7 +1887,7 @@ s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, u8 command, } EXPORT_SYMBOL(i2c_smbus_read_i2c_block_data); -s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client, u8 command, +s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values) { union i2c_smbus_data data; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index d827ce570a8c..0d559b6416dd 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -106,9 +106,9 @@ struct ds1307 { struct i2c_client *client; struct rtc_device *rtc; struct work_struct work; - s32 (*read_block_data)(struct i2c_client *client, u8 command, + s32 (*read_block_data)(const struct i2c_client *client, u8 command, u8 length, u8 *values); - s32 (*write_block_data)(struct i2c_client *client, u8 command, + s32 (*write_block_data)(const struct i2c_client *client, u8 command, u8 length, const u8 *values); }; @@ -158,8 +158,8 @@ MODULE_DEVICE_TABLE(i2c, ds1307_id); #define BLOCK_DATA_MAX_TRIES 10 -static s32 ds1307_read_block_data_once(struct i2c_client *client, u8 command, - u8 length, u8 *values) +static s32 ds1307_read_block_data_once(const struct i2c_client *client, + u8 command, u8 length, u8 *values) { s32 i, data; @@ -172,7 +172,7 @@ static s32 ds1307_read_block_data_once(struct i2c_client *client, u8 command, return i; } -static s32 ds1307_read_block_data(struct i2c_client *client, u8 command, +static s32 ds1307_read_block_data(const struct i2c_client *client, u8 command, u8 length, u8 *values) { u8 oldvalues[I2C_SMBUS_BLOCK_MAX]; @@ -198,7 +198,7 @@ static s32 ds1307_read_block_data(struct i2c_client *client, u8 command, return length; } -static s32 ds1307_write_block_data(struct i2c_client *client, u8 command, 
+static s32 ds1307_write_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values) { u8 currvalues[I2C_SMBUS_BLOCK_MAX]; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 56cfe23ffb39..903576df88dc 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -57,9 +57,10 @@ struct i2c_board_info; * transmit an arbitrary number of messages without interruption. * @count must be be less than 64k since msg.len is u16. */ -extern int i2c_master_send(struct i2c_client *client, const char *buf, +extern int i2c_master_send(const struct i2c_client *client, const char *buf, + int count); +extern int i2c_master_recv(const struct i2c_client *client, char *buf, int count); -extern int i2c_master_recv(struct i2c_client *client, char *buf, int count); /* Transfer num messages. */ @@ -78,23 +79,25 @@ extern s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, /* Now follow the 'nice' access routines. These also document the calling conventions of i2c_smbus_xfer. */ -extern s32 i2c_smbus_read_byte(struct i2c_client *client); -extern s32 i2c_smbus_write_byte(struct i2c_client *client, u8 value); -extern s32 i2c_smbus_read_byte_data(struct i2c_client *client, u8 command); -extern s32 i2c_smbus_write_byte_data(struct i2c_client *client, +extern s32 i2c_smbus_read_byte(const struct i2c_client *client); +extern s32 i2c_smbus_write_byte(const struct i2c_client *client, u8 value); +extern s32 i2c_smbus_read_byte_data(const struct i2c_client *client, + u8 command); +extern s32 i2c_smbus_write_byte_data(const struct i2c_client *client, u8 command, u8 value); -extern s32 i2c_smbus_read_word_data(struct i2c_client *client, u8 command); -extern s32 i2c_smbus_write_word_data(struct i2c_client *client, +extern s32 i2c_smbus_read_word_data(const struct i2c_client *client, + u8 command); +extern s32 i2c_smbus_write_word_data(const struct i2c_client *client, u8 command, u16 value); /* Returns the number of read bytes */ -extern s32 i2c_smbus_read_block_data(struct i2c_client *client, +extern s32 i2c_smbus_read_block_data(const struct i2c_client *client, u8 command, u8 *values); -extern s32 i2c_smbus_write_block_data(struct i2c_client *client, +extern s32 i2c_smbus_write_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values); /* Returns the number of read bytes */ -extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, +extern s32 i2c_smbus_read_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, u8 *values); -extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client, +extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values); #endif /* I2C */ -- cgit v1.2.3-71-gd317 From e159489baa717dbae70f9903770a6a4990865887 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 9 Jan 2011 23:32:15 +0100 Subject: workqueue: relax lockdep annotation on flush_work() Currently, the lockdep annotation in flush_work() requires exclusive access on the workqueue the target work is queued on and triggers warning if a work is trying to flush another work on the same workqueue; however, this is no longer true as workqueues can now execute multiple works concurrently. 
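To make the flushing scenario concrete, here is an editorial sketch (not from the patch; the names are invented and the INIT_WORK()/alloc_workqueue() setup is omitted) of one work item flushing another on the same workqueue:

#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;	/* e.g. alloc_workqueue("my_wq", 0, 4) */
static struct work_struct work_a, work_b;

static void work_b_fn(struct work_struct *work)
{
	/* does some independent processing */
}

static void work_a_fn(struct work_struct *work)
{
	/* runs on my_wq and waits for another item queued on the same wq */
	flush_work(&work_b);
}

With max_active > 1 and no rescuer this is safe, yet the old annotation reported it as a self-deadlock; on a max_active == 1 or WQ_MEM_RECLAIM workqueue the warning remains justified, which is what the updated check in the patch preserves.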
This patch adds lock_map_acquire_read(), makes process_one_work() hold read access to the workqueue while executing a work, and makes start_flush_work() check for write access if the concurrency level is one or the workqueue has a rescuer (as only one execution resource - the rescuer - is guaranteed to be available under memory pressure), and for read access if it is higher. This better represents what's going on and removes spurious lockdep warnings triggered by the false dependency chain created through flush_work(). * Peter pointed out that flushing another work from a WQ_MEM_RECLAIM wq breaks the forward progress guarantee under memory pressure. The condition check has been updated accordingly. Signed-off-by: Tejun Heo Reported-by: "Rafael J. Wysocki" Tested-by: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: stable@kernel.org --- include/linux/lockdep.h | 3 +++ kernel/workqueue.c | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 71c09b26c759..9f19430c7d07 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -522,12 +522,15 @@ static inline void print_irqtrace_events(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_PROVE_LOCKING # define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_) +# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_) # else # define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_) +# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_) # endif # define lock_map_release(l) lock_release(l, 1, _THIS_IP_) #else # define lock_map_acquire(l) do { } while (0) +# define lock_map_acquire_read(l) do { } while (0) # define lock_map_release(l) do { } while (0) #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ee6ec82f88a..930c2390b77e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1840,7 +1840,7 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); f(work); @@ -2384,8 +2384,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); - lock_map_acquire(&cwq->wq->lockdep_map); + /* + * If @max_active is 1 or rescuer is in use, flushing another work + * item on the same workqueue may lead to deadlock. Make sure the + * flusher is not running on the same workqueue by verifying write + * access. + */ + if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&cwq->wq->lockdep_map); + else + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); + return true; already_gone: spin_unlock_irq(&gcwq->lock); -- cgit v1.2.3-71-gd317 From 925268a06dc2b1ff7bfcc37419a6827a0e739639 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 11 Jan 2011 16:44:01 +0900 Subject: memory hotplug: one more lock on memory hotplug Currently, lock_memory_hotplug()/unlock_memory_hotplug() are taken around adding, removing and offlining pages to avoid races with hibernation, but the lock should be held in online_pages() too; leaving it out there is asymmetric. There are cases where code has to avoid racing both against the memory hotplug notifier and its own local logic, and against other hotplug operations (hotplug vs. hotplug). This adds a generic solution for avoiding such races.
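As an editorial sketch of the usage pattern this lock supports (not part of the patch; the function name is invented), any code that must not race with online/offline/add/remove operations or with the hotplug notifier chain can bracket its critical section the same way online_pages() now does:

#include <linux/memory_hotplug.h>

static void mydrv_scan_sections(void)
{
	lock_memory_hotplug();
	/* walk or update per-section state; no online/offline can run here */
	unlock_memory_hotplug();
}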
Viewed another way, taking the lock here has little cost: onlining pages tends to be done by udev scripts and the like, against each memory section one by one, so it is better to take the lock here as well. Cc: # 2.6.37 Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Pekka Enberg --- include/linux/memory_hotplug.h | 6 ++++++ mm/memory_hotplug.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a00c48..12b9eb5a36c3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -161,6 +161,12 @@ extern void register_page_bootmem_info_node(struct pglist_data *pgdat); extern void put_page_bootmem(struct page *page); #endif +/* + * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug + * notifier will be called under this. 2) offline/online/add/remove memory + * will not run simultaneously. + */ + void lock_memory_hotplug(void); void unlock_memory_hotplug(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af5473..83163c096a75 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -407,6 +407,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) int ret; struct memory_notify arg; + lock_memory_hotplug(); arg.start_pfn = pfn; arg.nr_pages = nr_pages; arg.status_change_nid = -1; @@ -419,6 +420,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } /* @@ -443,6 +445,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } @@ -467,6 +470,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (onlined_pages) memory_notify(MEM_ONLINE, &arg); + unlock_memory_hotplug(); return 0; } -- cgit v1.2.3-71-gd317 From 357f54d6b38252737116a6d631f6ac28ded018ed Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 14 Dec 2010 10:11:57 -0500 Subject: NFS fix the setting of exchange id flag Indicate support for referrals. Do not set any PNFS roles. Check the flags returned by the server for validity. Do not use exchange flags from an old client ID instance when recovering a client ID. Update the EXCHID4_FLAG_XXX set to RFC 5661. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 25 +++++++++++++++++++++---- include/linux/nfs4.h | 8 ++++++-- 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f2b92f6a7efb..9d992b0346e3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4518,6 +4518,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, } #ifdef CONFIG_NFS_V4_1 +/* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set.
+ */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ + if (flags & ~EXCHGID4_FLAG_MASK_R) + goto out_inval; + if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && + (flags & EXCHGID4_FLAG_USE_NON_PNFS)) + goto out_inval; + if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) + goto out_inval; + return NFS_OK; +out_inval: + return -NFS4ERR_INVAL; +} + /* * nfs4_proc_exchange_id() * @@ -4531,7 +4550,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) nfs4_verifier verifier; struct nfs41_exchange_id_args args = { .client = clp, - .flags = clp->cl_exchange_flags, + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, }; struct nfs41_exchange_id_res res = { .client = clp, @@ -4548,9 +4567,6 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) dprintk("--> %s\n", __func__); BUG_ON(clp == NULL); - /* Remove server-only flags */ - args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R; - p = (u32 *)verifier.data; *p++ = htonl((u32)clp->cl_boot_time.tv_sec); *p = htonl((u32)clp->cl_boot_time.tv_nsec); @@ -4576,6 +4592,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) break; } + status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); dprintk("<-- %s status= %d\n", __func__, status); return status; } diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 4925b22219d2..9b46300b4305 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -111,9 +111,13 @@ #define EXCHGID4_FLAG_SUPP_MOVED_REFER 0x00000001 #define EXCHGID4_FLAG_SUPP_MOVED_MIGR 0x00000002 +#define EXCHGID4_FLAG_BIND_PRINC_STATEID 0x00000100 + #define EXCHGID4_FLAG_USE_NON_PNFS 0x00010000 #define EXCHGID4_FLAG_USE_PNFS_MDS 0x00020000 #define EXCHGID4_FLAG_USE_PNFS_DS 0x00040000 +#define EXCHGID4_FLAG_MASK_PNFS 0x00070000 + #define EXCHGID4_FLAG_UPD_CONFIRMED_REC_A 0x40000000 #define EXCHGID4_FLAG_CONFIRMED_R 0x80000000 /* @@ -121,8 +125,8 @@ * they're set in the argument or response, have separate * invalid flag masks for arg (_A) and resp (_R). */ -#define EXCHGID4_FLAG_MASK_A 0x40070003 -#define EXCHGID4_FLAG_MASK_R 0x80070003 +#define EXCHGID4_FLAG_MASK_A 0x40070103 +#define EXCHGID4_FLAG_MASK_R 0x80070103 #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 -- cgit v1.2.3-71-gd317 From 1d1bc8f2074f0b728dfca2a3c16f2f5a3f298ffc Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 4 Oct 2010 23:12:59 -0400 Subject: nfsd4: support BIND_CONN_TO_SESSION Basic xdr and processing for BIND_CONN_TO_SESSION. This adds a connection to the list of connections associated with a session. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 9 ++++++-- fs/nfsd/nfs4state.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++------ fs/nfsd/nfs4xdr.c | 34 ++++++++++++++++++++++++++++-- fs/nfsd/state.h | 5 +++++ fs/nfsd/xdr4.h | 2 ++ include/linux/nfs4.h | 3 +++ 6 files changed, 102 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index fd6694b49e1c..db52546143d1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1004,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum); * Also note, enforced elsewhere: * - SEQUENCE other than as first op results in * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) - * - BIND_CONN_TO_SESSION must be the only op in its compound - * (Will be enforced in nfsd4_bind_conn_to_session().) + * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) 
* - DESTROY_SESSION must be the final operation in a compound, if * sessionid's in SEQUENCE and DESTROY_SESSION are the same. * (Enforced in nfsd4_destroy_session().) @@ -1326,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_EXCHANGE_ID", }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_BIND_CONN_TO_SESSION", + }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3cf9900d5f32..956174f488a7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -679,15 +679,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn) return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } -static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) +static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) { struct nfsd4_conn *conn; - u32 flags = NFS4_CDFC4_FORE; int ret; - if (ses->se_flags & SESSION4_BACK_CHAN) - flags |= NFS4_CDFC4_BACK; - conn = alloc_conn(rqstp, flags); + conn = alloc_conn(rqstp, dir); if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); @@ -698,6 +695,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) return nfs_ok; } +static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (ses->se_flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + + return nfsd4_new_conn(rqstp, ses, dir); +} + +/* must be called under client_lock */ static void nfsd4_del_conns(struct nfsd4_session *s) { struct nfs4_client *clp = s->se_client; @@ -776,7 +784,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n spin_unlock(&clp->cl_lock); spin_unlock(&client_lock); - status = nfsd4_new_conn(rqstp, new); + status = nfsd4_new_conn_from_crses(rqstp, new); /* whoops: benny points out, status is ignored! 
(err, or bogus) */ if (status) { free_session(&new->se_ref); @@ -1597,6 +1605,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + }; + return nfserr_inval; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 status; + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&client_lock); + cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); + /* Sorta weird: we only need the refcnt'ing because new_conn acquires + * client_lock iself: */ + if (cstate->session) { + nfsd4_get_session(cstate->session); + atomic_inc(&cstate->session->se_client->cl_refcount); + } + spin_unlock(&client_lock); + if (!cstate->session) + return nfserr_badsession; + + status = nfsd4_map_bcts_dir(&bcts->dir); + nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return nfs_ok; +} + static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) { if (!session) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ca3786905dec..4ff2c9e0b276 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -421,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_TAIL; } +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + u32 dummy; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(bcts->dir); + /* XXX: Perhaps Tom Tucker could help us figure out how we + * should be using ctsa_use_conn_in_rdma_mode: */ + READ32(dummy); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { @@ -1359,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { /* new operations for NFSv4.1 */ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, @@ -2383,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return nfserr; } +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); + WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(bcts->dir); + /* XXX: ? 
*/ + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { @@ -3174,7 +3204,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* NFSv4.1 operations */ [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index cf6dc83fd545..442f6d8e024c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -148,6 +148,11 @@ struct nfsd4_create_session { u32 gid; }; +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + /* The single slot clientid cache structure */ struct nfsd4_clid_slot { u32 sl_seqid; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 799c30c3b495..3a7aa4d98c1f 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -427,6 +427,7 @@ struct nfsd4_op { /* NFSv4.1 */ struct nfsd4_exchange_id exchange_id; + struct nfsd4_bind_conn_to_session bind_conn_to_session; struct nfsd4_create_session create_session; struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; @@ -523,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); extern __be32 nfsd4_create_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_create_session *); diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 26afa3021ed6..a5918938e41e 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -65,6 +65,9 @@ #define NFS4_CDFC4_FORE 0x1 #define NFS4_CDFC4_BACK 0x2 +#define NFS4_CDFC4_BOTH 0x3 +#define NFS4_CDFC4_FORE_OR_BOTH 0x3 +#define NFS4_CDFC4_BACK_OR_BOTH 0x7 #define NFS4_SET_TO_SERVER_TIME 0 #define NFS4_SET_TO_CLIENT_TIME 1 -- cgit v1.2.3-71-gd317 From d75faea330dbd1873c9094e9926ae306590c0998 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Nov 2010 19:15:01 -0500 Subject: rpc: move sk_bc_xprt to svc_xprt This seems obviously transport-level information even if it's currently used only by the server socket code. Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + include/linux/sunrpc/svcsock.h | 1 - net/sunrpc/svcsock.c | 10 ++++++---- net/sunrpc/xprtsock.c | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index c8f81da15c7e..7ad9751a0d87 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -79,6 +79,7 @@ struct svc_xprt { struct list_head xpt_users; /* callbacks on free */ struct net *xpt_net; + struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ }; static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 1b353a76c304..04dba23c59f2 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -28,7 +28,6 @@ struct svc_sock { /* private TCP part */ u32 sk_reclen; /* length of record */ u32 sk_tcplen; /* current read length */ - struct rpc_xprt *sk_bc_xprt; /* NFSv4.1 backchannel xprt */ }; /* diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 52bd1130e83a..3bb400f86eae 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -987,15 +987,17 @@ static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp, vec[0] = rqstp->rq_arg.head[0]; } else { /* REPLY */ - if (svsk->sk_bc_xprt) - req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid); + struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt; + + if (bc_xprt) + req = xprt_lookup_rqst(bc_xprt, xid); if (!req) { printk(KERN_NOTICE "%s: Got unrecognized reply: " - "calldir 0x%x sk_bc_xprt %p xid %08x\n", + "calldir 0x%x xpt_bc_xprt %p xid %08x\n", __func__, ntohl(calldir), - svsk->sk_bc_xprt, xid); + bc_xprt, xid); vec[0] = rqstp->rq_arg.head[0]; goto out; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index dfcab5ac65af..18dc42eb5597 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2379,9 +2379,9 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) * The backchannel uses the same socket connection as the * forechannel */ + args->bc_xprt->xpt_bc_xprt = xprt; xprt->bc_xprt = args->bc_xprt; bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt); - bc_sock->sk_bc_xprt = xprt; transport->sock = bc_sock->sk_sock; transport->inet = bc_sock->sk_sk; -- cgit v1.2.3-71-gd317 From f0418aa4b1103f959d64dc18273efa04ee0140e9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 8 Dec 2010 13:48:19 -0500 Subject: rpc: allow xprt_class->setup to return a preexisting xprt This allows us to reuse the xprt associated with a server connection if one has already been set up. Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 3 +++ net/sunrpc/xprtsock.c | 18 +++++++++--------- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 89d10d279a20..bef0f535f746 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -321,6 +321,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_CLOSING (6) #define XPRT_CONNECTION_ABORT (7) #define XPRT_CONNECTION_CLOSE (8) +#define XPRT_INITIALIZED (9) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 749ad15ae305..856274d7e85c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1102,6 +1102,9 @@ found: -PTR_ERR(xprt)); return xprt; } + if (test_and_set_bit(XPRT_INITIALIZED, &xprt->state)) + /* ->setup returned a pre-initialized xprt: */ + return xprt; spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0ef4dd4414ec..ee091c869fcd 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2359,6 +2359,15 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) struct svc_sock *bc_sock; struct rpc_xprt *ret; + if (args->bc_xprt->xpt_bc_xprt) { + /* + * This server connection already has a backchannel + * export; we can't create a new one, as we wouldn't be + * able to match replies based on xid any more. So, + * reuse the already-existing one: + */ + return args->bc_xprt->xpt_bc_xprt; + } xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); if (IS_ERR(xprt)) return xprt; @@ -2396,15 +2405,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); - /* - * The backchannel uses the same socket connection as the - * forechannel - */ - if (args->bc_xprt->xpt_bc_xprt) { - /* XXX: actually, want to catch this case... */ - ret = ERR_PTR(-EINVAL); - goto out_err; - } /* * Once we've associated a backchannel xprt with a connection, * we want to keep it around as long as long as the connection -- cgit v1.2.3-71-gd317 From 0d2689c0f054f6a8bf3115d6386bd9c2d65dc44b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 7 Jan 2011 23:44:32 +0100 Subject: HID: add feature_mapping callback Currently hid doesn't export the features it knows to the specific modules. Some information can be really important in such features: MosArt and Cypress devices are by default not in a multitouch mode. We have to send the value 2 on the right feature. This patch exports to the module the features report so they can find the right feature to set up the correct mode. 
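An editorial sketch of how a driver might use the new callback (not part of the patch; the driver names and the local usage define are invented, and actually sending the feature report is left to the driver's probe/resume path): remember the Digitizers/Input Mode feature field so that the value 2 ("multitouch") can later be written into it.

#include <linux/hid.h>

#define MY_DG_INPUTMODE	0x000d0052	/* Digitizers page, Input Mode usage */

struct my_mt_data {
	struct hid_field *inputmode;	/* feature field to set to 2 later */
};

static void my_mt_feature_mapping(struct hid_device *hdev,
		struct hid_input *hi, struct hid_field *field,
		struct hid_usage *usage)
{
	struct my_mt_data *td = hid_get_drvdata(hdev);

	if (usage->hid == MY_DG_INPUTMODE)
		td->inputmode = field;
}

/* wired up in the driver as:  .feature_mapping = my_mt_feature_mapping, */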
Signed-off-by: Benjamin Tissoires Acked-by: Henrik Rydberg Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 17 +++++++++++++---- include/linux/hid.h | 4 ++++ 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index d8d372bae3cc..8703b2cd27ca 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -290,6 +290,14 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel goto ignore; } + if (field->report_type == HID_FEATURE_REPORT) { + if (device->driver->feature_mapping) { + device->driver->feature_mapping(device, hidinput, field, + usage); + } + goto ignore; + } + if (device->driver->input_mapping) { int ret = device->driver->input_mapping(device, hidinput, field, usage, &bit, &max); @@ -839,7 +847,6 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) struct hid_input *hidinput = NULL; struct input_dev *input_dev; int i, j, k; - int max_report_type = HID_OUTPUT_REPORT; INIT_LIST_HEAD(&hid->inputs); @@ -856,10 +863,11 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) return -1; } - if (hid->quirks & HID_QUIRK_SKIP_OUTPUT_REPORTS) - max_report_type = HID_INPUT_REPORT; + for (k = HID_INPUT_REPORT; k <= HID_FEATURE_REPORT; k++) { + if (k == HID_OUTPUT_REPORT && + hid->quirks & HID_QUIRK_SKIP_OUTPUT_REPORTS) + continue; - for (k = HID_INPUT_REPORT; k <= max_report_type; k++) list_for_each_entry(report, &hid->report_enum[k].report_list, list) { if (!report->maxfield) @@ -912,6 +920,7 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) hidinput = NULL; } } + } if (hidinput && input_register_device(hidinput->input)) goto out_cleanup; diff --git a/include/linux/hid.h b/include/linux/hid.h index bb0f56f5c01e..96bd7920a01a 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -593,6 +593,7 @@ struct hid_usage_id { * @report_fixup: called before report descriptor parsing (NULL means nop) * @input_mapping: invoked on input registering before mapping an usage * @input_mapped: invoked on input registering after mapping an usage + * @feature_mapping: invoked on feature registering * @suspend: invoked on suspend (NULL means nop) * @resume: invoked on resume if device was not reset (NULL means nop) * @reset_resume: invoked on resume if device was reset (NULL means nop) @@ -636,6 +637,9 @@ struct hid_driver { int (*input_mapped)(struct hid_device *hdev, struct hid_input *hidinput, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max); + void (*feature_mapping)(struct hid_device *hdev, + struct hid_input *hidinput, struct hid_field *field, + struct hid_usage *usage); #ifdef CONFIG_PM int (*suspend)(struct hid_device *hdev, pm_message_t message); int (*resume)(struct hid_device *hdev); -- cgit v1.2.3-71-gd317 From 281054ac8dfc083442c571be44f1c5b9821812ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 7 Jan 2011 23:45:11 +0100 Subject: HID: set HID_MAX_FIELD at 128 Stantums multitouch panels sends more than 64 reports and this results in not being able to handle all the touches given by this device. This patch is required to be able to include Stantum panels in the unified hid-multitouch driver. 
Signed-off-by: Benjamin Tissoires Acked-by: Henrik Rydberg Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 96bd7920a01a..1ebc6e39f20d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -402,7 +402,7 @@ struct hid_field { __u16 dpad; /* dpad input code */ }; -#define HID_MAX_FIELDS 64 +#define HID_MAX_FIELDS 128 struct hid_report { struct list_head list; -- cgit v1.2.3-71-gd317 From 4cb18728709683c91a5f6f8d5f337bfb498b089a Mon Sep 17 00:00:00 2001 From: "R.Durgadoss" Date: Wed, 27 Oct 2010 03:33:29 +0530 Subject: thermal: Add event notification to thermal framework This patch adds event notification support to the generic thermal sysfs framework in the kernel. The notification is in the form of a netlink event. Signed-off-by: R.Durgadoss Signed-off-by: Len Brown --- Documentation/ABI/stable/thermal-notification | 4 + Documentation/thermal/sysfs-api.txt | 12 +++ drivers/thermal/Kconfig | 1 + drivers/thermal/thermal_sys.c | 103 +++++++++++++++++++++++++- include/linux/thermal.h | 32 ++++++++ 5 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/stable/thermal-notification (limited to 'include/linux') diff --git a/Documentation/ABI/stable/thermal-notification b/Documentation/ABI/stable/thermal-notification new file mode 100644 index 000000000000..9723e8b7aeb3 --- /dev/null +++ b/Documentation/ABI/stable/thermal-notification @@ -0,0 +1,4 @@ +What: A notification mechanism for thermal related events +Description: + This interface enables notification for thermal related events. + The notification is in the form of a netlink event. diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index cb3d15bc1aeb..b61e46f449aa 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -278,3 +278,15 @@ method, the sys I/F structure will be built like this: |---name: acpitz |---temp1_input: 37000 |---temp1_crit: 100000 + +4. Event Notification + +The framework includes a simple notification mechanism, in the form of a +netlink event. Netlink socket initialization is done during the _init_ +of the framework. Drivers which intend to use the notification mechanism +just need to call generate_netlink_event() with two arguments viz +(originator, event). Typically the originator will be an integer assigned +to a thermal_zone_device when it registers itself with the framework. The +event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL, +THERMAL_DEV_FAULT}. Notification can be sent when the current temperature +crosses any of the configured thresholds. diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index bf7c687519ef..f7a5dba3ca23 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -4,6 +4,7 @@ menuconfig THERMAL tristate "Generic Thermal sysfs driver" + depends on NET help Generic Thermal Sysfs driver offers a generic mechanism for thermal management. 
Usually it's made up of one or more thermal diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 13c72c629329..760e045c93c8 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include MODULE_AUTHOR("Zhang Rui"); MODULE_DESCRIPTION("Generic thermal management sysfs support"); @@ -58,6 +60,22 @@ static LIST_HEAD(thermal_tz_list); static LIST_HEAD(thermal_cdev_list); static DEFINE_MUTEX(thermal_list_lock); +static unsigned int thermal_event_seqnum; + +static struct genl_family thermal_event_genl_family = { + .id = GENL_ID_GENERATE, + .name = THERMAL_GENL_FAMILY_NAME, + .version = THERMAL_GENL_VERSION, + .maxattr = THERMAL_GENL_ATTR_MAX, +}; + +static struct genl_multicast_group thermal_event_mcgrp = { + .name = THERMAL_GENL_MCAST_GROUP_NAME, +}; + +static int genetlink_init(void); +static void genetlink_exit(void); + static int get_idr(struct idr *idr, struct mutex *lock, int *id) { int err; @@ -1214,6 +1232,82 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) EXPORT_SYMBOL(thermal_zone_device_unregister); +int generate_netlink_event(u32 orig, enum events event) +{ + struct sk_buff *skb; + struct nlattr *attr; + struct thermal_genl_event *thermal_event; + void *msg_header; + int size; + int result; + + /* allocate memory */ + size = nla_total_size(sizeof(struct thermal_genl_event)) + \ + nla_total_size(0); + + skb = genlmsg_new(size, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + /* add the genetlink message header */ + msg_header = genlmsg_put(skb, 0, thermal_event_seqnum++, + &thermal_event_genl_family, 0, + THERMAL_GENL_CMD_EVENT); + if (!msg_header) { + nlmsg_free(skb); + return -ENOMEM; + } + + /* fill the data */ + attr = nla_reserve(skb, THERMAL_GENL_ATTR_EVENT, \ + sizeof(struct thermal_genl_event)); + + if (!attr) { + nlmsg_free(skb); + return -EINVAL; + } + + thermal_event = nla_data(attr); + if (!thermal_event) { + nlmsg_free(skb); + return -EINVAL; + } + + memset(thermal_event, 0, sizeof(struct thermal_genl_event)); + + thermal_event->orig = orig; + thermal_event->event = event; + + /* send multicast genetlink message */ + result = genlmsg_end(skb, msg_header); + if (result < 0) { + nlmsg_free(skb); + return result; + } + + result = genlmsg_multicast(skb, 0, thermal_event_mcgrp.id, GFP_ATOMIC); + if (result) + printk(KERN_INFO "failed to send netlink event:%d", result); + + return result; +} +EXPORT_SYMBOL(generate_netlink_event); + +static int genetlink_init(void) +{ + int result; + + result = genl_register_family(&thermal_event_genl_family); + if (result) + return result; + + result = genl_register_mc_group(&thermal_event_genl_family, + &thermal_event_mcgrp); + if (result) + genl_unregister_family(&thermal_event_genl_family); + return result; +} + static int __init thermal_init(void) { int result = 0; @@ -1225,9 +1319,15 @@ static int __init thermal_init(void) mutex_destroy(&thermal_idr_lock); mutex_destroy(&thermal_list_lock); } + result = genetlink_init(); return result; } +static void genetlink_exit(void) +{ + genl_unregister_family(&thermal_event_genl_family); +} + static void __exit thermal_exit(void) { class_unregister(&thermal_class); @@ -1235,7 +1335,8 @@ static void __exit thermal_exit(void) idr_destroy(&thermal_cdev_idr); mutex_destroy(&thermal_idr_lock); mutex_destroy(&thermal_list_lock); + genetlink_exit(); } -subsys_initcall(thermal_init); +fs_initcall(thermal_init); module_exit(thermal_exit); diff --git a/include/linux/thermal.h 
b/include/linux/thermal.h index 1de8b9eb841b..784644961279 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -127,6 +127,37 @@ struct thermal_zone_device { struct thermal_hwmon_attr temp_crit; /* hwmon sys attr */ #endif }; +/* Adding event notification support elements */ +#define THERMAL_GENL_FAMILY_NAME "thermal_event" +#define THERMAL_GENL_VERSION 0x01 +#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_group" + +enum events { + THERMAL_AUX0, + THERMAL_AUX1, + THERMAL_CRITICAL, + THERMAL_DEV_FAULT, +}; + +struct thermal_genl_event { + u32 orig; + enum events event; +}; +/* attributes of thermal_genl_family */ +enum { + THERMAL_GENL_ATTR_UNSPEC, + THERMAL_GENL_ATTR_EVENT, + __THERMAL_GENL_ATTR_MAX, +}; +#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) + +/* commands supported by the thermal_genl_family */ +enum { + THERMAL_GENL_CMD_UNSPEC, + THERMAL_GENL_CMD_EVENT, + __THERMAL_GENL_CMD_MAX, +}; +#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1) struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, struct @@ -146,5 +177,6 @@ struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, thermal_cooling_device_ops *); void thermal_cooling_device_unregister(struct thermal_cooling_device *); +extern int generate_netlink_event(u32 orig, enum events event); #endif /* __THERMAL_H__ */ -- cgit v1.2.3-71-gd317 From 50a88cb7eddb971077ae7dff76b116747c12c371 Mon Sep 17 00:00:00 2001 From: Jekyll Lai Date: Tue, 11 Jan 2011 21:53:55 -0800 Subject: Input: add SW_ROTATE_LOCK switch type This switch is used to signal that user want to disable screen transitions from portrait to landscape mode and back. Signed-off-by: Jekyll Lai Signed-off-by: Alan Cox Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 6c407e553a40..4bdacada8d2a 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -775,6 +775,7 @@ struct input_keymap_entry { #define SW_CAMERA_LENS_COVER 0x09 /* set = lens covered */ #define SW_KEYPAD_SLIDE 0x0a /* set = keypad slide out */ #define SW_FRONT_PROXIMITY 0x0b /* set = front proximity sensor active */ +#define SW_ROTATE_LOCK 0x0c /* set = rotate locked/disabled */ #define SW_MAX 0x0f #define SW_CNT (SW_MAX+1) -- cgit v1.2.3-71-gd317 From af585b921e5d1e919947c4b1164b59507fe7cd7b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:46 +0200 Subject: KVM: Halt vcpu if page it tries to access is swapped out If a guest accesses swapped out memory do not swap it in from vcpu thread context. Schedule work to do swapping and put vcpu into halted state instead. Interrupts will still be delivered to the guest and if interrupt will cause reschedule guest will continue to run another task. 
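For orientation before the full diff, an editorial, condensed restatement of the fault-path decision this patch introduces (simplified from the try_async_pf() added below; reference counting and the double-fault check are omitted):

static bool try_async_pf_sketch(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
				pfn_t *pfn)
{
	bool async;

	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);	/* never swaps in */
	if (!async)
		return false;		/* page resident, handle the fault now */

	if (can_do_async_pf(vcpu) && kvm_arch_setup_async_pf(vcpu, gva, gfn))
		return true;		/* work queued; vcpu halts meanwhile */

	*pfn = gfn_to_pfn(vcpu->kvm, gfn);	/* fall back: swap in synchronously */
	return false;
}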
[avi: remove call to get_user_pages_noio(), nacked by Linus; this makes everything synchrnous again] Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 18 ++++ arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/Makefile | 1 + arch/x86/kvm/mmu.c | 52 ++++++++++- arch/x86/kvm/paging_tmpl.h | 4 +- arch/x86/kvm/x86.c | 112 ++++++++++++++++++++++- include/linux/kvm_host.h | 31 +++++++ include/trace/events/kvm.h | 90 +++++++++++++++++++ virt/kvm/Kconfig | 3 + virt/kvm/async_pf.c | 190 ++++++++++++++++++++++++++++++++++++++++ virt/kvm/async_pf.h | 36 ++++++++ virt/kvm/kvm_main.c | 48 +++++++--- 12 files changed, 570 insertions(+), 16 deletions(-) create mode 100644 virt/kvm/async_pf.c create mode 100644 virt/kvm/async_pf.h (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f702f82aa1eb..b5f4c1a36d65 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -83,11 +83,14 @@ #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 +#define ASYNC_PF_PER_VCPU 64 + extern spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_vcpu; struct kvm; +struct kvm_async_pf; enum kvm_reg { VCPU_REGS_RAX = 0, @@ -412,6 +415,11 @@ struct kvm_vcpu_arch { u64 hv_vapic; cpumask_var_t wbinvd_dirty_mask; + + struct { + bool halted; + gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; + } apf; }; struct kvm_arch { @@ -585,6 +593,10 @@ struct kvm_x86_ops { const struct trace_print_flags *exit_reasons_str; }; +struct kvm_arch_async_pf { + gfn_t gfn; +}; + extern struct kvm_x86_ops *kvm_x86_ops; int kvm_mmu_module_init(void); @@ -799,4 +811,10 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); +extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ddc131ff438f..50f63648ce1b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE + select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO ---help--- diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31a7035c4bd9..c53bf19b1da0 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ assigned-dev.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) +kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o timer.o diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fbb04aee8301..4ab04de5a76a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,9 +18,11 @@ * */ +#include "irq.h" #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "x86.h" #include #include @@ -2587,6 +2589,50 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, error_code & PFERR_WRITE_MASK, gfn); } +int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) +{ + struct kvm_arch_async_pf arch; + arch.gfn = gfn; + + return kvm_setup_async_pf(vcpu, gva, gfn, &arch); +} 
+ +static bool can_do_async_pf(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + kvm_event_needs_reinjection(vcpu))) + return false; + + return kvm_x86_ops->interrupt_allowed(vcpu); +} + +static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, + pfn_t *pfn) +{ + bool async; + + *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async); + + if (!async) + return false; /* *pfn has correct page already */ + + put_page(pfn_to_page(*pfn)); + + if (can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(async, *pfn); + if (kvm_find_async_pf_gfn(vcpu, gfn)) { + trace_kvm_async_pf_doublefault(gva, gfn); + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return true; + } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) + return true; + } + + *pfn = gfn_to_pfn(vcpu->kvm, gfn); + + return false; +} + static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code) { @@ -2609,7 +2655,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); + + if (try_async_pf(vcpu, gfn, gpa, &pfn)) + return 0; + + /* mmio */ if (is_error_pfn(pfn)) return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cd7a833a3b52..c45376dd041a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -568,7 +568,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); + + if (try_async_pf(vcpu, walker.gfn, addr, &pfn)) + return 0; /* mmio */ if (is_error_pfn(pfn)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c05d47701292..3cd4d091c2f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; +static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) +{ + int i; + for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) + vcpu->arch.apf.gfns[i] = ~0; +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -5115,6 +5123,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } + if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { + /* Page is swapped out. 
Do synthetic halt */ + vcpu->arch.apf.halted = true; + r = 1; + goto out; + } } r = kvm_mmu_reload(vcpu); @@ -5243,7 +5257,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) r = 1; while (r > 0) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted) r = vcpu_enter_guest(vcpu); else { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); @@ -5256,6 +5271,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: + vcpu->arch.apf.halted = false; break; case KVM_MP_STATE_SIPI_RECEIVED: default: @@ -5277,6 +5293,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; } + + kvm_check_async_pf_completion(vcpu); + if (signal_pending(current)) { r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; @@ -5792,6 +5811,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + vcpu->arch.apf.halted = false; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -5880,6 +5903,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + kvm_async_pf_hash_reset(vcpu); + return 0; fail_free_mce_banks: kfree(vcpu->arch.mce_banks); @@ -5938,8 +5963,10 @@ static void kvm_free_vcpus(struct kvm *kvm) /* * Unpin any mmu pages first. */ - kvm_for_each_vcpu(i, vcpu, kvm) + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); + } kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_free(vcpu); @@ -6050,7 +6077,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted) + || !list_empty_careful(&vcpu->async_pf.done) || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED || vcpu->arch.nmi_pending || (kvm_arch_interrupt_allowed(vcpu) && @@ -6109,6 +6138,83 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) } EXPORT_SYMBOL_GPL(kvm_set_rflags); +static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) +{ + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); +} + +static inline u32 kvm_async_pf_next_probe(u32 key) +{ + return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); +} + +static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 key = kvm_async_pf_hash_fn(gfn); + + while (vcpu->arch.apf.gfns[key] != ~0) + key = kvm_async_pf_next_probe(key); + + vcpu->arch.apf.gfns[key] = gfn; +} + +static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int i; + u32 key = kvm_async_pf_hash_fn(gfn); + + for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + (vcpu->arch.apf.gfns[key] != gfn || + vcpu->arch.apf.gfns[key] == ~0); i++) + key = kvm_async_pf_next_probe(key); + + return key; +} + +bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; +} + +static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 i, j, k; + + i = j = kvm_async_pf_gfn_slot(vcpu, gfn); + while (true) { + vcpu->arch.apf.gfns[i] = ~0; + do { + j = kvm_async_pf_next_probe(j); + if (vcpu->arch.apf.gfns[j] == ~0) + return; + k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); + /* + * k lies 
cyclically in ]i,j] + * | i.k.j | + * |....j i.k.| or |.k..j i...| + */ + } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); + vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; + i = j; + } +} + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_async_pf_not_present(work->gva); + + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + kvm_add_async_pf_gfn(vcpu, work->arch.gfn); +} + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_async_pf_ready(work->gva); + kvm_del_async_pf_gfn(vcpu, work->arch.gfn); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a0557422715e..e56acc7857e2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -40,6 +40,7 @@ #define KVM_REQ_KICK 9 #define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_EVENT 11 +#define KVM_REQ_APF_HALT 12 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 @@ -74,6 +75,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); +#ifdef CONFIG_KVM_ASYNC_PF +struct kvm_async_pf { + struct work_struct work; + struct list_head link; + struct list_head queue; + struct kvm_vcpu *vcpu; + struct mm_struct *mm; + gva_t gva; + unsigned long addr; + struct kvm_arch_async_pf arch; + struct page *page; + bool done; +}; + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + struct kvm_arch_async_pf *arch); +#endif + struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -104,6 +125,15 @@ struct kvm_vcpu { gpa_t mmio_phys_addr; #endif +#ifdef CONFIG_KVM_ASYNC_PF + struct { + u32 queued; + struct list_head queue; + struct list_head done; + spinlock_t lock; + } async_pf; +#endif + struct kvm_vcpu_arch arch; }; @@ -302,6 +332,7 @@ void kvm_set_page_accessed(struct page *page); pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 6dd3a51ab1cb..a78a5e574632 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -185,6 +185,96 @@ TRACE_EVENT(kvm_age_page, __entry->referenced ? "YOUNG" : "OLD") ); +#ifdef CONFIG_KVM_ASYNC_PF +TRACE_EVENT( + kvm_try_async_get_page, + TP_PROTO(bool async, u64 pfn), + TP_ARGS(async, pfn), + + TP_STRUCT__entry( + __field(__u64, pfn) + ), + + TP_fast_assign( + __entry->pfn = (!async) ? 
pfn : (u64)-1; + ), + + TP_printk("pfn %#llx", __entry->pfn) +); + +TRACE_EVENT( + kvm_async_pf_not_present, + TP_PROTO(u64 gva), + TP_ARGS(gva), + + TP_STRUCT__entry( + __field(__u64, gva) + ), + + TP_fast_assign( + __entry->gva = gva; + ), + + TP_printk("gva %#llx not present", __entry->gva) +); + +TRACE_EVENT( + kvm_async_pf_ready, + TP_PROTO(u64 gva), + TP_ARGS(gva), + + TP_STRUCT__entry( + __field(__u64, gva) + ), + + TP_fast_assign( + __entry->gva = gva; + ), + + TP_printk("gva %#llx ready", __entry->gva) +); + +TRACE_EVENT( + kvm_async_pf_completed, + TP_PROTO(unsigned long address, struct page *page, u64 gva), + TP_ARGS(address, page, gva), + + TP_STRUCT__entry( + __field(unsigned long, address) + __field(pfn_t, pfn) + __field(u64, gva) + ), + + TP_fast_assign( + __entry->address = address; + __entry->pfn = page ? page_to_pfn(page) : 0; + __entry->gva = gva; + ), + + TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva, + __entry->address, __entry->pfn) +); + +TRACE_EVENT( + kvm_async_pf_doublefault, + TP_PROTO(u64 gva, u64 gfn), + TP_ARGS(gva, gfn), + + TP_STRUCT__entry( + __field(u64, gva) + __field(u64, gfn) + ), + + TP_fast_assign( + __entry->gva = gva; + __entry->gfn = gfn; + ), + + TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) +); + +#endif + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7f1178f6b839..f63ccb0a5982 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE config KVM_MMIO bool + +config KVM_ASYNC_PF + bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c new file mode 100644 index 000000000000..857d63431cb7 --- /dev/null +++ b/virt/kvm/async_pf.c @@ -0,0 +1,190 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include +#include +#include +#include + +#include "async_pf.h" +#include + +static struct kmem_cache *async_pf_cache; + +int kvm_async_pf_init(void) +{ + async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); + + if (!async_pf_cache) + return -ENOMEM; + + return 0; +} + +void kvm_async_pf_deinit(void) +{ + if (async_pf_cache) + kmem_cache_destroy(async_pf_cache); + async_pf_cache = NULL; +} + +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) +{ + INIT_LIST_HEAD(&vcpu->async_pf.done); + INIT_LIST_HEAD(&vcpu->async_pf.queue); + spin_lock_init(&vcpu->async_pf.lock); +} + +static void async_pf_execute(struct work_struct *work) +{ + struct page *page = NULL; + struct kvm_async_pf *apf = + container_of(work, struct kvm_async_pf, work); + struct mm_struct *mm = apf->mm; + struct kvm_vcpu *vcpu = apf->vcpu; + unsigned long addr = apf->addr; + gva_t gva = apf->gva; + + might_sleep(); + + use_mm(mm); + down_read(&mm->mmap_sem); + get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); + up_read(&mm->mmap_sem); + unuse_mm(mm); + + spin_lock(&vcpu->async_pf.lock); + list_add_tail(&apf->link, &vcpu->async_pf.done); + apf->page = page; + apf->done = true; + spin_unlock(&vcpu->async_pf.lock); + + /* + * apf may be freed by kvm_check_async_pf_completion() after + * this point + */ + + trace_kvm_async_pf_completed(addr, page, gva); + + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + + mmdrop(mm); + kvm_put_kvm(vcpu->kvm); +} + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ + /* cancel outstanding work queue item */ + while (!list_empty(&vcpu->async_pf.queue)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.queue.next, + typeof(*work), queue); + cancel_work_sync(&work->work); + list_del(&work->queue); + if (!work->done) /* work was canceled */ + kmem_cache_free(async_pf_cache, work); + } + + spin_lock(&vcpu->async_pf.lock); + while (!list_empty(&vcpu->async_pf.done)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.done.next, + typeof(*work), link); + list_del(&work->link); + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); + } + spin_unlock(&vcpu->async_pf.lock); + + vcpu->async_pf.queued = 0; +} + +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + if (list_empty_careful(&vcpu->async_pf.done)) + return; + + spin_lock(&vcpu->async_pf.lock); + work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); + list_del(&work->link); + spin_unlock(&vcpu->async_pf.lock); + + kvm_arch_async_page_present(vcpu, work); + + list_del(&work->queue); + vcpu->async_pf.queued--; + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); +} + +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + struct kvm_arch_async_pf *arch) +{ + struct kvm_async_pf *work; + + if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) + return 0; + + /* setup delayed work */ + + /* + * do alloc nowait since if we are going to sleep anyway we + * may as well sleep faulting in page + */ + work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); + if (!work) + return 0; + + work->page = NULL; + work->done = false; + work->vcpu = vcpu; + work->gva = gva; + work->addr = gfn_to_hva(vcpu->kvm, gfn); + work->arch = *arch; + work->mm = current->mm; + atomic_inc(&work->mm->mm_count); + kvm_get_kvm(work->vcpu->kvm); + + /* this can't really happen otherwise gfn_to_pfn_async + would succeed */ + if (unlikely(kvm_is_error_hva(work->addr))) + goto retry_sync; + + 
INIT_WORK(&work->work, async_pf_execute); + if (!schedule_work(&work->work)) + goto retry_sync; + + list_add_tail(&work->queue, &vcpu->async_pf.queue); + vcpu->async_pf.queued++; + kvm_arch_async_page_not_present(vcpu, work); + return 1; +retry_sync: + kvm_put_kvm(work->vcpu->kvm); + mmdrop(work->mm); + kmem_cache_free(async_pf_cache, work); + return 0; +} diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h new file mode 100644 index 000000000000..e7ef6447cb82 --- /dev/null +++ b/virt/kvm/async_pf.h @@ -0,0 +1,36 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __KVM_ASYNC_PF_H__ +#define __KVM_ASYNC_PF_H__ + +#ifdef CONFIG_KVM_ASYNC_PF +int kvm_async_pf_init(void); +void kvm_async_pf_deinit(void); +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); +#else +#define kvm_async_pf_init() (0) +#define kvm_async_pf_deinit() do{}while(0) +#define kvm_async_pf_vcpu_init(C) do{}while(0) +#endif + +#endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5225052aebc1..75fd590c0214 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -55,6 +55,7 @@ #include #include "coalesced_mmio.h" +#include "async_pf.h" #define CREATE_TRACE_POINTS #include @@ -186,6 +187,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; init_waitqueue_head(&vcpu->wq); + kvm_async_pf_vcpu_init(vcpu); page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { @@ -946,15 +948,20 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_hva); -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, + bool *async) { struct page *page[1]; - int npages; + int npages = 0; pfn_t pfn; - if (atomic) + /* we can do it either atomically or asynchronously, not both */ + BUG_ON(atomic && async); + + if (atomic || async) npages = __get_user_pages_fast(addr, 1, 1, page); - else { + + if (unlikely(npages != 1) && !atomic) { might_sleep(); npages = get_user_pages_fast(addr, 1, 1, page); } @@ -976,6 +983,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) if (vma == NULL || addr < vma->vm_start || !(vma->vm_flags & VM_PFNMAP)) { + if (async && !(vma->vm_flags & VM_PFNMAP) && + (vma->vm_flags & VM_WRITE)) + *async = true; up_read(¤t->mm->mmap_sem); return_fault_page: get_page(fault_page); @@ -993,32 +1003,41 @@ return_fault_page: pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) { - return hva_to_pfn(kvm, addr, true); + return hva_to_pfn(kvm, addr, true, NULL); } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) { unsigned long addr; + if (async) + 
*async = false; + addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) { get_page(bad_page); return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, atomic); + return hva_to_pfn(kvm, addr, atomic, async); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, true); + return __gfn_to_pfn(kvm, gfn, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async) +{ + return __gfn_to_pfn(kvm, gfn, false, async); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_async); + pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false); + return __gfn_to_pfn(kvm, gfn, false, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn); @@ -1026,7 +1045,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false); + return hva_to_pfn(kvm, addr, false, NULL); } int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, @@ -2336,6 +2355,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, goto out_free_5; } + r = kvm_async_pf_init(); + if (r) + goto out_free; + kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; @@ -2343,7 +2366,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, r = misc_register(&kvm_dev); if (r) { printk(KERN_ERR "kvm: misc device register failed\n"); - goto out_free; + goto out_unreg; } kvm_preempt_ops.sched_in = kvm_sched_in; @@ -2353,6 +2376,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, return 0; +out_unreg: + kvm_async_pf_deinit(); out_free: kmem_cache_destroy(kvm_vcpu_cache); out_free_5: @@ -2385,6 +2410,7 @@ void kvm_exit(void) kvm_exit_debug(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); + kvm_async_pf_deinit(); sysdev_unregister(&kvm_sysdev); sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); -- cgit v1.2.3-71-gd317 From 49c7754ce57063b819b01eb8a4290841ad0886c4 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 18 Oct 2010 15:22:23 +0200 Subject: KVM: Add memory slot versioning and use it to provide fast guest write interface Keep track of memslots changes by keeping generation number in memslots structure. Provide kvm_write_guest_cached() function that skips gfn_to_hva() translation if memslots was not changed since previous invocation. 
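For illustration only (not part of the patch): a minimal usage sketch of the cached write interface. kvm_gfn_to_hva_cache_init() and kvm_write_guest_cached() are the helpers added below; example_init(), example_update() and apf_data_gpa are made-up names, and the snippet assumes the usual KVM kernel headers.

static struct gfn_to_hva_cache example_ghc;

/* Resolve gfn->hva once and record the current memslot generation. */
static int example_init(struct kvm *kvm, gpa_t apf_data_gpa)
{
	return kvm_gfn_to_hva_cache_init(kvm, &example_ghc, apf_data_gpa);
}

/*
 * Later writes skip the gfn_to_hva() lookup as long as
 * kvm->memslots->generation still matches example_ghc.generation;
 * otherwise kvm_write_guest_cached() re-initializes the cache first.
 */
static int example_update(struct kvm *kvm, u32 reason)
{
	return kvm_write_guest_cached(kvm, &example_ghc, &reason,
				      sizeof(reason));
}
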
Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 7 +++++ include/linux/kvm_types.h | 7 +++++ virt/kvm/kvm_main.c | 75 +++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 78 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 71beb27597fd..bd254779d1cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3190,6 +3190,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + slots->generation++; old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e56acc7857e2..e6748204cd56 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -199,6 +199,7 @@ struct kvm_irq_routing_table {}; struct kvm_memslots { int nmemslots; + u64 generation; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS]; }; @@ -352,12 +353,18 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 7ac0d4eee430..fa7cc7244cbd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -67,4 +67,11 @@ struct kvm_lapic_irq { u32 dest_id; }; +struct gfn_to_hva_cache { + u64 generation; + gpa_t gpa; + unsigned long hva; + struct kvm_memory_slot *memslot; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 75fd590c0214..228f00f87966 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -687,6 +687,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; old_memslots = kvm->memslots; @@ -721,6 +722,7 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; + slots->generation++; /* actual memory is freed via old in kvm_free_physmem_slot below */ if (!npages) { @@ -851,10 +853,10 @@ int kvm_is_error_hva(unsigned long addr) } EXPORT_SYMBOL_GPL(kvm_is_error_hva); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, + gfn_t gfn) { int i; - struct kvm_memslots *slots = kvm_memslots(kvm); for (i = 0; i < slots->nmemslots; ++i) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ 
-865,6 +867,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) } return NULL; } + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_memslot(kvm_memslots(kvm), gfn); +} EXPORT_SYMBOL_GPL(gfn_to_memslot); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) @@ -927,12 +934,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn) return memslot - slots->memslots; } -static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); @@ -944,7 +948,7 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { - return gfn_to_hva_many(kvm, gfn, NULL); + return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva); @@ -1054,7 +1058,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, unsigned long addr; gfn_t entry; - addr = gfn_to_hva_many(kvm, gfn, &entry); + addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); if (kvm_is_error_hva(addr)) return -1; @@ -1238,6 +1242,47 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, return 0; } +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int offset = offset_in_page(gpa); + gfn_t gfn = gpa >> PAGE_SHIFT; + + ghc->gpa = gpa; + ghc->generation = slots->generation; + ghc->memslot = __gfn_to_memslot(slots, gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); + if (!kvm_is_error_hva(ghc->hva)) + ghc->hva += offset; + else + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int r; + + if (slots->generation != ghc->generation) + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + + if (kvm_is_error_hva(ghc->hva)) + return -EFAULT; + + r = copy_to_user((void __user *)ghc->hva, data, len); + if (r) + return -EFAULT; + mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); @@ -1263,11 +1308,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn) { - struct kvm_memory_slot *memslot; - - memslot = gfn_to_memslot(kvm, gfn); if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -1275,6 +1318,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) } } +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot = gfn_to_memslot(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); +} + /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
*/ -- cgit v1.2.3-71-gd317 From 344d9588a9df06182684168be4f1408b55c7da3e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:50 +0200 Subject: KVM: Add PV MSR to enable asynchronous page faults delivery. Guest enables async PF vcpu functionality using this MSR. Reviewed-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- Documentation/kvm/cpuid.txt | 3 +++ Documentation/kvm/msr.txt | 35 ++++++++++++++++++++++++++++++++++- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/kvm_para.h | 4 ++++ arch/x86/kvm/x86.c | 38 ++++++++++++++++++++++++++++++++++++-- include/linux/kvm.h | 1 + include/linux/kvm_host.h | 1 + virt/kvm/async_pf.c | 20 ++++++++++++++++++++ 8 files changed, 101 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt index 14a12ea92b7f..882068538c9c 100644 --- a/Documentation/kvm/cpuid.txt +++ b/Documentation/kvm/cpuid.txt @@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP || 2 || deprecated. KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs || || 0x4b564d00 and 0x4b564d01 ------------------------------------------------------------------------------ +KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by + || || writing to msr 0x4b564d02 +------------------------------------------------------------------------------ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt index 8ddcfe84c09a..e67b4a8783df 100644 --- a/Documentation/kvm/msr.txt +++ b/Documentation/kvm/msr.txt @@ -3,7 +3,6 @@ Glauber Costa , Red Hat Inc, 2010 ===================================================== KVM makes use of some custom MSRs to service some requests. -At present, this facility is only used by kvmclock. Custom MSRs have a range reserved for them, that goes from 0x4b564d00 to 0x4b564dff. There are MSRs outside this area, @@ -151,3 +150,37 @@ MSR_KVM_SYSTEM_TIME: 0x12 return PRESENT; } else return NON_PRESENT; + +MSR_KVM_ASYNC_PF_EN: 0x4b564d02 + data: Bits 63-6 hold 64-byte aligned physical address of a + 64 byte memory area which must be in guest RAM and must be + zeroed. Bits 5-1 are reserved and should be zero. Bit 0 is 1 + when asynchronous page faults are enabled on the vcpu 0 when + disabled. + + First 4 byte of 64 byte memory location will be written to by + the hypervisor at the time of asynchronous page fault (APF) + injection to indicate type of asynchronous page fault. Value + of 1 means that the page referred to by the page fault is not + present. Value 2 means that the page is now available. Disabling + interrupt inhibits APFs. Guest must not enable interrupt + before the reason is read, or it may be overwritten by another + APF. Since APF uses the same exception vector as regular page + fault guest must reset the reason to 0 before it does + something that can generate normal page fault. If during page + fault APF reason is 0 it means that this is regular page + fault. + + During delivery of type 1 APF cr2 contains a token that will + be used to notify a guest when missing page becomes + available. When page becomes available type 2 APF is sent with + cr2 set to the token associated with the page. There is special + kind of token 0xffffffff which tells vcpu that it should wake + up all processes waiting for APFs and no individual type 2 APFs + will be sent. 
+ + If APF is disabled while there are outstanding APFs, they will + not be delivered. + + Currently type 2 APF will be always delivered on the same vcpu as + type 1 was, but guest should not rely on that. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c3076bcf5ef7..0d7039804b4c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -419,6 +419,8 @@ struct kvm_vcpu_arch { struct { bool halted; gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; + struct gfn_to_hva_cache data; + u64 msr_val; } apf; }; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index e3faaaf4301e..8662ae0a035c 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -20,6 +20,7 @@ * are available. The use of 0x11 and 0x12 is deprecated */ #define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_ASYNC_PF 4 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -32,9 +33,12 @@ /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define KVM_MAX_MMU_OP_BATCH 32 +#define KVM_ASYNC_PF_ENABLED (1 << 0) + /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 #define KVM_MMU_OP_FLUSH_TLB 2 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd254779d1cc..063c07296764 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -783,12 +783,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 7 +#define KVM_SAVE_MSRS_BEGIN 8 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1425,6 +1425,29 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } +static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) +{ + gpa_t gpa = data & ~0x3f; + + /* Bits 1:5 are resrved, Should be zero */ + if (data & 0x3e) + return 1; + + vcpu->arch.apf.msr_val = data; + + if (!(data & KVM_ASYNC_PF_ENABLED)) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + return 0; + } + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + return 1; + + kvm_async_pf_wakeup_all(vcpu); + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1506,6 +1529,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } break; } + case MSR_KVM_ASYNC_PF_EN: + if (kvm_pv_enable_async_pf(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... 
MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1782,6 +1809,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_SYSTEM_TIME_NEW: data = vcpu->arch.time; break; + case MSR_KVM_ASYNC_PF_EN: + data = vcpu->arch.apf.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -1929,6 +1959,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: + case KVM_CAP_ASYNC_PF: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -5792,6 +5823,8 @@ free_vcpu: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + vcpu->arch.apf.msr_val = 0; + vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -5811,6 +5844,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = DR7_FIXED_1; kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.apf.msr_val = 0; kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 919ae53adc5c..ea2dc1a2e13d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -540,6 +540,7 @@ struct kvm_ppc_pvinfo { #endif #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 +#define KVM_CAP_ASYNC_PF 59 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e6748204cd56..ee4314e15ead 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -93,6 +93,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, struct kvm_arch_async_pf *arch); +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif struct kvm_vcpu { diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index e97eae965a4c..1f59498561b2 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -190,3 +190,23 @@ retry_sync: kmem_cache_free(async_pf_cache, work); return 0; } + +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + if (!list_empty(&vcpu->async_pf.done)) + return 0; + + work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + work->page = bad_page; + get_page(bad_page); + INIT_LIST_HEAD(&work->queue); /* for list_del to work */ + + list_add_tail(&work->link, &vcpu->async_pf.done); + vcpu->async_pf.queued++; + return 0; +} -- cgit v1.2.3-71-gd317 From 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 22 Oct 2010 14:18:18 -0200 Subject: KVM: propagate fault r/w information to gup(), allow read-only memory As suggested by Andrea, pass r/w error code to gup(), upgrading read fault to writable if host pte allows it. 
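For illustration only (not part of the patch): the calling pattern this change introduces. gfn_to_pfn_prot() is the new helper; example_map() is a made-up caller showing how even a read fault can come back with a writable host mapping via *writable, so the spte can be made writable up front instead of taking a second fault later.

static int example_map(struct kvm_vcpu *vcpu, gfn_t gfn, bool write_fault)
{
	bool map_writable;
	pfn_t pfn;

	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &map_writable);
	if (is_error_pfn(pfn))
		return -EFAULT;	/* the real code calls kvm_handle_bad_page() */

	/*
	 * A read fault whose host pte turned out to be writable returns
	 * map_writable == true; otherwise the caller must drop write
	 * permission (ACC_WRITE_MASK), as __direct_map()/FNAME(fetch)
	 * do in the hunks below.
	 */
	kvm_release_pfn_clean(pfn);
	return map_writable;
}
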
Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 27 +++++++++++++++--------- arch/x86/kvm/paging_tmpl.h | 13 ++++++++---- include/linux/kvm_host.h | 5 ++++- virt/kvm/kvm_main.c | 51 +++++++++++++++++++++++++++++++++++++--------- 4 files changed, 71 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 99433943170c..53509f5973db 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2216,7 +2216,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int level, gfn_t gfn, pfn_t pfn) + int map_writable, int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2225,9 +2225,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + unsigned pte_access = ACC_ALL; + + if (!map_writable) + pte_access &= ~ACC_WRITE_MASK; + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 0, write, 1, &pt_write, - level, gfn, pfn, false, true); + level, gfn, pfn, false, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2288,6 +2292,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int level; pfn_t pfn; unsigned long mmu_seq; + bool map_writable; level = mapping_level(vcpu, gfn); @@ -2302,7 +2307,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); + pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, &map_writable); /* mmio */ if (is_error_pfn(pfn)) @@ -2312,7 +2317,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, level, gfn, pfn); + r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2611,11 +2616,11 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) } static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn, - gva_t gva, pfn_t *pfn) + gva_t gva, pfn_t *pfn, bool write, bool *writable) { bool async; - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async); + *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); if (!async) return false; /* *pfn has correct page already */ @@ -2632,7 +2637,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn, return true; } - *pfn = gfn_to_pfn(vcpu->kvm, gfn); + *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); return false; } @@ -2645,6 +2650,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; + int write = error_code & PFERR_WRITE_MASK; + bool map_writable; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -2660,7 +2667,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn)) + if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn, write, &map_writable)) return 0; /* mmio */ @@ -2670,7 +2677,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto 
out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d6b281e989b1..ba00eefa7bcd 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -427,7 +427,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, - int *ptwrite, pfn_t pfn) + int *ptwrite, pfn_t pfn, bool map_writable) { unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; @@ -501,7 +501,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, - gw->gfn, pfn, false, true); + gw->gfn, pfn, false, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); return it.sptep; @@ -539,6 +539,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; unsigned long mmu_seq; + bool map_writable; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -569,13 +570,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn)) + if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn, write_fault, + &map_writable)) return 0; /* mmio */ if (is_error_pfn(pfn)) return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); + if (!map_writable) + walker.pte_access &= ~ACC_WRITE_MASK; + spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; @@ -583,7 +588,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &write_pt, pfn); + level, &write_pt, pfn, map_writable); (void)sptep; pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ee4314e15ead..462b982fedfb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -334,8 +334,11 @@ void kvm_set_page_accessed(struct page *page); pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async); +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); int memslot_id(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 475a100f3a22..2803b4db2a38 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -959,7 +959,7 @@ static pfn_t get_fault_pfn(void) } static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, - bool *async) + bool *async, bool write_fault, bool *writable) { struct page *page[1]; int npages = 0; @@ -968,12 +968,34 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, /* we can do it 
either atomically or asynchronously, not both */ BUG_ON(atomic && async); + BUG_ON(!write_fault && !writable); + + if (writable) + *writable = true; + if (atomic || async) npages = __get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1) && !atomic) { might_sleep(); - npages = get_user_pages_fast(addr, 1, 1, page); + + if (writable) + *writable = write_fault; + + npages = get_user_pages_fast(addr, 1, write_fault, page); + + /* map read fault as writable if possible */ + if (unlikely(!write_fault) && npages == 1) { + struct page *wpage[1]; + + npages = __get_user_pages_fast(addr, 1, 1, wpage); + if (npages == 1) { + *writable = true; + put_page(page[0]); + page[0] = wpage[0]; + } + npages = 1; + } } if (unlikely(npages != 1)) { @@ -1011,11 +1033,12 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) { - return hva_to_pfn(kvm, addr, true, NULL); + return hva_to_pfn(kvm, addr, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, + bool write_fault, bool *writable) { unsigned long addr; @@ -1028,32 +1051,40 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async) return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, atomic, async); + return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, true, NULL); + return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async) +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable) { - return __gfn_to_pfn(kvm, gfn, false, async); + return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); } EXPORT_SYMBOL_GPL(gfn_to_pfn_async); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false, NULL); + return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn); +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false, NULL); + return hva_to_pfn(kvm, addr, false, NULL, true, NULL); } int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, -- cgit v1.2.3-71-gd317 From 515a01279a187415322a80736800a7d6325876ab Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Oct 2010 18:23:54 +0900 Subject: KVM: pre-allocate one more dirty bitmap to avoid vmalloc() Currently x86's kvm_vm_ioctl_get_dirty_log() needs to allocate a bitmap by vmalloc() which will be used in the next logging and this has been causing bad effect to VGA and live-migration: vmalloc() consumes extra systime, triggers tlb flush, etc. This patch resolves this issue by pre-allocating one more bitmap and switching between two bitmaps during dirty logging. Performance improvement: I measured performance for the case of VGA update by trace-cmd. The result was 1.5 times faster than the original one. 
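For illustration only (not part of the patch): the double-buffering idea in one helper. dirty_bitmap_head is the new field pointing at a single allocation of twice the bitmap size, n stands for kvm_dirty_bitmap_bytes(slot), and example_pick_next_bitmap() is a made-up name; the pointer arithmetic mirrors the kvm_vm_ioctl_get_dirty_log() hunk below.

static unsigned long *example_pick_next_bitmap(struct kvm_memory_slot *slot,
					       unsigned long n)
{
	unsigned long *next = slot->dirty_bitmap_head;

	/* If the first half is currently live, hand out the second half. */
	if (slot->dirty_bitmap == next)
		next += n / sizeof(long);

	memset(next, 0, n);	/* the next logging round starts clean */
	return next;
}
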
In the case of live migration, the improvement ratio depends on the workload and the guest memory size. In general, the larger the memory size is the more benefits we get. Note: This does not change other architectures's logic but the allocation size becomes twice. This will increase the actual memory consumption only when the new size changes the number of pages allocated by vmalloc(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 16 +++++----------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 11 +++++++++-- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a2a785472431..35f82f2c66f6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3208,18 +3208,15 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memslots *slots, *old_slots; unsigned long *dirty_bitmap; - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; + dirty_bitmap = memslot->dirty_bitmap_head; + if (memslot->dirty_bitmap == dirty_bitmap) + dirty_bitmap += n / sizeof(long); memset(dirty_bitmap, 0, n); r = -ENOMEM; slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) { - vfree(dirty_bitmap); + if (!slots) goto out; - } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; slots->generation++; @@ -3235,11 +3232,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { - vfree(dirty_bitmap); + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) goto out; - } - vfree(dirty_bitmap); } else { r = -EFAULT; if (clear_user(log->dirty_bitmap, n)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 462b982fedfb..bcf71c7730f0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,6 +150,7 @@ struct kvm_memory_slot { unsigned long flags; unsigned long *rmap; unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_head; struct { unsigned long rmap_pde; int write_count; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0021c2862140..27649fdaa007 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -449,8 +449,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return; - vfree(memslot->dirty_bitmap); + vfree(memslot->dirty_bitmap_head); memslot->dirty_bitmap = NULL; + memslot->dirty_bitmap_head = NULL; } /* @@ -537,15 +538,21 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +/* + * Allocation size is twice as large as the actual dirty bitmap size. + * This makes it possible to do double buffering: see x86's + * kvm_vm_ioctl_get_dirty_log(). + */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { - unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); + unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = vmalloc(dirty_bytes); if (!memslot->dirty_bitmap) return -ENOMEM; memset(memslot->dirty_bitmap, 0, dirty_bytes); + memslot->dirty_bitmap_head = memslot->dirty_bitmap; return 0; } -- cgit v1.2.3-71-gd317 From d89f5eff70a31237ffa1e21c51d23ca532110aea Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Nov 2010 17:02:49 +0100 Subject: KVM: Clean up vm creation and release IA64 support forces us to abstract the allocation of the kvm structure. 
But instead of mixing this up with arch-specific initialization and doing the same on destruction, split both steps. This allows to move generic destruction calls into generic code. It also fixes error clean-up on failures of kvm_create_vm for IA64. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 4 ++++ arch/ia64/kvm/kvm-ia64.c | 28 +++++++--------------------- arch/powerpc/kvm/powerpc.c | 20 +++----------------- arch/s390/kvm/kvm-s390.c | 23 ++++++----------------- arch/x86/kvm/x86.c | 12 ++---------- include/linux/kvm_host.h | 15 ++++++++++++++- virt/kvm/kvm_main.c | 19 +++++++++++++------ 7 files changed, 49 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 2f229e5de498..2689ee54a1c9 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); void kvm_sal_emul(struct kvm_vcpu *vcpu); +#define __KVM_HAVE_ARCH_VM_ALLOC 1 +struct kvm *kvm_arch_alloc_vm(void); +void kvm_arch_free_vm(struct kvm *kvm); + #endif /* __ASSEMBLY__*/ #endif diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index f56a6316e134..48a48bdc59c3 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -749,7 +749,7 @@ out: return r; } -static struct kvm *kvm_alloc_kvm(void) +struct kvm *kvm_arch_alloc_vm(void) { struct kvm *kvm; @@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void) vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE)); if (!vm_base) - return ERR_PTR(-ENOMEM); + return NULL; memset((void *)vm_base, 0, KVM_VM_DATA_SIZE); kvm = (struct kvm *)(vm_base + @@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm) #define GUEST_PHYSICAL_RR4 0x2739 #define VMM_INIT_RR 0x1660 -static void kvm_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm) { BUG_ON(!kvm); + kvm->arch.is_sn2 = ia64_platform_is("sn2"); + kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4; kvm->arch.vmm_init_rr = VMM_INIT_RR; @@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); -} - -struct kvm *kvm_arch_create_vm(void) -{ - struct kvm *kvm = kvm_alloc_kvm(); - - if (IS_ERR(kvm)) - return ERR_PTR(-ENOMEM); - - kvm->arch.is_sn2 = ia64_platform_is("sn2"); - - kvm_init_vm(kvm); - - return kvm; + return 0; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, @@ -1357,7 +1346,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -EINVAL; } -static void free_kvm(struct kvm *kvm) +void kvm_arch_free_vm(struct kvm *kvm) { unsigned long vm_base = kvm->arch.vm_base; @@ -1399,9 +1388,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) #endif kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); - kvm_free_physmem(kvm); - cleanup_srcu_struct(&kvm->srcu); - free_kvm(kvm); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 38f756f25053..99758460efde 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = kvmppc_core_check_processor_compat(); } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm) { - 
struct kvm *kvm; - - kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - if (!kvm) - return ERR_PTR(-ENOMEM); - - return kvm; + return 0; } -static void kvmppc_free_vcpus(struct kvm *kvm) +void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; struct kvm_vcpu *vcpu; @@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { } -void kvm_arch_destroy_vm(struct kvm *kvm) -{ - kvmppc_free_vcpus(kvm); - kvm_free_physmem(kvm); - cleanup_srcu_struct(&kvm->srcu); - kfree(kvm); -} - int kvm_dev_ioctl_check_extension(long ext) { int r; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 985d825494f1..bade533ba288 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm) { - struct kvm *kvm; int rc; char debug_name[16]; rc = s390_enable_sie(); if (rc) - goto out_nokvm; - - rc = -ENOMEM; - kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - if (!kvm) - goto out_nokvm; + goto out_err; kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); if (!kvm->arch.sca) - goto out_nosca; + goto out_err; sprintf(debug_name, "kvm-%u", current->pid); @@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "%s", "vm created"); - return kvm; + return 0; out_nodbf: free_page((unsigned long)(kvm->arch.sca)); -out_nosca: - kfree(kvm); -out_nokvm: - return ERR_PTR(rc); +out_err: + return rc; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); - cleanup_srcu_struct(&kvm->srcu); - kfree(kvm); } /* Section: vcpu related */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5daead833669..b7ee61d5bc81 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5961,13 +5961,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm) { - struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - - if (!kvm) - return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5976,7 +5971,7 @@ struct kvm *kvm_arch_create_vm(void) spin_lock_init(&kvm->arch.tsc_write_lock); - return kvm; + return 0; } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -6021,13 +6016,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); if (kvm->arch.apic_access_page) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); - cleanup_srcu_struct(&kvm->srcu); - kfree(kvm); } int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bcf71c7730f0..2d63f2c0137c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -441,7 +442,19 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); -struct kvm *kvm_arch_create_vm(void); +#ifndef __KVM_HAVE_ARCH_VM_ALLOC +static inline struct kvm *kvm_arch_alloc_vm(void) +{ + return 
kzalloc(sizeof(struct kvm), GFP_KERNEL); +} + +static inline void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(kvm); +} +#endif + +int kvm_arch_init_vm(struct kvm *kvm); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fce0578eab0e..4023264c4cd5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -383,11 +383,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) static struct kvm *kvm_create_vm(void) { - int r = 0, i; - struct kvm *kvm = kvm_arch_create_vm(); + int r, i; + struct kvm *kvm = kvm_arch_alloc_vm(); - if (IS_ERR(kvm)) - goto out; + if (!kvm) + return ERR_PTR(-ENOMEM); + + r = kvm_arch_init_vm(kvm); + if (r) + goto out_err_nodisable; r = hardware_enable_all(); if (r) @@ -427,7 +431,7 @@ static struct kvm *kvm_create_vm(void) spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); -out: + return kvm; out_err: @@ -438,7 +442,7 @@ out_err_nodisable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); kfree(kvm->memslots); - kfree(kvm); + kvm_arch_free_vm(kvm); return ERR_PTR(r); } @@ -512,6 +516,9 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_flush_shadow(kvm); #endif kvm_arch_destroy_vm(kvm); + kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); + kvm_arch_free_vm(kvm); hardware_disable_all(); mmdrop(mm); } -- cgit v1.2.3-71-gd317 From 0645211c43df0b96c51e12980066b3227e10b164 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:03 +0100 Subject: KVM: Switch assigned device IRQ forwarding to threaded handler This improves the IRQ forwarding for assigned devices: By using the kernel's threaded IRQ scheme, we can get rid of the latency-prone work queue and simplify the code in the same run. Moreover, we no longer have to hold assigned_dev_lock while raising the guest IRQ, which can be a lenghty operation as we may have to iterate over all VCPUs. The lock is now only used for synchronizing masking vs. unmasking of INTx-type IRQs, thus is renames to intx_lock. Acked-by: Alex Williamson Acked-by: Michael S. 
Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 12 +----- virt/kvm/assigned-dev.c | 107 +++++++++++++++-------------------------------- 2 files changed, 36 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2d63f2c0137c..9fe7fefe76b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -470,16 +470,8 @@ struct kvm_irq_ack_notifier { void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; -#define KVM_ASSIGNED_MSIX_PENDING 0x1 -struct kvm_guest_msix_entry { - u32 vector; - u16 entry; - u16 flags; -}; - struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; - struct work_struct interrupt_work; struct list_head list; int assigned_dev_id; int host_segnr; @@ -490,13 +482,13 @@ struct kvm_assigned_dev_kernel { bool host_irq_disabled; struct msix_entry *host_msix_entries; int guest_irq; - struct kvm_guest_msix_entry *guest_msix_entries; + struct msix_entry *guest_msix_entries; unsigned long irq_requested_type; int irq_source_id; int flags; struct pci_dev *dev; struct kvm *kvm; - spinlock_t assigned_dev_lock; + spinlock_t intx_lock; }; struct kvm_irq_mask_notifier { diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ecc4419fadca..1d77ce16360a 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) { - struct kvm_assigned_dev_kernel *assigned_dev; - int i; + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + u32 vector; + int index; - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { + spin_lock(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock(&assigned_dev->intx_lock); + } - spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - struct kvm_guest_msix_entry *guest_entries = - assigned_dev->guest_msix_entries; - for (i = 0; i < assigned_dev->entries_nr; i++) { - if (!(guest_entries[i].flags & - KVM_ASSIGNED_MSIX_PENDING)) - continue; - guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; + index = find_index_from_host_irq(assigned_dev, irq); + if (index >= 0) { + vector = assigned_dev-> + guest_msix_entries[index].vector; kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, - guest_entries[i].vector, 1); + assigned_dev->irq_source_id, vector, 1); } } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); - spin_unlock_irq(&assigned_dev->assigned_dev_lock); -} - -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - unsigned long flags; - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int index = find_index_from_host_irq(assigned_dev, irq); - if (index < 0) - goto out; - assigned_dev->guest_msix_entries[index].flags |= - KVM_ASSIGNED_MSIX_PENDING; - } - - schedule_work(&assigned_dev->interrupt_work); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - disable_irq_nosync(irq); - 
assigned_dev->host_irq_disabled = true; - } - -out: - spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); return IRQ_HANDLED; } @@ -114,7 +87,6 @@ out: static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_assigned_dev_kernel *dev; - unsigned long flags; if (kian->gsi == -1) return; @@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. */ - spin_lock_irqsave(&dev->assigned_dev_lock, flags); + spin_lock(&dev->intx_lock); if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } - spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); + spin_unlock(&dev->intx_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -155,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { /* - * In kvm_free_device_irq, cancel_work_sync return true if: - * 1. work is scheduled, and then cancelled. - * 2. work callback is executed. - * - * The first one ensured that the irq is disabled and no more events - * would happen. But for the second one, the irq may be enabled (e.g. - * for MSI). So we disable irq here to prevent further events. + * We disable irq here to prevent further events. * * Notice this maybe result in nested disable if the interrupt type is * INTx, but it's OK for we are going to free it. * * If this function is a part of VM destroy, please ensure that till * now, the kvm state is still legal for probably we also have to wait - * interrupt_work done. + * on a currently running IRQ handler. */ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int i; for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq_nosync(assigned_dev-> - host_msix_entries[i].vector); - - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_msix_entries[i].vector); for (i = 0; i < assigned_dev->entries_nr; i++) free_irq(assigned_dev->host_msix_entries[i].vector, @@ -188,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq_nosync(assigned_dev->host_irq); - cancel_work_sync(&assigned_dev->interrupt_work); + disable_irq(assigned_dev->host_irq); free_irq(assigned_dev->host_irq, (void *)assigned_dev); @@ -268,8 +230,9 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * on the same interrupt line is not a happy situation: there * are going to be long delays in accepting, acking, etc. 
*/ - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, - 0, "kvm_assigned_intx_device", (void *)dev)) + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + IRQF_ONESHOT, "kvm_assigned_intx_device", + (void *)dev)) return -EIO; return 0; } @@ -287,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_msi_device", (void *)dev)) { + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + 0, "kvm_assigned_msi_device", (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -313,10 +276,10 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, return r; for (i = 0; i < dev->entries_nr; i++) { - r = request_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_intr, 0, - "kvm_assigned_msix_device", - (void *)dev); + r = request_threaded_irq(dev->host_msix_entries[i].vector, + NULL, kvm_assigned_dev_thread, + 0, "kvm_assigned_msix_device", + (void *)dev); if (r) goto err; } @@ -557,12 +520,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; match->dev = dev; - spin_lock_init(&match->assigned_dev_lock); + spin_lock_init(&match->intx_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); list_add(&match->list, &kvm->arch.assigned_dev_head); @@ -654,9 +615,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, r = -ENOMEM; goto msix_nr_out; } - adev->guest_msix_entries = kzalloc( - sizeof(struct kvm_guest_msix_entry) * - entry_nr->entry_nr, GFP_KERNEL); + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); if (!adev->guest_msix_entries) { kfree(adev->host_msix_entries); r = -ENOMEM; -- cgit v1.2.3-71-gd317 From 1e001d49f9f9a0e3eb72939ad49d9a2c7754e9c1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 16 Nov 2010 22:30:04 +0100 Subject: KVM: Refactor IRQ names of assigned devices Cosmetic change, but it helps to correlate IRQs with PCI devices. Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 1 + virt/kvm/assigned-dev.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9fe7fefe76b1..4bd663d6443d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -489,6 +489,7 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; spinlock_t intx_lock; + char irq_name[32]; }; struct kvm_irq_mask_notifier { diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 1d77ce16360a..7623408dbc81 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -231,8 +231,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * are going to be long delays in accepting, acking, etc. 
*/ if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, "kvm_assigned_intx_device", - (void *)dev)) + IRQF_ONESHOT, dev->irq_name, (void *)dev)) return -EIO; return 0; } @@ -251,7 +250,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, dev->host_irq = dev->dev->irq; if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, "kvm_assigned_msi_device", (void *)dev)) { + 0, dev->irq_name, (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -278,8 +277,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, for (i = 0; i < dev->entries_nr; i++) { r = request_threaded_irq(dev->host_msix_entries[i].vector, NULL, kvm_assigned_dev_thread, - 0, "kvm_assigned_msix_device", - (void *)dev); + 0, dev->irq_name, (void *)dev); if (r) goto err; } @@ -336,6 +334,9 @@ static int assign_host_irq(struct kvm *kvm, if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) return r; + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + switch (host_irq_type) { case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); -- cgit v1.2.3-71-gd317 From bd2b53b20fcd0d6c4c815b54e6d464e34429d3a4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 18 Nov 2010 19:09:08 +0200 Subject: KVM: fast-path msi injection with irqfd Store irq routing table pointer in the irqfd object, and use that to inject MSI directly without bouncing out to a kernel thread. While we touch this structure, rearrange irqfd fields to make fastpath better packed for better cache utilization. This also adds some comments about locking rules and rcu usage in code. Some notes on the design: - Use pointer into the rt instead of copying an entry, to make it possible to use rcu, thus side-stepping locking complexities. We also save some memory this way. - Old workqueue code is still used for level irqs. I don't think we DTRT with level anyway, however, it seems easier to keep the code around as it has been thought through and debugged, and fix level later than rip out and re-instate it later. Signed-off-by: Michael S. Tsirkin Acked-by: Marcelo Tosatti Acked-by: Gregory Haskins Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 16 +++++++++ virt/kvm/eventfd.c | 91 +++++++++++++++++++++++++++++++++++++++++------- virt/kvm/irq_comm.c | 7 ++-- 3 files changed, 99 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4bd663d6443d..f17beae3cca0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -240,6 +241,10 @@ struct kvm { struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP + /* + * Update side is protected by irq_lock and, + * if configured, irqfds.lock. 
+ */ struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head mask_notifier_list; struct hlist_head irq_ack_notifier_list; @@ -511,6 +516,8 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, unsigned long *deliver_bitmask); #endif int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, + int irq_source_id, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); @@ -652,17 +659,26 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} void kvm_eventfd_init(struct kvm *kvm); int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); void kvm_irqfd_release(struct kvm *kvm); +void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); #else static inline void kvm_eventfd_init(struct kvm *kvm) {} + static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) { return -EINVAL; } static inline void kvm_irqfd_release(struct kvm *kvm) {} + +static inline void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + rcu_assign_pointer(kvm->irq_routing, irq_rt); +} + static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { return -ENOSYS; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c1f1e3c62984..2ca4535f4fb7 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -44,14 +44,19 @@ */ struct _irqfd { - struct kvm *kvm; - struct eventfd_ctx *eventfd; - int gsi; - struct list_head list; - poll_table pt; - wait_queue_t wait; - struct work_struct inject; - struct work_struct shutdown; + /* Used for MSI fast-path */ + struct kvm *kvm; + wait_queue_t wait; + /* Update side is protected by irqfds.lock */ + struct kvm_kernel_irq_routing_entry __rcu *irq_entry; + /* Used for level IRQ fast-path */ + int gsi; + struct work_struct inject; + /* Used for setup/shutdown */ + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct work_struct shutdown; }; static struct workqueue_struct *irqfd_cleanup_wq; @@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); unsigned long flags = (unsigned long)key; + struct kvm_kernel_irq_routing_entry *irq; + struct kvm *kvm = irqfd->kvm; - if (flags & POLLIN) + if (flags & POLLIN) { + rcu_read_lock(); + irq = rcu_dereference(irqfd->irq_entry); /* An event has been signaled, inject an interrupt */ - schedule_work(&irqfd->inject); + if (irq) + kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); + else + schedule_work(&irqfd->inject); + rcu_read_unlock(); + } if (flags & POLLHUP) { /* The eventfd is closing, detach from KVM */ - struct kvm *kvm = irqfd->kvm; unsigned long flags; spin_lock_irqsave(&kvm->irqfds.lock, flags); @@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, add_wait_queue(wqh, &irqfd->wait); } +/* Must be called under irqfds.lock */ +static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, + struct kvm_irq_routing_table *irq_rt) +{ + struct kvm_kernel_irq_routing_entry *e; + struct hlist_node *n; + + if (irqfd->gsi >= irq_rt->nr_rt_entries) { + rcu_assign_pointer(irqfd->irq_entry, NULL); + return; + } + + hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { + /* Only fast-path MSI. 
*/ + if (e->type == KVM_IRQ_ROUTING_MSI) + rcu_assign_pointer(irqfd->irq_entry, e); + else + rcu_assign_pointer(irqfd->irq_entry, NULL); + } +} + static int kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) { + struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; struct file *file = NULL; struct eventfd_ctx *eventfd = NULL; @@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) goto fail; } + irq_rt = rcu_dereference_protected(kvm->irq_routing, + lockdep_is_held(&kvm->irqfds.lock)); + irqfd_update(kvm, irqfd, irq_rt); + events = file->f_op->poll(file, &irqfd->pt); list_add_tail(&irqfd->list, &kvm->irqfds.items); @@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { + /* + * This rcu_assign_pointer is needed for when + * another thread calls kvm_irqfd_update before + * we flush workqueue below. + * It is paired with synchronize_rcu done by caller + * of that function. + */ + rcu_assign_pointer(irqfd->irq_entry, NULL); irqfd_deactivate(irqfd); + } } spin_unlock_irq(&kvm->irqfds.lock); @@ -321,6 +369,25 @@ kvm_irqfd_release(struct kvm *kvm) } +/* + * Change irq_routing and irqfd. + * Caller must invoke synchronize_rcu afterwards. + */ +void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + struct _irqfd *irqfd; + + spin_lock_irq(&kvm->irqfds.lock); + + rcu_assign_pointer(kvm->irq_routing, irq_rt); + + list_for_each_entry(irqfd, &kvm->irqfds.items, list) + irqfd_update(kvm, irqfd, irq_rt); + + spin_unlock_irq(&kvm->irqfds.lock); +} + /* * create a host-wide workqueue for issuing deferred shutdown requests * aggregated from all vm* instances. We need our own isolated single-thread diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 8edca9141b78..9f614b4e365f 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level) +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) { struct kvm_lapic_irq irq; @@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm, mutex_lock(&kvm->irq_lock); old = kvm->irq_routing; - rcu_assign_pointer(kvm->irq_routing, new); + kvm_irq_routing_update(kvm, new); mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); new = old; -- cgit v1.2.3-71-gd317 From 27923eb19c5d1197bd9d1472abdc2e749f21387a Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 25 Nov 2010 10:25:44 +0100 Subject: KVM: PPC: Fix compile warning KVM compilation fails with the following warning: include/linux/kvm_host.h: In function 'kvm_irq_routing_update': include/linux/kvm_host.h:679:2: error: 'struct kvm' has no member named 'irq_routing' That function is only used and reasonable to have on systems that implement an in-kernel interrupt chip. PPC doesn't. Fix by #ifdef'ing it out when no irqchip is available. 
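[Editorial sketch, not part of the original commit] The underlying C rule is that a static inline defined in a header is parsed and type-checked in every translation unit that includes it, even if it is never called, so a stub that touches a member which only exists under CONFIG_HAVE_KVM_IRQCHIP must itself be guarded by that symbol. A minimal, hypothetical illustration (CONFIG_HAVE_IRQCHIP, struct vm and vm_routing_update are made-up names, not KVM code):

	struct vm {
	#ifdef CONFIG_HAVE_IRQCHIP
		void *irq_routing;	/* only present with an in-kernel irqchip */
	#endif
		int id;
	};

	#ifdef CONFIG_HAVE_IRQCHIP	/* the guard that makes the stub safe */
	static inline void vm_routing_update(struct vm *vm, void *rt)
	{
		vm->irq_routing = rt;	/* would not compile if the member is absent */
	}
	#endif

The patch below applies exactly this guard to the !CONFIG_HAVE_KVM_EVENTFD fallback of kvm_irq_routing_update().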
Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f17beae3cca0..da0794f707f6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -673,11 +673,13 @@ static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) static inline void kvm_irqfd_release(struct kvm *kvm) {} +#ifdef CONFIG_HAVE_KVM_IRQCHIP static inline void kvm_irq_routing_update(struct kvm *kvm, struct kvm_irq_routing_table *irq_rt) { rcu_assign_pointer(kvm->irq_routing, irq_rt); } +#endif static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { -- cgit v1.2.3-71-gd317 From a4ee1ca4a36e7857d90ae8c2b85f1bde9a042c10 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 23 Nov 2010 11:13:00 +0800 Subject: KVM: MMU: delay flush all tlbs on sync_page path Quote from Avi: | I don't think we need to flush immediately; set a "tlb dirty" bit somewhere | that is cleareded when we flush the tlb. kvm_mmu_notifier_invalidate_page() | can consult the bit and force a flush if set. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 12 ++++++++++-- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a43f4ccd30bb..2b3d66c7b68d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -746,6 +746,14 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. + * + * Note: + * We should flush all tlbs if spte is dropped even though guest is + * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page + * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't + * used by guest then tlbs are not flushed, so guest is allowed to access the + * freed pages. + * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. 
*/ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -781,14 +789,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - kvm_flush_remote_tlbs(vcpu->kvm); + vcpu->kvm->tlbs_dirty++; continue; } if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i], shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); + vcpu->kvm->tlbs_dirty++; continue; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index da0794f707f6..ac4e83a1a10d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -254,6 +254,7 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; + long tlbs_dirty; #endif }; @@ -382,6 +383,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5156d458a84d..ee99b77e4451 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { + int dirty_count = kvm->tlbs_dirty; + + smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } void kvm_reload_remote_mmus(struct kvm *kvm) @@ -249,7 +253,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - need_tlb_flush = kvm_unmap_hva(kvm, address); + need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); @@ -293,6 +297,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mmu_notifier_count++; for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); + need_tlb_flush |= kvm->tlbs_dirty; spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); -- cgit v1.2.3-71-gd317 From d4dbf470096c51cb4785167ea59fdbdea87ccbe4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 7 Dec 2010 12:59:07 +0900 Subject: KVM: MMU: Make the way of accessing lpage_info more generic Large page information has two elements but one of them, write_count, alone is accessed by a helper function. This patch replaces this helper function with more generic one which returns newly named kvm_lpage_info structure and use it to access the other element rmap_pde. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 54 ++++++++++++++++++++++-------------------------- include/linux/kvm_host.h | 10 +++++---- 2 files changed, 31 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index abda57fac659..475a1225f6ec 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -477,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) } /* - * Return the pointer to the largepage write count for a given - * gfn, handling slots that are not large page aligned. + * Return the pointer to the large page information for a given gfn, + * handling slots that are not large page aligned. 
*/ -static int *slot_largepage_idx(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) +static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) { unsigned long idx; idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); - return &slot->lpage_info[level - 2][idx].write_count; + return &slot->lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - int *write_count; + struct kvm_lpage_info *linfo; int i; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - write_count = slot_largepage_idx(gfn, slot, i); - *write_count += 1; + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count += 1; } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - int *write_count; + struct kvm_lpage_info *linfo; int i; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - write_count = slot_largepage_idx(gfn, slot, i); - *write_count -= 1; - WARN_ON(*write_count < 0); + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count -= 1; + WARN_ON(linfo->write_count < 0); } } @@ -525,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm, int level) { struct kvm_memory_slot *slot; - int *largepage_idx; + struct kvm_lpage_info *linfo; slot = gfn_to_memslot(kvm, gfn); if (slot) { - largepage_idx = slot_largepage_idx(gfn, slot, level); - return *largepage_idx; + linfo = lpage_info_slot(gfn, slot, level); + return linfo->write_count; } return 1; @@ -585,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; - unsigned long idx; + struct kvm_lpage_info *linfo; slot = gfn_to_memslot(kvm, gfn); if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); + linfo = lpage_info_slot(gfn, slot, level); - return &slot->lpage_info[level - 2][idx].rmap_pde; + return &linfo->rmap_pde; } /* @@ -882,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + gfn_t gfn = memslot->base_gfn + gfn_offset; ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - unsigned long idx; - int sh; - - sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); - idx = ((memslot->base_gfn+gfn_offset) >> sh) - - (memslot->base_gfn >> sh); - ret |= handler(kvm, - &memslot->lpage_info[j][idx].rmap_pde, - data); + struct kvm_lpage_info *linfo; + + linfo = lpage_info_slot(gfn, memslot, + PT_DIRECTORY_LEVEL + j); + ret |= handler(kvm, &linfo->rmap_pde, data); } trace_kvm_age_page(hva, memslot, ret); retval |= ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac4e83a1a10d..bd0da8f12500 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -146,6 +146,11 @@ struct kvm_vcpu { */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -153,10 +158,7 @@ struct kvm_memory_slot { unsigned long *rmap; unsigned long *dirty_bitmap; unsigned long 
*dirty_bitmap_head; - struct { - unsigned long rmap_pde; - int write_count; - } *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned long userspace_addr; int user_alloc; int id; -- cgit v1.2.3-71-gd317 From 5c663a1534d27d817e17eed06a83d08f497f9f4f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 8 Dec 2010 18:04:51 +0200 Subject: KVM: Fix build error on s390 due to missing tlbs_dirty Make it available for all archs. Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bd0da8f12500..b5021db21858 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -256,8 +256,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; - long tlbs_dirty; #endif + long tlbs_dirty; }; /* The guest did something we don't support. */ -- cgit v1.2.3-71-gd317 From 3a93f2a9f4d8f73d74c0e552feb68a10f778a219 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 10 Nov 2010 14:38:29 +0000 Subject: regulator: Report actual configured voltage to set_voltage() Change the interface used by set_voltage() to report the selected value to the regulator core in terms of a selector used by list_voltage(). This allows the regulator core to know the voltage that was chosen without having to do an explict get_voltage(), which would be much more expensive as it will generally access hardware. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/88pm8607.c | 3 ++- drivers/regulator/ab3100.c | 5 ++++- drivers/regulator/ab8500.c | 5 ++++- drivers/regulator/core.c | 14 ++++++++++-- drivers/regulator/da903x.c | 17 +++++++++----- drivers/regulator/isl6271a-regulator.c | 6 ++++- drivers/regulator/lp3971.c | 10 +++++++-- drivers/regulator/lp3972.c | 10 +++++++-- drivers/regulator/max1586.c | 30 ++++++++++++------------- drivers/regulator/max8649.c | 3 ++- drivers/regulator/max8660.c | 14 +++++++++--- drivers/regulator/max8925-regulator.c | 3 ++- drivers/regulator/max8952.c | 3 ++- drivers/regulator/max8998.c | 8 +++++-- drivers/regulator/mc13783-regulator.c | 10 +++++++-- drivers/regulator/pcap-regulator.c | 7 ++++-- drivers/regulator/pcf50633-regulator.c | 5 ++++- drivers/regulator/tps65023-regulator.c | 9 ++++++-- drivers/regulator/tps6507x-regulator.c | 10 +++++++-- drivers/regulator/tps6586x-regulator.c | 15 ++++++++----- drivers/regulator/twl-regulator.c | 11 ++++++--- drivers/regulator/wm831x-dcdc.c | 17 +++++++++----- drivers/regulator/wm831x-ldo.c | 41 ++++++++++++++++++++++++---------- drivers/regulator/wm8350-regulator.c | 8 +++++-- drivers/regulator/wm8400-regulator.c | 8 +++++-- drivers/regulator/wm8994-regulator.c | 6 +++-- include/linux/regulator/driver.h | 3 ++- 27 files changed, 202 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/88pm8607.c b/drivers/regulator/88pm8607.c index 2ce2eb71d0f5..dd6308499bd4 100644 --- a/drivers/regulator/88pm8607.c +++ b/drivers/regulator/88pm8607.c @@ -249,7 +249,7 @@ static int choose_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) } static int pm8607_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct pm8607_regulator_info *info = rdev_get_drvdata(rdev); uint8_t val, mask; @@ -263,6 +263,7 @@ static int pm8607_set_voltage(struct regulator_dev *rdev, ret = 
choose_voltage(rdev, min_uV, max_uV); if (ret < 0) return -EINVAL; + *selector = ret; val = (uint8_t)(ret << info->vol_shift); mask = ((1 << info->vol_nbits) - 1) << info->vol_shift; diff --git a/drivers/regulator/ab3100.c b/drivers/regulator/ab3100.c index b349266a43de..ed6feaf9398d 100644 --- a/drivers/regulator/ab3100.c +++ b/drivers/regulator/ab3100.c @@ -362,7 +362,8 @@ static int ab3100_get_best_voltage_index(struct regulator_dev *reg, } static int ab3100_set_voltage_regulator(struct regulator_dev *reg, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct ab3100_regulator *abreg = reg->reg_data; u8 regval; @@ -373,6 +374,8 @@ static int ab3100_set_voltage_regulator(struct regulator_dev *reg, if (bestindex < 0) return bestindex; + *selector = bestindex; + err = abx500_get_register_interruptible(abreg->dev, 0, abreg->regreg, ®val); if (err) { diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index db6b70f20511..2f4ec0facef1 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -215,7 +215,8 @@ static int ab8500_get_best_voltage_index(struct regulator_dev *rdev, } static int ab8500_regulator_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { int regulator_id, ret; struct ab8500_regulator_info *info = rdev_get_drvdata(rdev); @@ -232,6 +233,8 @@ static int ab8500_regulator_set_voltage(struct regulator_dev *rdev, return ret; } + *selector = ret; + /* set the registers for the request */ ret = abx500_mask_and_set_register_interruptible(info->dev, info->voltage_bank, info->voltage_reg, diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 81336e23848a..67d3a61f3785 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -723,13 +723,16 @@ static int machine_constraints_voltage(struct regulator_dev *rdev, struct regulator_ops *ops = rdev->desc->ops; const char *name = rdev_get_name(rdev); int ret; + unsigned selector; /* do we need to apply the constraint voltage */ if (rdev->constraints->apply_uV && rdev->constraints->min_uV == rdev->constraints->max_uV && ops->set_voltage) { ret = ops->set_voltage(rdev, - rdev->constraints->min_uV, rdev->constraints->max_uV); + rdev->constraints->min_uV, + rdev->constraints->max_uV, + &selector); if (ret < 0) { printk(KERN_ERR "%s: failed to apply %duV constraint to %s\n", __func__, @@ -1625,6 +1628,7 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) { struct regulator_dev *rdev = regulator->rdev; int ret; + unsigned selector; mutex_lock(&rdev->mutex); @@ -1640,7 +1644,13 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) goto out; regulator->min_uV = min_uV; regulator->max_uV = max_uV; - ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV); + + ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV, &selector); + + if (rdev->desc->ops->list_voltage) + selector = rdev->desc->ops->list_voltage(rdev, selector); + else + selector = -1; out: _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, NULL); diff --git a/drivers/regulator/da903x.c b/drivers/regulator/da903x.c index f8c4661a7a81..362e08221085 100644 --- a/drivers/regulator/da903x.c +++ b/drivers/regulator/da903x.c @@ -107,7 +107,7 @@ static inline int check_range(struct da903x_regulator_info *info, /* DA9030/DA9034 common operations */ static int da903x_set_ldo_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned 
*selector) { struct da903x_regulator_info *info = rdev_get_drvdata(rdev); struct device *da9034_dev = to_da903x_dev(rdev); @@ -119,6 +119,7 @@ static int da903x_set_ldo_voltage(struct regulator_dev *rdev, } val = (min_uV - info->min_uV + info->step_uV - 1) / info->step_uV; + *selector = val; val <<= info->vol_shift; mask = ((1 << info->vol_nbits) - 1) << info->vol_shift; @@ -187,7 +188,8 @@ static int da903x_list_voltage(struct regulator_dev *rdev, unsigned selector) /* DA9030 specific operations */ static int da9030_set_ldo1_15_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct da903x_regulator_info *info = rdev_get_drvdata(rdev); struct device *da903x_dev = to_da903x_dev(rdev); @@ -200,6 +202,7 @@ static int da9030_set_ldo1_15_voltage(struct regulator_dev *rdev, } val = (min_uV - info->min_uV + info->step_uV - 1) / info->step_uV; + *selector = val; val <<= info->vol_shift; mask = ((1 << info->vol_nbits) - 1) << info->vol_shift; val |= DA9030_LDO_UNLOCK; /* have to set UNLOCK bits */ @@ -214,7 +217,8 @@ static int da9030_set_ldo1_15_voltage(struct regulator_dev *rdev, } static int da9030_set_ldo14_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct da903x_regulator_info *info = rdev_get_drvdata(rdev); struct device *da903x_dev = to_da903x_dev(rdev); @@ -234,6 +238,7 @@ static int da9030_set_ldo14_voltage(struct regulator_dev *rdev, val = (min_uV - thresh + info->step_uV - 1) / info->step_uV; } + *selector = val; val <<= info->vol_shift; mask = ((1 << info->vol_nbits) - 1) << info->vol_shift; @@ -263,7 +268,7 @@ static int da9030_get_ldo14_voltage(struct regulator_dev *rdev) /* DA9034 specific operations */ static int da9034_set_dvc_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct da903x_regulator_info *info = rdev_get_drvdata(rdev); struct device *da9034_dev = to_da903x_dev(rdev); @@ -276,6 +281,7 @@ static int da9034_set_dvc_voltage(struct regulator_dev *rdev, } val = (min_uV - info->min_uV + info->step_uV - 1) / info->step_uV; + *selector = val; val <<= info->vol_shift; mask = ((1 << info->vol_nbits) - 1) << info->vol_shift; @@ -289,7 +295,7 @@ static int da9034_set_dvc_voltage(struct regulator_dev *rdev, } static int da9034_set_ldo12_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct da903x_regulator_info *info = rdev_get_drvdata(rdev); struct device *da9034_dev = to_da903x_dev(rdev); @@ -302,6 +308,7 @@ static int da9034_set_ldo12_voltage(struct regulator_dev *rdev, val = (min_uV - info->min_uV + info->step_uV - 1) / info->step_uV; val = (val >= 20) ? val - 12 : ((val > 7) ? 
8 : val); + *selector = val; val <<= info->vol_shift; mask = ((1 << info->vol_nbits) - 1) << info->vol_shift; diff --git a/drivers/regulator/isl6271a-regulator.c b/drivers/regulator/isl6271a-regulator.c index b8cc6389a541..b5639e82fcc7 100644 --- a/drivers/regulator/isl6271a-regulator.c +++ b/drivers/regulator/isl6271a-regulator.c @@ -58,7 +58,9 @@ out: return data; } -static int isl6271a_set_voltage(struct regulator_dev *dev, int minuV, int maxuV) +static int isl6271a_set_voltage(struct regulator_dev *dev, + int minuV, int maxuV, + unsigned *selector) { struct isl_pmic *pmic = rdev_get_drvdata(dev); int vsel, err, data; @@ -78,6 +80,8 @@ static int isl6271a_set_voltage(struct regulator_dev *dev, int minuV, int maxuV) /* Convert the microvolts to data for the chip */ data = (vsel - ISL6271A_VOLTAGE_MIN) / ISL6271A_VOLTAGE_STEP; + *selector = data; + mutex_lock(&pmic->mtx); err = i2c_smbus_write_byte(pmic->client, data); diff --git a/drivers/regulator/lp3971.c b/drivers/regulator/lp3971.c index 3bb82b624e19..0f22ef12601c 100644 --- a/drivers/regulator/lp3971.c +++ b/drivers/regulator/lp3971.c @@ -168,7 +168,8 @@ static int lp3971_ldo_get_voltage(struct regulator_dev *dev) } static int lp3971_ldo_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned int *selector) { struct lp3971 *lp3971 = rdev_get_drvdata(dev); int ldo = rdev_get_id(dev) - LP3971_LDO1; @@ -187,6 +188,8 @@ static int lp3971_ldo_set_voltage(struct regulator_dev *dev, if (val > LDO_VOL_MAX_IDX || vol_map[val] > max_vol) return -EINVAL; + *selector = val; + return lp3971_set_bits(lp3971, LP3971_LDO_VOL_CONTR_REG(ldo), LDO_VOL_CONTR_MASK << LDO_VOL_CONTR_SHIFT(ldo), val << LDO_VOL_CONTR_SHIFT(ldo)); @@ -256,7 +259,8 @@ static int lp3971_dcdc_get_voltage(struct regulator_dev *dev) } static int lp3971_dcdc_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned int *selector) { struct lp3971 *lp3971 = rdev_get_drvdata(dev); int buck = rdev_get_id(dev) - LP3971_DCDC1; @@ -277,6 +281,8 @@ static int lp3971_dcdc_set_voltage(struct regulator_dev *dev, if (val > BUCK_TARGET_VOL_MAX_IDX || vol_map[val] > max_vol) return -EINVAL; + *selector = val; + ret = lp3971_set_bits(lp3971, LP3971_BUCK_TARGET_VOL1_REG(buck), BUCK_TARGET_VOL_MASK, val); if (ret) diff --git a/drivers/regulator/lp3972.c b/drivers/regulator/lp3972.c index e07062fd0b42..6aa1b506fb5d 100644 --- a/drivers/regulator/lp3972.c +++ b/drivers/regulator/lp3972.c @@ -292,7 +292,8 @@ static int lp3972_ldo_get_voltage(struct regulator_dev *dev) } static int lp3972_ldo_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned int *selector) { struct lp3972 *lp3972 = rdev_get_drvdata(dev); int ldo = rdev_get_id(dev) - LP3972_LDO1; @@ -313,6 +314,8 @@ static int lp3972_ldo_set_voltage(struct regulator_dev *dev, if (val > LP3972_LDO_VOL_MAX_IDX(ldo) || vol_map[val] > max_vol) return -EINVAL; + *selector = val; + shift = LP3972_LDO_VOL_CONTR_SHIFT(ldo); ret = lp3972_set_bits(lp3972, LP3972_LDO_VOL_CONTR_REG(ldo), LP3972_LDO_VOL_MASK(ldo) << shift, val << shift); @@ -416,7 +419,8 @@ static int lp3972_dcdc_get_voltage(struct regulator_dev *dev) } static int lp3972_dcdc_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned int *selector) { struct lp3972 *lp3972 = rdev_get_drvdata(dev); int buck = rdev_get_id(dev) - LP3972_DCDC1; @@ -438,6 +442,8 @@ static int lp3972_dcdc_set_voltage(struct regulator_dev 
*dev, vol_map[val] > max_vol) return -EINVAL; + *selector = val; + ret = lp3972_set_bits(lp3972, LP3972_BUCK_VOL1_REG(buck), LP3972_BUCK_VOL_MASK, val); if (ret) diff --git a/drivers/regulator/max1586.c b/drivers/regulator/max1586.c index 559cfa271a44..3f49512c5134 100644 --- a/drivers/regulator/max1586.c +++ b/drivers/regulator/max1586.c @@ -63,12 +63,12 @@ static int max1586_v3_calc_voltage(struct max1586_data *max1586, return max1586->min_uV + (selector * range_uV / MAX1586_V3_MAX_VSEL); } -static int max1586_v3_set(struct regulator_dev *rdev, int min_uV, int max_uV) +static int max1586_v3_set(struct regulator_dev *rdev, int min_uV, int max_uV, + unsigned *selector) { struct max1586_data *max1586 = rdev_get_drvdata(rdev); struct i2c_client *client = max1586->client; unsigned range_uV = max1586->max_uV - max1586->min_uV; - unsigned selector; u8 v3_prog; if (min_uV > max1586->max_uV || max_uV < max1586->min_uV) @@ -76,15 +76,15 @@ static int max1586_v3_set(struct regulator_dev *rdev, int min_uV, int max_uV) if (min_uV < max1586->min_uV) min_uV = max1586->min_uV; - selector = ((min_uV - max1586->min_uV) * MAX1586_V3_MAX_VSEL + + *selector = ((min_uV - max1586->min_uV) * MAX1586_V3_MAX_VSEL + range_uV - 1) / range_uV; - if (max1586_v3_calc_voltage(max1586, selector) > max_uV) + if (max1586_v3_calc_voltage(max1586, *selector) > max_uV) return -EINVAL; dev_dbg(&client->dev, "changing voltage v3 to %dmv\n", - max1586_v3_calc_voltage(max1586, selector) / 1000); + max1586_v3_calc_voltage(max1586, *selector) / 1000); - v3_prog = I2C_V3_SELECT | (u8) selector; + v3_prog = I2C_V3_SELECT | (u8) *selector; return i2c_smbus_write_byte(client, v3_prog); } @@ -110,10 +110,10 @@ static int max1586_v6_calc_voltage(unsigned selector) return voltages_uv[selector]; } -static int max1586_v6_set(struct regulator_dev *rdev, int min_uV, int max_uV) +static int max1586_v6_set(struct regulator_dev *rdev, int min_uV, int max_uV, + unsigned int *selector) { struct i2c_client *client = rdev_get_drvdata(rdev); - unsigned selector; u8 v6_prog; if (min_uV < MAX1586_V6_MIN_UV || min_uV > MAX1586_V6_MAX_UV) @@ -122,21 +122,21 @@ static int max1586_v6_set(struct regulator_dev *rdev, int min_uV, int max_uV) return -EINVAL; if (min_uV < 1800000) - selector = 0; + *selector = 0; else if (min_uV < 2500000) - selector = 1; + *selector = 1; else if (min_uV < 3000000) - selector = 2; + *selector = 2; else if (min_uV >= 3000000) - selector = 3; + *selector = 3; - if (max1586_v6_calc_voltage(selector) > max_uV) + if (max1586_v6_calc_voltage(*selector) > max_uV) return -EINVAL; dev_dbg(&client->dev, "changing voltage v6 to %dmv\n", - max1586_v6_calc_voltage(selector) / 1000); + max1586_v6_calc_voltage(*selector) / 1000); - v6_prog = I2C_V6_SELECT | (u8) selector; + v6_prog = I2C_V6_SELECT | (u8) *selector; return i2c_smbus_write_byte(client, v6_prog); } diff --git a/drivers/regulator/max8649.c b/drivers/regulator/max8649.c index 6b60a9c0366b..30eb9e54f7ec 100644 --- a/drivers/regulator/max8649.c +++ b/drivers/regulator/max8649.c @@ -155,7 +155,7 @@ static int max8649_get_voltage(struct regulator_dev *rdev) } static int max8649_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct max8649_regulator_info *info = rdev_get_drvdata(rdev); unsigned char data, mask; @@ -168,6 +168,7 @@ static int max8649_set_voltage(struct regulator_dev *rdev, data = (min_uV - MAX8649_DCDC_VMIN + MAX8649_DCDC_STEP - 1) / MAX8649_DCDC_STEP; mask = MAX8649_VOL_MASK; + *selector = data & mask; 
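	/*
	 * [Editorial aside, not part of the original patch] The selector
	 * stored just above is the index that the regulator core feeds
	 * back into ->list_voltage() (see the core.c hunk earlier in this
	 * commit: "selector = rdev->desc->ops->list_voltage(rdev, selector)"),
	 * which lets it report the configured voltage without an extra
	 * hardware read.  A hypothetical caller-side sketch:
	 *
	 *	ret = ops->set_voltage(rdev, min_uV, max_uV, &selector);
	 *	if (ret >= 0 && ops->list_voltage)
	 *		configured_uV = ops->list_voltage(rdev, selector);
	 */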
return max8649_set_bits(info->i2c, info->vol_reg, mask, data); } diff --git a/drivers/regulator/max8660.c b/drivers/regulator/max8660.c index c570e6eb0db2..33f5d9a492ef 100644 --- a/drivers/regulator/max8660.c +++ b/drivers/regulator/max8660.c @@ -141,7 +141,8 @@ static int max8660_dcdc_get(struct regulator_dev *rdev) return MAX8660_DCDC_MIN_UV + selector * MAX8660_DCDC_STEP; } -static int max8660_dcdc_set(struct regulator_dev *rdev, int min_uV, int max_uV) +static int max8660_dcdc_set(struct regulator_dev *rdev, int min_uV, int max_uV, + unsigned int *s) { struct max8660 *max8660 = rdev_get_drvdata(rdev); u8 reg, selector, bits; @@ -154,6 +155,7 @@ static int max8660_dcdc_set(struct regulator_dev *rdev, int min_uV, int max_uV) selector = (min_uV - (MAX8660_DCDC_MIN_UV - MAX8660_DCDC_STEP + 1)) / MAX8660_DCDC_STEP; + *s = selector; ret = max8660_dcdc_list(rdev, selector); if (ret < 0 || ret > max_uV) @@ -196,7 +198,8 @@ static int max8660_ldo5_get(struct regulator_dev *rdev) return MAX8660_LDO5_MIN_UV + selector * MAX8660_LDO5_STEP; } -static int max8660_ldo5_set(struct regulator_dev *rdev, int min_uV, int max_uV) +static int max8660_ldo5_set(struct regulator_dev *rdev, int min_uV, int max_uV, + unsigned int *s) { struct max8660 *max8660 = rdev_get_drvdata(rdev); u8 selector; @@ -213,6 +216,8 @@ static int max8660_ldo5_set(struct regulator_dev *rdev, int min_uV, int max_uV) if (ret < 0 || ret > max_uV) return -EINVAL; + *s = selector; + ret = max8660_write(max8660, MAX8660_MDTV2, 0, selector); if (ret) return ret; @@ -270,7 +275,8 @@ static int max8660_ldo67_get(struct regulator_dev *rdev) return MAX8660_LDO67_MIN_UV + selector * MAX8660_LDO67_STEP; } -static int max8660_ldo67_set(struct regulator_dev *rdev, int min_uV, int max_uV) +static int max8660_ldo67_set(struct regulator_dev *rdev, int min_uV, + int max_uV, unsigned int *s) { struct max8660 *max8660 = rdev_get_drvdata(rdev); u8 selector; @@ -288,6 +294,8 @@ static int max8660_ldo67_set(struct regulator_dev *rdev, int min_uV, int max_uV) if (ret < 0 || ret > max_uV) return -EINVAL; + *s = selector; + if (rdev_get_id(rdev) == MAX8660_V6) return max8660_write(max8660, MAX8660_L12VCR, 0xf0, selector); else diff --git a/drivers/regulator/max8925-regulator.c b/drivers/regulator/max8925-regulator.c index 552cad85ae5a..8ae147549c6a 100644 --- a/drivers/regulator/max8925-regulator.c +++ b/drivers/regulator/max8925-regulator.c @@ -55,7 +55,7 @@ static int max8925_list_voltage(struct regulator_dev *rdev, unsigned index) } static int max8925_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned int *selector) { struct max8925_regulator_info *info = rdev_get_drvdata(rdev); unsigned char data, mask; @@ -66,6 +66,7 @@ static int max8925_set_voltage(struct regulator_dev *rdev, return -EINVAL; } data = (min_uV - info->min_uV + info->step_uV - 1) / info->step_uV; + *selector = data; data <<= info->vol_shift; mask = ((1 << info->vol_nbits) - 1) << info->vol_shift; diff --git a/drivers/regulator/max8952.c b/drivers/regulator/max8952.c index 0d5dda4fd911..a8f4ecfb0843 100644 --- a/drivers/regulator/max8952.c +++ b/drivers/regulator/max8952.c @@ -133,7 +133,7 @@ static int max8952_get_voltage(struct regulator_dev *rdev) } static int max8952_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct max8952_data *max8952 = rdev_get_drvdata(rdev); s8 vid = -1, i; @@ -156,6 +156,7 @@ static int max8952_set_voltage(struct regulator_dev *rdev, if (vid 
>= 0 && vid < MAX8952_NUM_DVS_MODE) { max8952->vid0 = (vid % 2 == 1); max8952->vid1 = (((vid >> 1) % 2) == 1); + *selector = vid; gpio_set_value(max8952->pdata->gpio_vid0, max8952->vid0); gpio_set_value(max8952->pdata->gpio_vid1, max8952->vid1); } else diff --git a/drivers/regulator/max8998.c b/drivers/regulator/max8998.c index 5c20756db607..cb28cf8b9397 100644 --- a/drivers/regulator/max8998.c +++ b/drivers/regulator/max8998.c @@ -304,7 +304,7 @@ static int max8998_get_voltage(struct regulator_dev *rdev) } static int max8998_set_voltage_ldo(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct max8998_data *max8998 = rdev_get_drvdata(rdev); struct i2c_client *i2c = max8998->iodev->i2c; @@ -331,6 +331,8 @@ static int max8998_set_voltage_ldo(struct regulator_dev *rdev, if (desc->min + desc->step*i > max_vol) return -EINVAL; + *selector = i; + ret = max8998_get_voltage_register(rdev, ®, &shift, &mask); if (ret) return ret; @@ -352,7 +354,7 @@ static inline void buck2_gpio_set(int gpio, int v) } static int max8998_set_voltage_buck(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, int *selector) { struct max8998_data *max8998 = rdev_get_drvdata(rdev); struct max8998_platform_data *pdata = @@ -384,6 +386,8 @@ static int max8998_set_voltage_buck(struct regulator_dev *rdev, if (desc->min + desc->step*i > max_vol) return -EINVAL; + *selector = i; + ret = max8998_get_voltage_register(rdev, ®, &shift, &mask); if (ret) return ret; diff --git a/drivers/regulator/mc13783-regulator.c b/drivers/regulator/mc13783-regulator.c index ecd99f59dba8..47ea99949798 100644 --- a/drivers/regulator/mc13783-regulator.c +++ b/drivers/regulator/mc13783-regulator.c @@ -373,7 +373,8 @@ static int mc13783_get_best_voltage_index(struct regulator_dev *rdev, } static int mc13783_regulator_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct mc13783_regulator_priv *priv = rdev_get_drvdata(rdev); int value, id = rdev_get_id(rdev); @@ -388,6 +389,8 @@ static int mc13783_regulator_set_voltage(struct regulator_dev *rdev, if (value < 0) return value; + *selector = value; + mc13783_lock(priv->mc13783); ret = mc13783_reg_rmw(priv->mc13783, mc13783_regulators[id].vsel_reg, mc13783_regulators[id].vsel_mask, @@ -433,13 +436,16 @@ static struct regulator_ops mc13783_regulator_ops = { }; static int mc13783_fixed_regulator_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned int *selector) { int id = rdev_get_id(rdev); dev_dbg(rdev_get_dev(rdev), "%s id: %d min_uV: %d max_uV: %d\n", __func__, id, min_uV, max_uV); + *selector = 0; + if (min_uV >= mc13783_regulators[id].voltages[0] && max_uV <= mc13783_regulators[id].voltages[0]) return 0; diff --git a/drivers/regulator/pcap-regulator.c b/drivers/regulator/pcap-regulator.c index 29d0566379ae..8dca11694f75 100644 --- a/drivers/regulator/pcap-regulator.c +++ b/drivers/regulator/pcap-regulator.c @@ -151,7 +151,8 @@ static struct pcap_regulator vreg_table[] = { }; static int pcap_regulator_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsiged *selector) { struct pcap_regulator *vreg = &vreg_table[rdev_get_id(rdev)]; void *pcap = rdev_get_drvdata(rdev); @@ -170,10 +171,12 @@ static int pcap_regulator_set_voltage(struct regulator_dev *rdev, i = 0; uV = vreg->voltage_table[i] * 1000; - if (min_uV <= uV && uV <= max_uV) + if (min_uV <= uV && uV <= 
max_uV) { + *selector = i; return ezx_pcap_set_bits(pcap, vreg->reg, (vreg->n_voltages - 1) << vreg->index, i << vreg->index); + } if (i == 0 && rdev_get_id(rdev) == V1) i = vreg->n_voltages - 1; diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c index c8f41dc05b76..69a11d9dd87f 100644 --- a/drivers/regulator/pcf50633-regulator.c +++ b/drivers/regulator/pcf50633-regulator.c @@ -108,7 +108,8 @@ static unsigned int ldo_voltage_value(u8 bits) } static int pcf50633_regulator_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct pcf50633 *pcf; int regulator_id, millivolts; @@ -147,6 +148,8 @@ static int pcf50633_regulator_set_voltage(struct regulator_dev *rdev, return -EINVAL; } + *selector = volt_bits; + return pcf50633_reg_write(pcf, regnr, volt_bits); } diff --git a/drivers/regulator/tps65023-regulator.c b/drivers/regulator/tps65023-regulator.c index cd6d4fc9d74f..60a7ca5409e9 100644 --- a/drivers/regulator/tps65023-regulator.c +++ b/drivers/regulator/tps65023-regulator.c @@ -321,7 +321,8 @@ static int tps65023_dcdc_get_voltage(struct regulator_dev *dev) } static int tps65023_dcdc_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct tps_pmic *tps = rdev_get_drvdata(dev); int dcdc = rdev_get_id(dev); @@ -346,6 +347,8 @@ static int tps65023_dcdc_set_voltage(struct regulator_dev *dev, break; } + *selector = vsel; + /* write to the register in case we found a match */ if (vsel == tps->info[dcdc]->table_len) return -EINVAL; @@ -371,7 +374,7 @@ static int tps65023_ldo_get_voltage(struct regulator_dev *dev) } static int tps65023_ldo_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct tps_pmic *tps = rdev_get_drvdata(dev); int data, vsel, ldo = rdev_get_id(dev); @@ -396,6 +399,8 @@ static int tps65023_ldo_set_voltage(struct regulator_dev *dev, if (vsel == tps->info[ldo]->table_len) return -EINVAL; + *selector = vsel; + data = tps_65023_reg_read(tps, TPS65023_REG_LDO_CTRL); if (data < 0) return data; diff --git a/drivers/regulator/tps6507x-regulator.c b/drivers/regulator/tps6507x-regulator.c index 020f5878d7ff..064755290599 100644 --- a/drivers/regulator/tps6507x-regulator.c +++ b/drivers/regulator/tps6507x-regulator.c @@ -369,7 +369,8 @@ static int tps6507x_pmic_dcdc_get_voltage(struct regulator_dev *dev) } static int tps6507x_pmic_dcdc_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct tps6507x_pmic *tps = rdev_get_drvdata(dev); int data, vsel, dcdc = rdev_get_id(dev); @@ -415,6 +416,8 @@ static int tps6507x_pmic_dcdc_set_voltage(struct regulator_dev *dev, if (vsel == tps->info[dcdc]->table_len) return -EINVAL; + *selector = vsel; + data = tps6507x_pmic_reg_read(tps, reg); if (data < 0) return data; @@ -450,7 +453,8 @@ static int tps6507x_pmic_ldo_get_voltage(struct regulator_dev *dev) } static int tps6507x_pmic_ldo_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct tps6507x_pmic *tps = rdev_get_drvdata(dev); int data, vsel, ldo = rdev_get_id(dev); @@ -483,6 +487,8 @@ static int tps6507x_pmic_ldo_set_voltage(struct regulator_dev *dev, if (vsel == tps->info[ldo]->table_len) return -EINVAL; + *selector = vsel; + data = tps6507x_pmic_reg_read(tps, reg); if (data < 0) return data; diff --git 
a/drivers/regulator/tps6586x-regulator.c b/drivers/regulator/tps6586x-regulator.c index 6d20b0454a1d..bb04a75a4c98 100644 --- a/drivers/regulator/tps6586x-regulator.c +++ b/drivers/regulator/tps6586x-regulator.c @@ -85,7 +85,8 @@ static int tps6586x_ldo_list_voltage(struct regulator_dev *rdev, static int __tps6586x_ldo_set_voltage(struct device *parent, struct tps6586x_regulator *ri, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { int val, uV; uint8_t mask; @@ -100,6 +101,8 @@ static int __tps6586x_ldo_set_voltage(struct device *parent, /* use the first in-range value */ if (min_uV <= uV && uV <= max_uV) { + *selector = val; + val <<= ri->volt_shift; mask = ((1 << ri->volt_nbits) - 1) << ri->volt_shift; @@ -111,12 +114,13 @@ static int __tps6586x_ldo_set_voltage(struct device *parent, } static int tps6586x_ldo_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct tps6586x_regulator *ri = rdev_get_drvdata(rdev); struct device *parent = to_tps6586x_dev(rdev); - return __tps6586x_ldo_set_voltage(parent, ri, min_uV, max_uV); + return __tps6586x_ldo_set_voltage(parent, ri, min_uV, max_uV, + selector); } static int tps6586x_ldo_get_voltage(struct regulator_dev *rdev) @@ -140,13 +144,14 @@ static int tps6586x_ldo_get_voltage(struct regulator_dev *rdev) } static int tps6586x_dvm_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct tps6586x_regulator *ri = rdev_get_drvdata(rdev); struct device *parent = to_tps6586x_dev(rdev); int ret; - ret = __tps6586x_ldo_set_voltage(parent, ri, min_uV, max_uV); + ret = __tps6586x_ldo_set_voltage(parent, ri, min_uV, max_uV, + selector); if (ret) return ret; diff --git a/drivers/regulator/twl-regulator.c b/drivers/regulator/twl-regulator.c index a57262a4fa6c..bd332cf1cc3f 100644 --- a/drivers/regulator/twl-regulator.c +++ b/drivers/regulator/twl-regulator.c @@ -329,7 +329,8 @@ static int twl4030ldo_list_voltage(struct regulator_dev *rdev, unsigned index) } static int -twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) +twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV, + unsigned *selector) { struct twlreg_info *info = rdev_get_drvdata(rdev); int vsel; @@ -345,9 +346,11 @@ twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) /* REVISIT for VAUX2, first match may not be best/lowest */ /* use the first in-range value */ - if (min_uV <= uV && uV <= max_uV) + if (min_uV <= uV && uV <= max_uV) { + *selector = vsel; return twlreg_write(info, TWL_MODULE_PM_RECEIVER, VREG_VOLTAGE, vsel); + } } return -EDOM; @@ -389,7 +392,8 @@ static int twl6030ldo_list_voltage(struct regulator_dev *rdev, unsigned index) } static int -twl6030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) +twl6030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV, + unsigned *selector) { struct twlreg_info *info = rdev_get_drvdata(rdev); int vsel; @@ -402,6 +406,7 @@ twl6030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) * mV = 1000mv + 100mv * (vsel - 1) */ vsel = (min_uV/1000 - 1000)/100 + 1; + *selector = vsel; return twlreg_write(info, TWL_MODULE_PM_RECEIVER, VREG_VOLTAGE, vsel); } diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c index dbfaf5945e48..71da6b2ad863 100644 --- a/drivers/regulator/wm831x-dcdc.c +++ b/drivers/regulator/wm831x-dcdc.c @@ -302,7 +302,7 @@ static int 
wm831x_buckv_set_dvs(struct regulator_dev *rdev, int state) } static int wm831x_buckv_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); struct wm831x *wm831x = dcdc->wm831x; @@ -314,6 +314,8 @@ static int wm831x_buckv_set_voltage(struct regulator_dev *rdev, if (vsel < 0) return vsel; + *selector = vsel; + /* If this value is already set then do a GPIO update if we can */ if (dcdc->dvs_gpio && dcdc->on_vsel == vsel) return wm831x_buckv_set_dvs(rdev, 0); @@ -636,7 +638,7 @@ static int wm831x_buckp_list_voltage(struct regulator_dev *rdev, } static int wm831x_buckp_set_voltage_int(struct regulator_dev *rdev, int reg, - int min_uV, int max_uV) + int min_uV, int max_uV, int *selector) { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); struct wm831x *wm831x = dcdc->wm831x; @@ -650,16 +652,20 @@ static int wm831x_buckp_set_voltage_int(struct regulator_dev *rdev, int reg, if (wm831x_buckp_list_voltage(rdev, vsel) > max_uV) return -EINVAL; + *selector = vsel; + return wm831x_set_bits(wm831x, reg, WM831X_DC3_ON_VSEL_MASK, vsel); } static int wm831x_buckp_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG; - return wm831x_buckp_set_voltage_int(rdev, reg, min_uV, max_uV); + return wm831x_buckp_set_voltage_int(rdev, reg, min_uV, max_uV, + selector); } static int wm831x_buckp_set_suspend_voltage(struct regulator_dev *rdev, @@ -667,8 +673,9 @@ static int wm831x_buckp_set_suspend_voltage(struct regulator_dev *rdev, { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); u16 reg = dcdc->base + WM831X_DCDC_SLEEP_CONTROL; + unsigned selector; - return wm831x_buckp_set_voltage_int(rdev, reg, uV, uV); + return wm831x_buckp_set_voltage_int(rdev, reg, uV, uV, &selector); } static int wm831x_buckp_get_voltage(struct regulator_dev *rdev) diff --git a/drivers/regulator/wm831x-ldo.c b/drivers/regulator/wm831x-ldo.c index 9edf8f692341..9594e7161b4d 100644 --- a/drivers/regulator/wm831x-ldo.c +++ b/drivers/regulator/wm831x-ldo.c @@ -113,7 +113,8 @@ static int wm831x_gp_ldo_list_voltage(struct regulator_dev *rdev, } static int wm831x_gp_ldo_set_voltage_int(struct regulator_dev *rdev, int reg, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); struct wm831x *wm831x = ldo->wm831x; @@ -133,16 +134,20 @@ static int wm831x_gp_ldo_set_voltage_int(struct regulator_dev *rdev, int reg, if (ret < min_uV || ret > max_uV) return -EINVAL; + *selector = vsel; + return wm831x_set_bits(wm831x, reg, WM831X_LDO1_ON_VSEL_MASK, vsel); } static int wm831x_gp_ldo_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); int reg = ldo->base + WM831X_LDO_ON_CONTROL; - return wm831x_gp_ldo_set_voltage_int(rdev, reg, min_uV, max_uV); + return wm831x_gp_ldo_set_voltage_int(rdev, reg, min_uV, max_uV, + selector); } static int wm831x_gp_ldo_set_suspend_voltage(struct regulator_dev *rdev, @@ -150,8 +155,9 @@ static int wm831x_gp_ldo_set_suspend_voltage(struct regulator_dev *rdev, { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); int reg = ldo->base + WM831X_LDO_SLEEP_CONTROL; + unsigned int selector; - return wm831x_gp_ldo_set_voltage_int(rdev, reg, uV, uV); + return 
wm831x_gp_ldo_set_voltage_int(rdev, reg, uV, uV, &selector); } static int wm831x_gp_ldo_get_voltage(struct regulator_dev *rdev) @@ -413,7 +419,8 @@ static int wm831x_aldo_list_voltage(struct regulator_dev *rdev, } static int wm831x_aldo_set_voltage_int(struct regulator_dev *rdev, int reg, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); struct wm831x *wm831x = ldo->wm831x; @@ -433,16 +440,19 @@ static int wm831x_aldo_set_voltage_int(struct regulator_dev *rdev, int reg, if (ret < min_uV || ret > max_uV) return -EINVAL; + *selector = vsel; + return wm831x_set_bits(wm831x, reg, WM831X_LDO7_ON_VSEL_MASK, vsel); } static int wm831x_aldo_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); int reg = ldo->base + WM831X_LDO_ON_CONTROL; - return wm831x_aldo_set_voltage_int(rdev, reg, min_uV, max_uV); + return wm831x_aldo_set_voltage_int(rdev, reg, min_uV, max_uV, + selector); } static int wm831x_aldo_set_suspend_voltage(struct regulator_dev *rdev, @@ -450,8 +460,9 @@ static int wm831x_aldo_set_suspend_voltage(struct regulator_dev *rdev, { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); int reg = ldo->base + WM831X_LDO_SLEEP_CONTROL; + unsigned int selector; - return wm831x_aldo_set_voltage_int(rdev, reg, uV, uV); + return wm831x_aldo_set_voltage_int(rdev, reg, uV, uV, &selector); } static int wm831x_aldo_get_voltage(struct regulator_dev *rdev) @@ -666,7 +677,8 @@ static int wm831x_alive_ldo_list_voltage(struct regulator_dev *rdev, static int wm831x_alive_ldo_set_voltage_int(struct regulator_dev *rdev, int reg, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); struct wm831x *wm831x = ldo->wm831x; @@ -680,16 +692,20 @@ static int wm831x_alive_ldo_set_voltage_int(struct regulator_dev *rdev, if (ret < min_uV || ret > max_uV) return -EINVAL; + *selector = vsel; + return wm831x_set_bits(wm831x, reg, WM831X_LDO11_ON_VSEL_MASK, vsel); } static int wm831x_alive_ldo_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, + unsigned *selector) { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); int reg = ldo->base + WM831X_ALIVE_LDO_ON_CONTROL; - return wm831x_alive_ldo_set_voltage_int(rdev, reg, min_uV, max_uV); + return wm831x_alive_ldo_set_voltage_int(rdev, reg, min_uV, max_uV, + selector); } static int wm831x_alive_ldo_set_suspend_voltage(struct regulator_dev *rdev, @@ -697,8 +713,9 @@ static int wm831x_alive_ldo_set_suspend_voltage(struct regulator_dev *rdev, { struct wm831x_ldo *ldo = rdev_get_drvdata(rdev); int reg = ldo->base + WM831X_ALIVE_LDO_SLEEP_CONTROL; + unsigned selector; - return wm831x_alive_ldo_set_voltage_int(rdev, reg, uV, uV); + return wm831x_alive_ldo_set_voltage_int(rdev, reg, uV, uV, &selector); } static int wm831x_alive_ldo_get_voltage(struct regulator_dev *rdev) diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index fe4b8a8a9dfd..7e45b0ddd710 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -360,7 +360,7 @@ int wm8350_isink_set_flash(struct wm8350 *wm8350, int isink, u16 mode, EXPORT_SYMBOL_GPL(wm8350_isink_set_flash); static int wm8350_dcdc_set_voltage(struct regulator_dev *rdev, int min_uV, - int max_uV) + int max_uV, unsigned *selector) { struct wm8350 *wm8350 = rdev_get_drvdata(rdev); int volt_reg, 
dcdc = rdev_get_id(rdev), mV, @@ -397,6 +397,8 @@ static int wm8350_dcdc_set_voltage(struct regulator_dev *rdev, int min_uV, return -EINVAL; } + *selector = mV; + /* all DCDCs have same mV bits */ val = wm8350_reg_read(wm8350, volt_reg) & ~WM8350_DC1_VSEL_MASK; wm8350_reg_write(wm8350, volt_reg, val | mV); @@ -754,7 +756,7 @@ static int wm8350_ldo_set_suspend_disable(struct regulator_dev *rdev) } static int wm8350_ldo_set_voltage(struct regulator_dev *rdev, int min_uV, - int max_uV) + int max_uV, unsigned *selector) { struct wm8350 *wm8350 = rdev_get_drvdata(rdev); int volt_reg, ldo = rdev_get_id(rdev), mV, min_mV = min_uV / 1000, @@ -797,6 +799,8 @@ static int wm8350_ldo_set_voltage(struct regulator_dev *rdev, int min_uV, return -EINVAL; } + *selector = mV; + /* all LDOs have same mV bits */ val = wm8350_reg_read(wm8350, volt_reg) & ~WM8350_LDO1_VSEL_MASK; wm8350_reg_write(wm8350, volt_reg, val | mV); diff --git a/drivers/regulator/wm8400-regulator.c b/drivers/regulator/wm8400-regulator.c index 924c7eb29ee9..b42d01cef35a 100644 --- a/drivers/regulator/wm8400-regulator.c +++ b/drivers/regulator/wm8400-regulator.c @@ -67,7 +67,7 @@ static int wm8400_ldo_get_voltage(struct regulator_dev *dev) } static int wm8400_ldo_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct wm8400 *wm8400 = rdev_get_drvdata(dev); u16 val; @@ -93,6 +93,8 @@ static int wm8400_ldo_set_voltage(struct regulator_dev *dev, val += 0xf; } + *selector = val; + return wm8400_set_bits(wm8400, WM8400_LDO1_CONTROL + rdev_get_id(dev), WM8400_LDO1_VSEL_MASK, val); } @@ -156,7 +158,7 @@ static int wm8400_dcdc_get_voltage(struct regulator_dev *dev) } static int wm8400_dcdc_set_voltage(struct regulator_dev *dev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *selector) { struct wm8400 *wm8400 = rdev_get_drvdata(dev); u16 val; @@ -171,6 +173,8 @@ static int wm8400_dcdc_set_voltage(struct regulator_dev *dev, return -EINVAL; BUG_ON(850000 + (25000 * val) < min_uV); + *selector = val; + return wm8400_set_bits(wm8400, WM8400_DCDC1_CONTROL_1 + offset, WM8400_DC1_VSEL_MASK, val); } diff --git a/drivers/regulator/wm8994-regulator.c b/drivers/regulator/wm8994-regulator.c index 03713bc66e4a..1b162e699368 100644 --- a/drivers/regulator/wm8994-regulator.c +++ b/drivers/regulator/wm8994-regulator.c @@ -101,7 +101,7 @@ static int wm8994_ldo1_get_voltage(struct regulator_dev *rdev) } static int wm8994_ldo1_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *s) { struct wm8994_ldo *ldo = rdev_get_drvdata(rdev); int selector, v; @@ -111,6 +111,7 @@ static int wm8994_ldo1_set_voltage(struct regulator_dev *rdev, if (v < 0 || v > max_uV) return -EINVAL; + *s = selector; selector <<= WM8994_LDO1_VSEL_SHIFT; return wm8994_set_bits(ldo->wm8994, WM8994_LDO_1, @@ -152,7 +153,7 @@ static int wm8994_ldo2_get_voltage(struct regulator_dev *rdev) } static int wm8994_ldo2_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV, unsigned *s) { struct wm8994_ldo *ldo = rdev_get_drvdata(rdev); int selector, v; @@ -162,6 +163,7 @@ static int wm8994_ldo2_set_voltage(struct regulator_dev *rdev, if (v < 0 || v > max_uV) return -EINVAL; + *s = selector; selector <<= WM8994_LDO2_VSEL_SHIFT; return wm8994_set_bits(ldo->wm8994, WM8994_LDO_2, diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 592cd7c642c2..4275cd475eac 100644 --- a/include/linux/regulator/driver.h +++ 
b/include/linux/regulator/driver.h @@ -79,7 +79,8 @@ struct regulator_ops { int (*list_voltage) (struct regulator_dev *, unsigned selector); /* get/set regulator voltage */ - int (*set_voltage) (struct regulator_dev *, int min_uV, int max_uV); + int (*set_voltage) (struct regulator_dev *, int min_uV, int max_uV, + unsigned *selector); int (*get_voltage) (struct regulator_dev *); /* get/set regulator current */ -- cgit v1.2.3-71-gd317 From f8c12fe329c8da9f50d8b2b1183eeaa4d587e747 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 29 Nov 2010 15:55:17 +0000 Subject: regulator: Copy constraints from regulators when initialising them Currently the regulator API uses the constraints structure passed in to the core throughout the lifetime of the object. This means that it is not possible to mark the constraints as __initdata so if the kernel supports many boards the constraints for all of them are kept around throughout the lifetime of the system, consuming memory needlessly. By copying constraints that are actually used we allow the use of __initdata, saving memory when multiple boards are supported. This also means the constraints can be const. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 19 ++++++++++++------- include/linux/regulator/driver.h | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 64a56a7a1f9d..40cf7b9ea943 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -813,23 +813,26 @@ static int machine_constraints_voltage(struct regulator_dev *rdev, * set_mode. */ static int set_machine_constraints(struct regulator_dev *rdev, - struct regulation_constraints *constraints) + const struct regulation_constraints *constraints) { int ret = 0; const char *name; struct regulator_ops *ops = rdev->desc->ops; - rdev->constraints = constraints; + rdev->constraints = kmemdup(constraints, sizeof(*constraints), + GFP_KERNEL); + if (!rdev->constraints) + return -ENOMEM; name = rdev_get_name(rdev); - ret = machine_constraints_voltage(rdev, constraints); + ret = machine_constraints_voltage(rdev, rdev->constraints); if (ret != 0) goto out; /* do we need to setup our suspend state */ if (constraints->initial_state) { - ret = suspend_prepare(rdev, constraints->initial_state); + ret = suspend_prepare(rdev, rdev->constraints->initial_state); if (ret < 0) { pr_err("failed to set suspend state for %s\n", name); @@ -846,7 +849,7 @@ static int set_machine_constraints(struct regulator_dev *rdev, goto out; } - ret = ops->set_mode(rdev, constraints->initial_mode); + ret = ops->set_mode(rdev, rdev->constraints->initial_mode); if (ret < 0) { pr_err("failed to set initial mode for %s: %d\n", name, ret); @@ -857,7 +860,8 @@ static int set_machine_constraints(struct regulator_dev *rdev, /* If the constraints say the regulator should be on at this point * and we have control then make sure it is enabled. */ - if ((constraints->always_on || constraints->boot_on) && ops->enable) { + if ((rdev->constraints->always_on || rdev->constraints->boot_on) && + ops->enable) { ret = ops->enable(rdev); if (ret < 0) { pr_err("failed to enable %s\n", name); @@ -2289,7 +2293,7 @@ static int add_regulator_attributes(struct regulator_dev *rdev) * Returns 0 on success. 
*/ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - struct device *dev, struct regulator_init_data *init_data, + struct device *dev, const struct regulator_init_data *init_data, void *driver_data) { static atomic_t regulator_no = ATOMIC_INIT(0); @@ -2444,6 +2448,7 @@ void regulator_unregister(struct regulator_dev *rdev) if (rdev->supply) sysfs_remove_link(&rdev->dev.kobj, "supply"); device_unregister(&rdev->dev); + kfree(rdev->constraints); mutex_unlock(®ulator_list_mutex); } EXPORT_SYMBOL_GPL(regulator_unregister); diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 4275cd475eac..cce575359712 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -192,7 +192,7 @@ struct regulator_dev { }; struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - struct device *dev, struct regulator_init_data *init_data, + struct device *dev, const struct regulator_init_data *init_data, void *driver_data); void regulator_unregister(struct regulator_dev *rdev); -- cgit v1.2.3-71-gd317 From 476c2d83c7ffb2429b2a504fbdb4326fc8a9d0e8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 10 Dec 2010 17:28:07 +0000 Subject: regulator: Allow drivers to report voltages as selectors Since drivers already have to provide an API for translating selectors into voltages they may as well just report the selector values directly to the core API rather than implement the lookup themselves. The old interface is left in place for now, but may be removed in future. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 25 ++++++++++++++++++++++--- include/linux/regulator/driver.h | 3 +++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 671eb53b5cdc..b362dbde80f7 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -577,7 +577,9 @@ static void drms_uA_update(struct regulator_dev *rdev) err = regulator_check_drms(rdev); if (err < 0 || !rdev->desc->ops->get_optimum_mode || - !rdev->desc->ops->get_voltage || !rdev->desc->ops->set_mode) + (!rdev->desc->ops->get_voltage && + !rdev->desc->ops->get_voltage_sel) || + !rdev->desc->ops->set_mode) return; /* get output voltage */ @@ -1682,7 +1684,14 @@ EXPORT_SYMBOL_GPL(regulator_set_voltage); static int _regulator_get_voltage(struct regulator_dev *rdev) { - /* sanity check */ + int sel; + + if (rdev->desc->ops->get_voltage_sel) { + sel = rdev->desc->ops->get_voltage_sel(rdev); + if (sel < 0) + return sel; + return rdev->desc->ops->list_voltage(rdev, sel); + } if (rdev->desc->ops->get_voltage) return rdev->desc->ops->get_voltage(rdev); else @@ -2191,7 +2200,7 @@ static int add_regulator_attributes(struct regulator_dev *rdev) int status = 0; /* some attributes need specific methods to be displayed */ - if (ops->get_voltage) { + if (ops->get_voltage || ops->get_voltage_sel) { status = device_create_file(dev, &dev_attr_microvolts); if (status < 0) return status; @@ -2327,6 +2336,16 @@ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, if (!init_data) return ERR_PTR(-EINVAL); + /* Only one of each should be implemented */ + WARN_ON(regulator_desc->ops->get_voltage && + regulator_desc->ops->get_voltage_sel); + + /* If we're using selectors we must implement list_voltage. 
*/ + if (regulator_desc->ops->get_voltage_sel && + !regulator_desc->ops->list_voltage) { + return ERR_PTR(-EINVAL); + } + rdev = kzalloc(sizeof(struct regulator_dev), GFP_KERNEL); if (rdev == NULL) return ERR_PTR(-ENOMEM); diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index cce575359712..bf3e653591b9 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -43,6 +43,8 @@ enum regulator_status { * @set_voltage: Set the voltage for the regulator within the range specified. * The driver should select the voltage closest to min_uV. * @get_voltage: Return the currently configured voltage for the regulator. + * @get_voltage_sel: Return the currently configured voltage selector for the + * regulator. * @list_voltage: Return one of the supported voltages, in microvolts; zero * if the selector indicates a voltage that is unusable on this system; * or negative errno. Selectors range from zero to one less than @@ -82,6 +84,7 @@ struct regulator_ops { int (*set_voltage) (struct regulator_dev *, int min_uV, int max_uV, unsigned *selector); int (*get_voltage) (struct regulator_dev *); + int (*get_voltage_sel) (struct regulator_dev *); /* get/set regulator current */ int (*set_current_limit) (struct regulator_dev *, -- cgit v1.2.3-71-gd317 From cb189b07d57b574cc14382e2130960b0a0193c23 Mon Sep 17 00:00:00 2001 From: Bengt Jonsson Date: Fri, 10 Dec 2010 11:08:40 +0100 Subject: regulators: Moved define for number of regulators in ab8500 The define for number of regulators is moved from ab8500-core to ab8500-regulator so that the regulator driver can be updated independently of ab8500-core. This also changes the platform configuration structure of ab8500-core so that it contains a pointer to the regulator_init_data array plus number of regulators instead of an fixed size array of pointers to regulator_init_data. 
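In a board file the new layout amounts to passing an array of regulator_init_data plus its length instead of a fixed array of pointers. A minimal sketch, with the board name, the chosen LDO and the constraint values invented purely for illustration:

#include <linux/kernel.h>
#include <linux/regulator/machine.h>
#include <linux/mfd/ab8500.h>
#include <linux/regulator/ab8500.h>

/* One regulator_init_data per regulator, indexed by the new enum. */
static struct regulator_init_data myboard_ab8500_regulators[AB8500_NUM_REGULATORS] = {
	[AB8500_LDO_AUX1] = {
		.constraints = {
			.name		= "V-AUX1",
			.min_uV		= 2500000,
			.max_uV		= 2900000,
			.valid_ops_mask	= REGULATOR_CHANGE_VOLTAGE,
		},
	},
	/* ... entries for the remaining AB8500_LDO_* regulators ... */
};

static struct ab8500_platform_data myboard_ab8500_pdata = {
	.regulator	= myboard_ab8500_regulators,
	.num_regulator	= ARRAY_SIZE(myboard_ab8500_regulators),
};

Sizing the array with AB8500_NUM_REGULATORS keeps num_regulator in step with the driver's own table, which the probe routine in the diff below now checks explicitly.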
Signed-off-by: Bengt Jonsson Acked-by: Linus Walleij Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/ab8500.c | 8 +++++++- include/linux/mfd/ab8500.h | 5 ++--- include/linux/regulator/ab8500.h | 24 +++++++++++++----------- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 2f4ec0facef1..5670775f6c9b 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -370,6 +370,12 @@ static __devinit int ab8500_regulator_probe(struct platform_device *pdev) } pdata = dev_get_platdata(ab8500->dev); + /* make sure the platform data has the correct size */ + if (pdata->num_regulator != ARRAY_SIZE(ab8500_regulator_info)) { + dev_err(&pdev->dev, "platform configuration error\n"); + return -EINVAL; + } + /* register all regulators */ for (i = 0; i < ARRAY_SIZE(ab8500_regulator_info); i++) { struct ab8500_regulator_info *info = NULL; @@ -380,7 +386,7 @@ static __devinit int ab8500_regulator_probe(struct platform_device *pdev) info->ab8500 = ab8500; info->regulator = regulator_register(&info->desc, &pdev->dev, - pdata->regulator[i], info); + &pdata->regulator[i], info); if (IS_ERR(info->regulator)) { err = PTR_ERR(info->regulator); dev_err(&pdev->dev, "failed to register regulator %s\n", diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h index d63b6050b183..85cf2c28fac6 100644 --- a/include/linux/mfd/ab8500.h +++ b/include/linux/mfd/ab8500.h @@ -99,8 +99,6 @@ #define AB8500_NR_IRQS 104 #define AB8500_NUM_IRQ_REGS 13 -#define AB8500_NUM_REGULATORS 15 - /** * struct ab8500 - ab8500 internal structure * @dev: parent device @@ -145,7 +143,8 @@ struct regulator_init_data; struct ab8500_platform_data { int irq_base; void (*init) (struct ab8500 *); - struct regulator_init_data *regulator[AB8500_NUM_REGULATORS]; + int num_regulator; + struct regulator_init_data *regulator; }; extern int __devinit ab8500_init(struct ab8500 *ab8500); diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index f509877c2ed4..6a210f1511fc 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -11,15 +11,17 @@ #define __LINUX_MFD_AB8500_REGULATOR_H /* AB8500 regulators */ -#define AB8500_LDO_AUX1 0 -#define AB8500_LDO_AUX2 1 -#define AB8500_LDO_AUX3 2 -#define AB8500_LDO_INTCORE 3 -#define AB8500_LDO_TVOUT 4 -#define AB8500_LDO_AUDIO 5 -#define AB8500_LDO_ANAMIC1 6 -#define AB8500_LDO_ANAMIC2 7 -#define AB8500_LDO_DMIC 8 -#define AB8500_LDO_ANA 9 - +enum ab8500_regulator_id { + AB8500_LDO_AUX1, + AB8500_LDO_AUX2, + AB8500_LDO_AUX3, + AB8500_LDO_INTCORE, + AB8500_LDO_TVOUT, + AB8500_LDO_AUDIO, + AB8500_LDO_ANAMIC1, + AB8500_LDO_ANAMIC2, + AB8500_LDO_DMIC, + AB8500_LDO_ANA, + AB8500_NUM_REGULATORS, +}; #endif -- cgit v1.2.3-71-gd317 From 57c78e359a35c69eca4c88f107500f74ef7f0acf Mon Sep 17 00:00:00 2001 From: Yong Shen Date: Tue, 14 Dec 2010 14:00:53 +0800 Subject: Change the register name definitions for mc13783 To make mc13783 and mc13892 share code, the register names should be changed to fit the new macro definitions in the comming patch. 
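For existing board support the change is purely mechanical, only the identifier prefix moves from SW_/REGU_ to REG_. A hypothetical board table, where the init_data pointers stand in for whatever regulator_init_data the board already defines, now reads:

#include <linux/mfd/mc13783.h>

/* myboard_sdhc_data / myboard_cam_data: the board's existing regulator_init_data. */
static struct mc13783_regulator_init_data myboard_mc13783_regulators[] = {
	{
		.id		= MC13783_REG_VMMC1,	/* was MC13783_REGU_VMMC1 */
		.init_data	= &myboard_sdhc_data,
	}, {
		.id		= MC13783_REG_VCAM,	/* was MC13783_REGU_VCAM */
		.init_data	= &myboard_cam_data,
	},
};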
Signed-off-by: Yong Shen Acked-by: Sascha Hauer Acked-by: Samuel Ortiz Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- arch/arm/mach-imx/mach-pcm038.c | 4 +-- arch/arm/mach-mx3/mach-mx31_3ds.c | 4 +-- arch/arm/mach-mx3/mach-mx31moboard.c | 4 +-- drivers/regulator/mc13783-regulator.c | 30 ++++++++-------- include/linux/mfd/mc13783.h | 67 ++++++++++++++++++----------------- 5 files changed, 55 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-imx/mach-pcm038.c b/arch/arm/mach-imx/mach-pcm038.c index f667a262dfc1..505614803bc6 100644 --- a/arch/arm/mach-imx/mach-pcm038.c +++ b/arch/arm/mach-imx/mach-pcm038.c @@ -254,10 +254,10 @@ static struct regulator_init_data cam_data = { static struct mc13783_regulator_init_data pcm038_regulators[] = { { - .id = MC13783_REGU_VCAM, + .id = MC13783_REG_VCAM, .init_data = &cam_data, }, { - .id = MC13783_REGU_VMMC1, + .id = MC13783_REG_VMMC1, .init_data = &sdhc1_data, }, }; diff --git a/arch/arm/mach-mx3/mach-mx31_3ds.c b/arch/arm/mach-mx3/mach-mx31_3ds.c index 4e516b49a901..899a969e92fa 100644 --- a/arch/arm/mach-mx3/mach-mx31_3ds.c +++ b/arch/arm/mach-mx3/mach-mx31_3ds.c @@ -140,10 +140,10 @@ static struct regulator_init_data gpo_init = { static struct mc13783_regulator_init_data mx31_3ds_regulators[] = { { - .id = MC13783_REGU_PWGT1SPI, /* Power Gate for ARM core. */ + .id = MC13783_REG_PWGT1SPI, /* Power Gate for ARM core. */ .init_data = &pwgtx_init, }, { - .id = MC13783_REGU_PWGT2SPI, /* Power Gate for L2 Cache. */ + .id = MC13783_REG_PWGT2SPI, /* Power Gate for L2 Cache. */ .init_data = &pwgtx_init, }, { diff --git a/arch/arm/mach-mx3/mach-mx31moboard.c b/arch/arm/mach-mx3/mach-mx31moboard.c index 203d21a510aa..1aa8d65fccbb 100644 --- a/arch/arm/mach-mx3/mach-mx31moboard.c +++ b/arch/arm/mach-mx3/mach-mx31moboard.c @@ -216,11 +216,11 @@ static struct regulator_init_data cam_vreg_data = { static struct mc13783_regulator_init_data moboard_regulators[] = { { - .id = MC13783_REGU_VMMC1, + .id = MC13783_REG_VMMC1, .init_data = &sdhc_vreg_data, }, { - .id = MC13783_REGU_VCAM, + .id = MC13783_REG_VCAM, .init_data = &cam_vreg_data, }, }; diff --git a/drivers/regulator/mc13783-regulator.c b/drivers/regulator/mc13783-regulator.c index e99917a63ed4..6a6e1d63d86f 100644 --- a/drivers/regulator/mc13783-regulator.c +++ b/drivers/regulator/mc13783-regulator.c @@ -228,15 +228,15 @@ static struct regulator_ops mc13783_gpo_regulator_ops; } #define MC13783_DEFINE_SW(_name, _reg, _vsel_reg, _voltages) \ - MC13783_DEFINE(SW, _name, _reg, _vsel_reg, _voltages) + MC13783_DEFINE(REG, _name, _reg, _vsel_reg, _voltages) #define MC13783_DEFINE_REGU(_name, _reg, _vsel_reg, _voltages) \ - MC13783_DEFINE(REGU, _name, _reg, _vsel_reg, _voltages) + MC13783_DEFINE(REG, _name, _reg, _vsel_reg, _voltages) static struct mc13783_regulator mc13783_regulators[] = { MC13783_DEFINE_SW(SW3, SWITCHERS5, SWITCHERS5, mc13783_sw3_val), - MC13783_FIXED_DEFINE(REGU, VAUDIO, REGULATORMODE0, mc13783_vaudio_val), - MC13783_FIXED_DEFINE(REGU, VIOHI, REGULATORMODE0, mc13783_viohi_val), + MC13783_FIXED_DEFINE(REG, VAUDIO, REGULATORMODE0, mc13783_vaudio_val), + MC13783_FIXED_DEFINE(REG, VIOHI, REGULATORMODE0, mc13783_viohi_val), MC13783_DEFINE_REGU(VIOLO, REGULATORMODE0, REGULATORSETTING0, \ mc13783_violo_val), MC13783_DEFINE_REGU(VDIG, REGULATORMODE0, REGULATORSETTING0, \ @@ -255,7 +255,7 @@ static struct mc13783_regulator mc13783_regulators[] = { mc13783_vesim_val), MC13783_DEFINE_REGU(VCAM, REGULATORMODE1, REGULATORSETTING0, \ mc13783_vcam_val), - 
MC13783_FIXED_DEFINE(REGU, VRFBG, REGULATORMODE1, mc13783_vrfbg_val), + MC13783_FIXED_DEFINE(REG, VRFBG, REGULATORMODE1, mc13783_vrfbg_val), MC13783_DEFINE_REGU(VVIB, REGULATORMODE1, REGULATORSETTING1, \ mc13783_vvib_val), MC13783_DEFINE_REGU(VRF1, REGULATORMODE1, REGULATORSETTING1, \ @@ -266,12 +266,12 @@ static struct mc13783_regulator mc13783_regulators[] = { mc13783_vmmc_val), MC13783_DEFINE_REGU(VMMC2, REGULATORMODE1, REGULATORSETTING1, \ mc13783_vmmc_val), - MC13783_GPO_DEFINE(REGU, GPO1, POWERMISC, mc13783_gpo_val), - MC13783_GPO_DEFINE(REGU, GPO2, POWERMISC, mc13783_gpo_val), - MC13783_GPO_DEFINE(REGU, GPO3, POWERMISC, mc13783_gpo_val), - MC13783_GPO_DEFINE(REGU, GPO4, POWERMISC, mc13783_gpo_val), - MC13783_GPO_DEFINE(REGU, PWGT1SPI, POWERMISC, mc13783_pwgtdrv_val), - MC13783_GPO_DEFINE(REGU, PWGT2SPI, POWERMISC, mc13783_pwgtdrv_val), + MC13783_GPO_DEFINE(REG, GPO1, POWERMISC, mc13783_gpo_val), + MC13783_GPO_DEFINE(REG, GPO2, POWERMISC, mc13783_gpo_val), + MC13783_GPO_DEFINE(REG, GPO3, POWERMISC, mc13783_gpo_val), + MC13783_GPO_DEFINE(REG, GPO4, POWERMISC, mc13783_gpo_val), + MC13783_GPO_DEFINE(REG, PWGT1SPI, POWERMISC, mc13783_pwgtdrv_val), + MC13783_GPO_DEFINE(REG, PWGT2SPI, POWERMISC, mc13783_pwgtdrv_val), }; struct mc13783_regulator_priv { @@ -508,8 +508,8 @@ static int mc13783_gpo_regulator_enable(struct regulator_dev *rdev) dev_dbg(rdev_get_dev(rdev), "%s id: %d\n", __func__, id); /* Power Gate enable value is 0 */ - if (id == MC13783_REGU_PWGT1SPI || - id == MC13783_REGU_PWGT2SPI) + if (id == MC13783_REG_PWGT1SPI || + id == MC13783_REG_PWGT2SPI) en_val = 0; mc13783_lock(priv->mc13783); @@ -530,8 +530,8 @@ static int mc13783_gpo_regulator_disable(struct regulator_dev *rdev) dev_dbg(rdev_get_dev(rdev), "%s id: %d\n", __func__, id); /* Power Gate disable value is 1 */ - if (id == MC13783_REGU_PWGT1SPI || - id == MC13783_REGU_PWGT2SPI) + if (id == MC13783_REG_PWGT1SPI || + id == MC13783_REG_PWGT2SPI) dis_val = mc13783_regulators[id].enable_bit; mc13783_lock(priv->mc13783); diff --git a/include/linux/mfd/mc13783.h b/include/linux/mfd/mc13783.h index b4c741e352c2..7d0f3d6a0002 100644 --- a/include/linux/mfd/mc13783.h +++ b/include/linux/mfd/mc13783.h @@ -1,4 +1,5 @@ /* + * Copyright 2010 Yong Shen * Copyright 2009-2010 Pengutronix * Uwe Kleine-Koenig * @@ -122,39 +123,39 @@ int mc13783_adc_do_conversion(struct mc13783 *mc13783, unsigned int mode, unsigned int channel, unsigned int *sample); -#define MC13783_SW_SW1A 0 -#define MC13783_SW_SW1B 1 -#define MC13783_SW_SW2A 2 -#define MC13783_SW_SW2B 3 -#define MC13783_SW_SW3 4 -#define MC13783_SW_PLL 5 -#define MC13783_REGU_VAUDIO 6 -#define MC13783_REGU_VIOHI 7 -#define MC13783_REGU_VIOLO 8 -#define MC13783_REGU_VDIG 9 -#define MC13783_REGU_VGEN 10 -#define MC13783_REGU_VRFDIG 11 -#define MC13783_REGU_VRFREF 12 -#define MC13783_REGU_VRFCP 13 -#define MC13783_REGU_VSIM 14 -#define MC13783_REGU_VESIM 15 -#define MC13783_REGU_VCAM 16 -#define MC13783_REGU_VRFBG 17 -#define MC13783_REGU_VVIB 18 -#define MC13783_REGU_VRF1 19 -#define MC13783_REGU_VRF2 20 -#define MC13783_REGU_VMMC1 21 -#define MC13783_REGU_VMMC2 22 -#define MC13783_REGU_GPO1 23 -#define MC13783_REGU_GPO2 24 -#define MC13783_REGU_GPO3 25 -#define MC13783_REGU_GPO4 26 -#define MC13783_REGU_V1 27 -#define MC13783_REGU_V2 28 -#define MC13783_REGU_V3 29 -#define MC13783_REGU_V4 30 -#define MC13783_REGU_PWGT1SPI 31 -#define MC13783_REGU_PWGT2SPI 32 +#define MC13783_REG_SW1A 0 +#define MC13783_REG_SW1B 1 +#define MC13783_REG_SW2A 2 +#define MC13783_REG_SW2B 3 +#define 
MC13783_REG_SW3 4 +#define MC13783_REG_PLL 5 +#define MC13783_REG_VAUDIO 6 +#define MC13783_REG_VIOHI 7 +#define MC13783_REG_VIOLO 8 +#define MC13783_REG_VDIG 9 +#define MC13783_REG_VGEN 10 +#define MC13783_REG_VRFDIG 11 +#define MC13783_REG_VRFREF 12 +#define MC13783_REG_VRFCP 13 +#define MC13783_REG_VSIM 14 +#define MC13783_REG_VESIM 15 +#define MC13783_REG_VCAM 16 +#define MC13783_REG_VRFBG 17 +#define MC13783_REG_VVIB 18 +#define MC13783_REG_VRF1 19 +#define MC13783_REG_VRF2 20 +#define MC13783_REG_VMMC1 21 +#define MC13783_REG_VMMC2 22 +#define MC13783_REG_GPO1 23 +#define MC13783_REG_GPO2 24 +#define MC13783_REG_GPO3 25 +#define MC13783_REG_GPO4 26 +#define MC13783_REG_V1 27 +#define MC13783_REG_V2 28 +#define MC13783_REG_V3 29 +#define MC13783_REG_V4 30 +#define MC13783_REG_PWGT1SPI 31 +#define MC13783_REG_PWGT2SPI 32 #define MC13783_IRQ_ADCDONE MC13XXX_IRQ_ADCDONE #define MC13783_IRQ_ADCBISDONE MC13XXX_IRQ_ADCBISDONE -- cgit v1.2.3-71-gd317 From 5e428d5cecc3f109b52e993a1bd91f82137867b3 Mon Sep 17 00:00:00 2001 From: Yong Shen Date: Tue, 14 Dec 2010 14:00:55 +0800 Subject: regulator: support PMIC mc13892 add support for mc13892, tested on mx51 babbage board Signed-off-by: Arnaud Patard Signed-off-by: Yong Shen Acked-by: Sascha Hauer Acked-by: Samuel Ortiz Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 8 + drivers/regulator/Makefile | 1 + drivers/regulator/mc13892-regulator.c | 635 ++++++++++++++++++++++++++++++++++ include/linux/mfd/mc13892.h | 39 +++ 4 files changed, 683 insertions(+) create mode 100644 drivers/regulator/mc13892-regulator.c create mode 100644 include/linux/mfd/mc13892.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index ce17a038b0b8..485a9bc88ca9 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -197,6 +197,14 @@ config REGULATOR_MC13783 Say y here to support the regulators found on the Freescale MC13783 PMIC. +config REGULATOR_MC13892 + tristate "Support regulators on Freescale MC13892 PMIC" + depends on MFD_MC13XXX + select REGULATOR_MC13XXX_CORE + help + Say y here to support the regulators found on the Freescale MC13892 + PMIC. + config REGULATOR_AB3100 tristate "ST-Ericsson AB3100 Regulator functions" depends on AB3100_CORE diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 0b33bac4f2ab..0b5e88c2b8d7 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_REGULATOR_DA903X) += da903x.o obj-$(CONFIG_REGULATOR_PCF50633) += pcf50633-regulator.o obj-$(CONFIG_REGULATOR_PCAP) += pcap-regulator.o obj-$(CONFIG_REGULATOR_MC13783) += mc13783-regulator.o +obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o obj-$(CONFIG_REGULATOR_MC13XXX_CORE) += mc13xxx-regulator-core.o obj-$(CONFIG_REGULATOR_AB3100) += ab3100.o diff --git a/drivers/regulator/mc13892-regulator.c b/drivers/regulator/mc13892-regulator.c new file mode 100644 index 000000000000..ca1d30adad99 --- /dev/null +++ b/drivers/regulator/mc13892-regulator.c @@ -0,0 +1,635 @@ +/* + * Regulator Driver for Freescale MC13892 PMIC + * + * Copyright 2010 Yong Shen + * + * Based on draft driver from Arnaud Patard + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "mc13xxx.h" + +#define MC13892_REVISION 7 + +#define MC13892_POWERCTL0 13 +#define MC13892_POWERCTL0_USEROFFSPI 3 +#define MC13892_POWERCTL0_VCOINCELLVSEL 20 +#define MC13892_POWERCTL0_VCOINCELLVSEL_M (7<<20) +#define MC13892_POWERCTL0_VCOINCELLEN (1<<23) + +#define MC13892_SWITCHERS0_SWxHI (1<<23) + +#define MC13892_SWITCHERS0 24 +#define MC13892_SWITCHERS0_SW1VSEL 0 +#define MC13892_SWITCHERS0_SW1VSEL_M (0x1f<<0) +#define MC13892_SWITCHERS0_SW1HI (1<<23) +#define MC13892_SWITCHERS0_SW1EN 0 + +#define MC13892_SWITCHERS1 25 +#define MC13892_SWITCHERS1_SW2VSEL 0 +#define MC13892_SWITCHERS1_SW2VSEL_M (0x1f<<0) +#define MC13892_SWITCHERS1_SW2HI (1<<23) +#define MC13892_SWITCHERS1_SW2EN 0 + +#define MC13892_SWITCHERS2 26 +#define MC13892_SWITCHERS2_SW3VSEL 0 +#define MC13892_SWITCHERS2_SW3VSEL_M (0x1f<<0) +#define MC13892_SWITCHERS2_SW3HI (1<<23) +#define MC13892_SWITCHERS2_SW3EN 0 + +#define MC13892_SWITCHERS3 27 +#define MC13892_SWITCHERS3_SW4VSEL 0 +#define MC13892_SWITCHERS3_SW4VSEL_M (0x1f<<0) +#define MC13892_SWITCHERS3_SW4HI (1<<23) +#define MC13892_SWITCHERS3_SW4EN 0 + +#define MC13892_SWITCHERS4 28 +#define MC13892_SWITCHERS4_SW1MODE 0 +#define MC13892_SWITCHERS4_SW1MODE_AUTO (8<<0) +#define MC13892_SWITCHERS4_SW1MODE_M (0xf<<0) +#define MC13892_SWITCHERS4_SW2MODE 10 +#define MC13892_SWITCHERS4_SW2MODE_AUTO (8<<10) +#define MC13892_SWITCHERS4_SW2MODE_M (0xf<<10) + +#define MC13892_SWITCHERS5 29 +#define MC13892_SWITCHERS5_SW3MODE 0 +#define MC13892_SWITCHERS5_SW3MODE_AUTO (8<<0) +#define MC13892_SWITCHERS5_SW3MODE_M (0xf<<0) +#define MC13892_SWITCHERS5_SW4MODE 8 +#define MC13892_SWITCHERS5_SW4MODE_AUTO (8<<8) +#define MC13892_SWITCHERS5_SW4MODE_M (0xf<<8) +#define MC13892_SWITCHERS5_SWBSTEN (1<<20) + +#define MC13892_REGULATORSETTING0 30 +#define MC13892_REGULATORSETTING0_VGEN1VSEL 0 +#define MC13892_REGULATORSETTING0_VDIGVSEL 4 +#define MC13892_REGULATORSETTING0_VGEN2VSEL 6 +#define MC13892_REGULATORSETTING0_VPLLVSEL 9 +#define MC13892_REGULATORSETTING0_VUSB2VSEL 11 +#define MC13892_REGULATORSETTING0_VGEN3VSEL 14 +#define MC13892_REGULATORSETTING0_VCAMVSEL 16 + +#define MC13892_REGULATORSETTING0_VGEN1VSEL_M (3<<0) +#define MC13892_REGULATORSETTING0_VDIGVSEL_M (3<<4) +#define MC13892_REGULATORSETTING0_VGEN2VSEL_M (7<<6) +#define MC13892_REGULATORSETTING0_VPLLVSEL_M (3<<9) +#define MC13892_REGULATORSETTING0_VUSB2VSEL_M (3<<11) +#define MC13892_REGULATORSETTING0_VGEN3VSEL_M (1<<14) +#define MC13892_REGULATORSETTING0_VCAMVSEL_M (3<<16) + +#define MC13892_REGULATORSETTING1 31 +#define MC13892_REGULATORSETTING1_VVIDEOVSEL 2 +#define MC13892_REGULATORSETTING1_VAUDIOVSEL 4 +#define MC13892_REGULATORSETTING1_VSDVSEL 6 + +#define MC13892_REGULATORSETTING1_VVIDEOVSEL_M (3<<2) +#define MC13892_REGULATORSETTING1_VAUDIOVSEL_M (3<<4) +#define MC13892_REGULATORSETTING1_VSDVSEL_M (7<<6) + +#define MC13892_REGULATORMODE0 32 +#define MC13892_REGULATORMODE0_VGEN1EN (1<<0) +#define MC13892_REGULATORMODE0_VGEN1STDBY (1<<1) +#define MC13892_REGULATORMODE0_VGEN1MODE (1<<2) +#define MC13892_REGULATORMODE0_VIOHIEN (1<<3) +#define MC13892_REGULATORMODE0_VIOHISTDBY (1<<4) +#define MC13892_REGULATORMODE0_VIOHIMODE (1<<5) +#define MC13892_REGULATORMODE0_VDIGEN (1<<9) +#define MC13892_REGULATORMODE0_VDIGSTDBY (1<<10) +#define MC13892_REGULATORMODE0_VDIGMODE (1<<11) +#define MC13892_REGULATORMODE0_VGEN2EN (1<<12) +#define MC13892_REGULATORMODE0_VGEN2STDBY (1<<13) +#define MC13892_REGULATORMODE0_VGEN2MODE (1<<14) 
+#define MC13892_REGULATORMODE0_VPLLEN (1<<15) +#define MC13892_REGULATORMODE0_VPLLSTDBY (1<<16) +#define MC13892_REGULATORMODE0_VPLLMODE (1<<17) +#define MC13892_REGULATORMODE0_VUSB2EN (1<<18) +#define MC13892_REGULATORMODE0_VUSB2STDBY (1<<19) +#define MC13892_REGULATORMODE0_VUSB2MODE (1<<20) + +#define MC13892_REGULATORMODE1 33 +#define MC13892_REGULATORMODE1_VGEN3EN (1<<0) +#define MC13892_REGULATORMODE1_VGEN3STDBY (1<<1) +#define MC13892_REGULATORMODE1_VGEN3MODE (1<<2) +#define MC13892_REGULATORMODE1_VCAMEN (1<<6) +#define MC13892_REGULATORMODE1_VCAMSTDBY (1<<7) +#define MC13892_REGULATORMODE1_VCAMMODE (1<<8) +#define MC13892_REGULATORMODE1_VCAMCONFIGEN (1<<9) +#define MC13892_REGULATORMODE1_VVIDEOEN (1<<12) +#define MC13892_REGULATORMODE1_VVIDEOSTDBY (1<<13) +#define MC13892_REGULATORMODE1_VVIDEOMODE (1<<14) +#define MC13892_REGULATORMODE1_VAUDIOEN (1<<15) +#define MC13892_REGULATORMODE1_VAUDIOSTDBY (1<<16) +#define MC13892_REGULATORMODE1_VAUDIOMODE (1<<17) +#define MC13892_REGULATORMODE1_VSDEN (1<<18) +#define MC13892_REGULATORMODE1_VSDSTDBY (1<<19) +#define MC13892_REGULATORMODE1_VSDMODE (1<<20) + +#define MC13892_POWERMISC 34 +#define MC13892_POWERMISC_GPO1EN (1<<6) +#define MC13892_POWERMISC_GPO2EN (1<<8) +#define MC13892_POWERMISC_GPO3EN (1<<10) +#define MC13892_POWERMISC_GPO4EN (1<<12) +#define MC13892_POWERMISC_PWGT1SPIEN (1<<15) +#define MC13892_POWERMISC_PWGT2SPIEN (1<<16) +#define MC13892_POWERMISC_GPO4ADINEN (1<<21) + +#define MC13892_POWERMISC_PWGTSPI_M (3 << 15) + +#define MC13892_USB1 50 +#define MC13892_USB1_VUSBEN (1<<3) + +static const int mc13892_vcoincell[] = { + 2500000, 2700000, 2800000, 2900000, 3000000, 3100000, + 3200000, 3300000, +}; + +static const int mc13892_sw1[] = { + 600000, 625000, 650000, 675000, 700000, 725000, + 750000, 775000, 800000, 825000, 850000, 875000, + 900000, 925000, 950000, 975000, 1000000, 1025000, + 1050000, 1075000, 1100000, 1125000, 1150000, 1175000, + 1200000, 1225000, 1250000, 1275000, 1300000, 1325000, + 1350000, 1375000 +}; + +static const int mc13892_sw[] = { + 600000, 625000, 650000, 675000, 700000, 725000, + 750000, 775000, 800000, 825000, 850000, 875000, + 900000, 925000, 950000, 975000, 1000000, 1025000, + 1050000, 1075000, 1100000, 1125000, 1150000, 1175000, + 1200000, 1225000, 1250000, 1275000, 1300000, 1325000, + 1350000, 1375000, 1400000, 1425000, 1450000, 1475000, + 1500000, 1525000, 1550000, 1575000, 1600000, 1625000, + 1650000, 1675000, 1700000, 1725000, 1750000, 1775000, + 1800000, 1825000, 1850000, 1875000 +}; + +static const int mc13892_swbst[] = { + 5000000, +}; + +static const int mc13892_viohi[] = { + 2775000, +}; + +static const int mc13892_vpll[] = { + 1050000, 1250000, 1650000, 1800000, +}; + +static const int mc13892_vdig[] = { + 1050000, 1250000, 1650000, 1800000, +}; + +static const int mc13892_vsd[] = { + 1800000, 2000000, 2600000, 2700000, + 2800000, 2900000, 3000000, 3150000, +}; + +static const int mc13892_vusb2[] = { + 2400000, 2600000, 2700000, 2775000, +}; + +static const int mc13892_vvideo[] = { + 2700000, 2775000, 2500000, 2600000, +}; + +static const int mc13892_vaudio[] = { + 2300000, 2500000, 2775000, 3000000, +}; + +static const int mc13892_vcam[] = { + 2500000, 2600000, 2750000, 3000000, +}; + +static const int mc13892_vgen1[] = { + 1200000, 1500000, 2775000, 3150000, +}; + +static const int mc13892_vgen2[] = { + 1200000, 1500000, 1600000, 1800000, + 2700000, 2800000, 3000000, 3150000, +}; + +static const int mc13892_vgen3[] = { + 1800000, 2900000, +}; + +static const int mc13892_vusb[] = { + 
3300000, +}; + +static const int mc13892_gpo[] = { + 2750000, +}; + +static const int mc13892_pwgtdrv[] = { + 5000000, +}; + +static struct regulator_ops mc13892_gpo_regulator_ops; +/* sw regulators need special care due to the "hi bit" */ +static struct regulator_ops mc13892_sw_regulator_ops; + + +#define MC13892_FIXED_DEFINE(name, reg, voltages) \ + MC13xxx_FIXED_DEFINE(MC13892_, name, reg, voltages, \ + mc13xxx_fixed_regulator_ops) + +#define MC13892_GPO_DEFINE(name, reg, voltages) \ + MC13xxx_GPO_DEFINE(MC13892_, name, reg, voltages, \ + mc13892_gpo_regulator_ops) + +#define MC13892_SW_DEFINE(name, reg, vsel_reg, voltages) \ + MC13xxx_DEFINE(MC13892_, name, reg, vsel_reg, voltages, \ + mc13892_sw_regulator_ops) + +#define MC13892_DEFINE_REGU(name, reg, vsel_reg, voltages) \ + MC13xxx_DEFINE(MC13892_, name, reg, vsel_reg, voltages, \ + mc13xxx_regulator_ops) + +static struct mc13xxx_regulator mc13892_regulators[] = { + MC13892_DEFINE_REGU(VCOINCELL, POWERCTL0, POWERCTL0, mc13892_vcoincell), + MC13892_SW_DEFINE(SW1, SWITCHERS0, SWITCHERS0, mc13892_sw1), + MC13892_SW_DEFINE(SW2, SWITCHERS1, SWITCHERS1, mc13892_sw), + MC13892_SW_DEFINE(SW3, SWITCHERS2, SWITCHERS2, mc13892_sw), + MC13892_SW_DEFINE(SW4, SWITCHERS3, SWITCHERS3, mc13892_sw), + MC13892_FIXED_DEFINE(SWBST, SWITCHERS5, mc13892_swbst), + MC13892_FIXED_DEFINE(VIOHI, REGULATORMODE0, mc13892_viohi), + MC13892_DEFINE_REGU(VPLL, REGULATORMODE0, REGULATORSETTING0, \ + mc13892_vpll), + MC13892_DEFINE_REGU(VDIG, REGULATORMODE0, REGULATORSETTING0, \ + mc13892_vdig), + MC13892_DEFINE_REGU(VSD, REGULATORMODE1, REGULATORSETTING1, \ + mc13892_vsd), + MC13892_DEFINE_REGU(VUSB2, REGULATORMODE0, REGULATORSETTING0, \ + mc13892_vusb2), + MC13892_DEFINE_REGU(VVIDEO, REGULATORMODE1, REGULATORSETTING1, \ + mc13892_vvideo), + MC13892_DEFINE_REGU(VAUDIO, REGULATORMODE1, REGULATORSETTING1, \ + mc13892_vaudio), + MC13892_DEFINE_REGU(VCAM, REGULATORMODE1, REGULATORSETTING0, \ + mc13892_vcam), + MC13892_DEFINE_REGU(VGEN1, REGULATORMODE0, REGULATORSETTING0, \ + mc13892_vgen1), + MC13892_DEFINE_REGU(VGEN2, REGULATORMODE0, REGULATORSETTING0, \ + mc13892_vgen2), + MC13892_DEFINE_REGU(VGEN3, REGULATORMODE1, REGULATORSETTING0, \ + mc13892_vgen3), + MC13892_FIXED_DEFINE(VUSB, USB1, mc13892_vusb), + MC13892_GPO_DEFINE(GPO1, POWERMISC, mc13892_gpo), + MC13892_GPO_DEFINE(GPO2, POWERMISC, mc13892_gpo), + MC13892_GPO_DEFINE(GPO3, POWERMISC, mc13892_gpo), + MC13892_GPO_DEFINE(GPO4, POWERMISC, mc13892_gpo), + MC13892_GPO_DEFINE(PWGT1SPI, POWERMISC, mc13892_pwgtdrv), + MC13892_GPO_DEFINE(PWGT2SPI, POWERMISC, mc13892_pwgtdrv), +}; + +int mc13892_powermisc_rmw(struct mc13xxx_regulator_priv *priv, u32 mask, + u32 val) +{ + struct mc13xxx *mc13892 = priv->mc13xxx; + int ret; + u32 valread; + + BUG_ON(val & ~mask); + + ret = mc13xxx_reg_read(mc13892, MC13892_POWERMISC, &valread); + if (ret) + return ret; + + /* Update the stored state for Power Gates. 
*/ + priv->powermisc_pwgt_state = + (priv->powermisc_pwgt_state & ~mask) | val; + priv->powermisc_pwgt_state &= MC13892_POWERMISC_PWGTSPI_M; + + /* Construct the new register value */ + valread = (valread & ~mask) | val; + /* Overwrite the PWGTxEN with the stored version */ + valread = (valread & ~MC13892_POWERMISC_PWGTSPI_M) | + priv->powermisc_pwgt_state; + + return mc13xxx_reg_write(mc13892, MC13892_POWERMISC, valread); +} + +static int mc13892_gpo_regulator_enable(struct regulator_dev *rdev) +{ + struct mc13xxx_regulator_priv *priv = rdev_get_drvdata(rdev); + int id = rdev_get_id(rdev); + int ret; + u32 en_val = mc13892_regulators[id].enable_bit; + u32 mask = mc13892_regulators[id].enable_bit; + + dev_dbg(rdev_get_dev(rdev), "%s id: %d\n", __func__, id); + + /* Power Gate enable value is 0 */ + if (id == MC13892_PWGT1SPI || id == MC13892_PWGT2SPI) + en_val = 0; + + if (id == MC13892_GPO4) + mask |= MC13892_POWERMISC_GPO4ADINEN; + + mc13xxx_lock(priv->mc13xxx); + ret = mc13892_powermisc_rmw(priv, mask, en_val); + mc13xxx_unlock(priv->mc13xxx); + + return ret; +} + +static int mc13892_gpo_regulator_disable(struct regulator_dev *rdev) +{ + struct mc13xxx_regulator_priv *priv = rdev_get_drvdata(rdev); + int id = rdev_get_id(rdev); + int ret; + u32 dis_val = 0; + + dev_dbg(rdev_get_dev(rdev), "%s id: %d\n", __func__, id); + + /* Power Gate disable value is 1 */ + if (id == MC13892_PWGT1SPI || id == MC13892_PWGT2SPI) + dis_val = mc13892_regulators[id].enable_bit; + + mc13xxx_lock(priv->mc13xxx); + ret = mc13892_powermisc_rmw(priv, mc13892_regulators[id].enable_bit, + dis_val); + mc13xxx_unlock(priv->mc13xxx); + + return ret; +} + +static int mc13892_gpo_regulator_is_enabled(struct regulator_dev *rdev) +{ + struct mc13xxx_regulator_priv *priv = rdev_get_drvdata(rdev); + int ret, id = rdev_get_id(rdev); + unsigned int val; + + mc13xxx_lock(priv->mc13xxx); + ret = mc13xxx_reg_read(priv->mc13xxx, mc13892_regulators[id].reg, &val); + mc13xxx_unlock(priv->mc13xxx); + + if (ret) + return ret; + + /* Power Gates state is stored in powermisc_pwgt_state + * where the meaning of bits is negated */ + val = (val & ~MC13892_POWERMISC_PWGTSPI_M) | + (priv->powermisc_pwgt_state ^ MC13892_POWERMISC_PWGTSPI_M); + + return (val & mc13892_regulators[id].enable_bit) != 0; +} + + +static struct regulator_ops mc13892_gpo_regulator_ops = { + .enable = mc13892_gpo_regulator_enable, + .disable = mc13892_gpo_regulator_disable, + .is_enabled = mc13892_gpo_regulator_is_enabled, + .list_voltage = mc13xxx_regulator_list_voltage, + .set_voltage = mc13xxx_fixed_regulator_set_voltage, + .get_voltage = mc13xxx_fixed_regulator_get_voltage, +}; + +static int mc13892_sw_regulator_get_voltage(struct regulator_dev *rdev) +{ + struct mc13xxx_regulator_priv *priv = rdev_get_drvdata(rdev); + int ret, id = rdev_get_id(rdev); + unsigned int val, hi; + + dev_dbg(rdev_get_dev(rdev), "%s id: %d\n", __func__, id); + + mc13xxx_lock(priv->mc13xxx); + ret = mc13xxx_reg_read(priv->mc13xxx, + mc13892_regulators[id].vsel_reg, &val); + mc13xxx_unlock(priv->mc13xxx); + if (ret) + return ret; + + hi = val & MC13892_SWITCHERS0_SWxHI; + val = (val & mc13892_regulators[id].vsel_mask) + >> mc13892_regulators[id].vsel_shift; + + dev_dbg(rdev_get_dev(rdev), "%s id: %d val: %d\n", __func__, id, val); + + if (hi) + val = (25000 * val) + 1100000; + else + val = (25000 * val) + 600000; + + return val; +} + +static int mc13892_sw_regulator_set_voltage(struct regulator_dev *rdev, + int min_uV, int max_uV, unsigned *selector) +{ + struct mc13xxx_regulator_priv 
*priv = rdev_get_drvdata(rdev); + int hi, value, val, mask, id = rdev_get_id(rdev); + int ret; + + dev_dbg(rdev_get_dev(rdev), "%s id: %d min_uV: %d max_uV: %d\n", + __func__, id, min_uV, max_uV); + + /* Find the best index */ + value = mc13xxx_get_best_voltage_index(rdev, min_uV, max_uV); + dev_dbg(rdev_get_dev(rdev), "%s best value: %d\n", __func__, value); + if (value < 0) + return value; + + value = mc13892_regulators[id].voltages[value]; + + mc13xxx_lock(priv->mc13xxx); + ret = mc13xxx_reg_read(priv->mc13xxx, + mc13892_regulators[id].vsel_reg, &val); + if (ret) + goto err; + + hi = val & MC13892_SWITCHERS0_SWxHI; + if (value > 1375) + hi = 1; + if (value < 1100) + hi = 0; + + if (hi) { + value = (value - 1100000) / 25000; + value |= MC13892_SWITCHERS0_SWxHI; + } else + value = (value - 600000) / 25000; + + mask = mc13892_regulators[id].vsel_mask | MC13892_SWITCHERS0_SWxHI; + ret = mc13xxx_reg_rmw(priv->mc13xxx, mc13892_regulators[id].vsel_reg, + mask, value << mc13892_regulators[id].vsel_shift); +err: + mc13xxx_unlock(priv->mc13xxx); + + return ret; +} + +static struct regulator_ops mc13892_sw_regulator_ops = { + .is_enabled = mc13xxx_sw_regulator_is_enabled, + .list_voltage = mc13xxx_regulator_list_voltage, + .set_voltage = mc13892_sw_regulator_set_voltage, + .get_voltage = mc13892_sw_regulator_get_voltage, +}; + +static int mc13892_vcam_set_mode(struct regulator_dev *rdev, unsigned int mode) +{ + unsigned int en_val = 0; + struct mc13xxx_regulator_priv *priv = rdev_get_drvdata(rdev); + int ret, id = rdev_get_id(rdev); + + if (mode == REGULATOR_MODE_FAST) + en_val = MC13892_REGULATORMODE1_VCAMCONFIGEN; + + mc13xxx_lock(priv->mc13xxx); + ret = mc13xxx_reg_rmw(priv->mc13xxx, mc13892_regulators[id].reg, + MC13892_REGULATORMODE1_VCAMCONFIGEN, en_val); + mc13xxx_unlock(priv->mc13xxx); + + return ret; +} + +unsigned int mc13892_vcam_get_mode(struct regulator_dev *rdev) +{ + struct mc13xxx_regulator_priv *priv = rdev_get_drvdata(rdev); + int ret, id = rdev_get_id(rdev); + unsigned int val; + + mc13xxx_lock(priv->mc13xxx); + ret = mc13xxx_reg_read(priv->mc13xxx, mc13892_regulators[id].reg, &val); + mc13xxx_unlock(priv->mc13xxx); + + if (ret) + return ret; + + if (val & MC13892_REGULATORMODE1_VCAMCONFIGEN) + return REGULATOR_MODE_FAST; + + return REGULATOR_MODE_NORMAL; +} + + +static int __devinit mc13892_regulator_probe(struct platform_device *pdev) +{ + struct mc13xxx_regulator_priv *priv; + struct mc13xxx *mc13892 = dev_get_drvdata(pdev->dev.parent); + struct mc13xxx_regulator_platform_data *pdata = + dev_get_platdata(&pdev->dev); + struct mc13xxx_regulator_init_data *init_data; + int i, ret; + u32 val; + + priv = kzalloc(sizeof(*priv) + + pdata->num_regulators * sizeof(priv->regulators[0]), + GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->mc13xxx_regulators = mc13892_regulators; + priv->mc13xxx = mc13892; + + mc13xxx_lock(mc13892); + ret = mc13xxx_reg_read(mc13892, MC13892_REVISION, &val); + if (ret) + goto err_free; + + /* enable switch auto mode */ + if ((val & 0x0000FFFF) == 0x45d0) { + ret = mc13xxx_reg_rmw(mc13892, MC13892_SWITCHERS4, + MC13892_SWITCHERS4_SW1MODE_M | + MC13892_SWITCHERS4_SW2MODE_M, + MC13892_SWITCHERS4_SW1MODE_AUTO | + MC13892_SWITCHERS4_SW2MODE_AUTO); + if (ret) + goto err_free; + + mc13xxx_reg_rmw(mc13892, MC13892_SWITCHERS5, + MC13892_SWITCHERS5_SW3MODE_M | + MC13892_SWITCHERS5_SW4MODE_M, + MC13892_SWITCHERS5_SW3MODE_AUTO | + MC13892_SWITCHERS5_SW4MODE_AUTO); + if (ret) + goto err_free; + } + mc13xxx_unlock(mc13892); + + 
mc13892_regulators[MC13892_VCAM].desc.ops->set_mode + = mc13892_vcam_set_mode; + mc13892_regulators[MC13892_VCAM].desc.ops->get_mode + = mc13892_vcam_get_mode; + for (i = 0; i < pdata->num_regulators; i++) { + init_data = &pdata->regulators[i]; + priv->regulators[i] = regulator_register( + &mc13892_regulators[init_data->id].desc, + &pdev->dev, init_data->init_data, priv); + + if (IS_ERR(priv->regulators[i])) { + dev_err(&pdev->dev, "failed to register regulator %s\n", + mc13892_regulators[i].desc.name); + ret = PTR_ERR(priv->regulators[i]); + goto err; + } + } + + platform_set_drvdata(pdev, priv); + + return 0; +err: + while (--i >= 0) + regulator_unregister(priv->regulators[i]); + +err_free: + mc13xxx_unlock(mc13892); + kfree(priv); + + return ret; +} + +static int __devexit mc13892_regulator_remove(struct platform_device *pdev) +{ + struct mc13xxx_regulator_priv *priv = platform_get_drvdata(pdev); + struct mc13xxx_regulator_platform_data *pdata = + dev_get_platdata(&pdev->dev); + int i; + + platform_set_drvdata(pdev, NULL); + + for (i = 0; i < pdata->num_regulators; i++) + regulator_unregister(priv->regulators[i]); + + kfree(priv); + return 0; +} + +static struct platform_driver mc13892_regulator_driver = { + .driver = { + .name = "mc13892-regulator", + .owner = THIS_MODULE, + }, + .remove = __devexit_p(mc13892_regulator_remove), + .probe = mc13892_regulator_probe, +}; + +static int __init mc13892_regulator_init(void) +{ + return platform_driver_register(&mc13892_regulator_driver); +} +subsys_initcall(mc13892_regulator_init); + +static void __exit mc13892_regulator_exit(void) +{ + platform_driver_unregister(&mc13892_regulator_driver); +} +module_exit(mc13892_regulator_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Yong Shen "); +MODULE_DESCRIPTION("Regulator Driver for Freescale MC13892 PMIC"); +MODULE_ALIAS("platform:mc13892-regulator"); diff --git a/include/linux/mfd/mc13892.h b/include/linux/mfd/mc13892.h new file mode 100644 index 000000000000..a00f2bec178c --- /dev/null +++ b/include/linux/mfd/mc13892.h @@ -0,0 +1,39 @@ +/* + * Copyright 2010 Yong Shen + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. + */ + +#ifndef __LINUX_MFD_MC13892_H +#define __LINUX_MFD_MC13892_H + +#include + +#define MC13892_SW1 0 +#define MC13892_SW2 1 +#define MC13892_SW3 2 +#define MC13892_SW4 3 +#define MC13892_SWBST 4 +#define MC13892_VIOHI 5 +#define MC13892_VPLL 6 +#define MC13892_VDIG 7 +#define MC13892_VSD 8 +#define MC13892_VUSB2 9 +#define MC13892_VVIDEO 10 +#define MC13892_VAUDIO 11 +#define MC13892_VCAM 12 +#define MC13892_VGEN1 13 +#define MC13892_VGEN2 14 +#define MC13892_VGEN3 15 +#define MC13892_VUSB 16 +#define MC13892_GPO1 17 +#define MC13892_GPO2 18 +#define MC13892_GPO3 19 +#define MC13892_GPO4 20 +#define MC13892_PWGT1SPI 21 +#define MC13892_PWGT2SPI 22 +#define MC13892_VCOINCELL 23 + +#endif -- cgit v1.2.3-71-gd317 From e8eef82b2c652d031bee9dff9762325672f5a1e0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 12 Dec 2010 14:36:17 +0000 Subject: regulator: Provide a selector based set_voltage_sel() operation Many regulator drivers implement voltage setting by looping through a table of possible values, normally because the set of available voltages can't be mapped onto selectors with simple calcuation. 
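For concreteness, the table walk just described, and the selector-based callbacks that the change below makes possible, look roughly like this for an imaginary driver; "foo" and foo_write_vsel() are placeholders, not code in the tree:

#include <linux/kernel.h>
#include <linux/regulator/driver.h>

static const int foo_voltages[] = { 1800000, 2500000, 2800000, 3300000 };

static int foo_write_vsel(struct regulator_dev *rdev, unsigned selector); /* hypothetical register write */

/* The existing pattern: every driver walks its own table in set_voltage(). */
static int foo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV,
			   unsigned *selector)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(foo_voltages); i++) {
		if (foo_voltages[i] >= min_uV && foo_voltages[i] <= max_uV) {
			*selector = i;
			return foo_write_vsel(rdev, i);
		}
	}

	return -EINVAL;
}

/* With set_voltage_sel() the driver only programs a selector and provides
 * list_voltage() (with regulator_desc.n_voltages set to the table size);
 * the core performs the range-to-selector walk.
 */
static int foo_list_voltage(struct regulator_dev *rdev, unsigned selector)
{
	if (selector >= ARRAY_SIZE(foo_voltages))
		return -EINVAL;

	return foo_voltages[selector];
}

static int foo_set_voltage_sel(struct regulator_dev *rdev, unsigned selector)
{
	return foo_write_vsel(rdev, selector);
}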
Factor out these loops by providing a variant of set_voltage() which takes a selector rather than a voltage range as an argument and implementing a loop through the available selectors in the core. This is not going to be suitable for use with all devices as when the regulator voltage can be mapped onto selector values with a simple calculation the linear scan through the available values will be more expensive than just doing the calculation, especially for regulators that provide fine grained voltage control. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 37 +++++++++++++++++++++++++++++++++++-- include/linux/regulator/driver.h | 3 +++ 2 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 23c5f7c80bff..a0579f069002 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1637,6 +1637,32 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev, selector); else selector = -1; + } else if (rdev->desc->ops->set_voltage_sel) { + int best_val = INT_MAX; + int i; + + selector = 0; + + /* Find the smallest voltage that falls within the specified + * range. + */ + for (i = 0; i < rdev->desc->n_voltages; i++) { + ret = rdev->desc->ops->list_voltage(rdev, i); + if (ret < 0) + continue; + + if (ret < best_val && ret >= min_uV && ret <= max_uV) { + best_val = ret; + selector = i; + } + } + + if (best_val != INT_MAX) { + ret = rdev->desc->ops->set_voltage_sel(rdev, selector); + selector = best_val; + } else { + ret = -EINVAL; + } } else { ret = -EINVAL; } @@ -1672,7 +1698,8 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) mutex_lock(&rdev->mutex); /* sanity check */ - if (!rdev->desc->ops->set_voltage) { + if (!rdev->desc->ops->set_voltage && + !rdev->desc->ops->set_voltage_sel) { ret = -EINVAL; goto out; } @@ -2256,7 +2283,7 @@ static int add_regulator_attributes(struct regulator_dev *rdev) return status; /* constraints need specific supporting methods */ - if (ops->set_voltage) { + if (ops->set_voltage || ops->set_voltage_sel) { status = device_create_file(dev, &dev_attr_min_microvolts); if (status < 0) return status; @@ -2354,12 +2381,18 @@ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, /* Only one of each should be implemented */ WARN_ON(regulator_desc->ops->get_voltage && regulator_desc->ops->get_voltage_sel); + WARN_ON(regulator_desc->ops->set_voltage && + regulator_desc->ops->set_voltage_sel); /* If we're using selectors we must implement list_voltage. */ if (regulator_desc->ops->get_voltage_sel && !regulator_desc->ops->list_voltage) { return ERR_PTR(-EINVAL); } + if (regulator_desc->ops->set_voltage_sel && + !regulator_desc->ops->list_voltage) { + return ERR_PTR(-EINVAL); + } rdev = kzalloc(sizeof(struct regulator_dev), GFP_KERNEL); if (rdev == NULL) diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index bf3e653591b9..975ae06cb634 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -42,6 +42,8 @@ enum regulator_status { * * @set_voltage: Set the voltage for the regulator within the range specified. * The driver should select the voltage closest to min_uV. + * @set_voltage_sel: Set the voltage for the regulator using the specified + * selector. * @get_voltage: Return the currently configured voltage for the regulator. 
* @get_voltage_sel: Return the currently configured voltage selector for the * regulator. @@ -83,6 +85,7 @@ struct regulator_ops { /* get/set regulator voltage */ int (*set_voltage) (struct regulator_dev *, int min_uV, int max_uV, unsigned *selector); + int (*set_voltage_sel) (struct regulator_dev *, unsigned selector); int (*get_voltage) (struct regulator_dev *); int (*get_voltage_sel) (struct regulator_dev *); -- cgit v1.2.3-71-gd317 From 606a25628187ce863b48d43ca42bc0cbe8342de9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 16 Dec 2010 15:49:36 +0000 Subject: regulator: Add API to re-apply voltage to hardware When cooperating with an external control source the regulator setup may be changed underneath the API. Currently consumers can just redo the regulator_set_voltage() to restore a previously set configuration but provide an explicit API for doing this as optimsations in the regulator_set_voltage() implementation will shortly prevent that. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 47 ++++++++++++++++++++++++++++++++++++++ include/linux/regulator/consumer.h | 1 + 2 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 3d72cc8e2f3e..a12cba32460e 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1727,6 +1727,53 @@ out: } EXPORT_SYMBOL_GPL(regulator_set_voltage); +/** + * regulator_sync_voltage - re-apply last regulator output voltage + * @regulator: regulator source + * + * Re-apply the last configured voltage. This is intended to be used + * where some external control source the consumer is cooperating with + * has caused the configured voltage to change. + */ +int regulator_sync_voltage(struct regulator *regulator) +{ + struct regulator_dev *rdev = regulator->rdev; + int ret, min_uV, max_uV; + + mutex_lock(&rdev->mutex); + + if (!rdev->desc->ops->set_voltage && + !rdev->desc->ops->set_voltage_sel) { + ret = -EINVAL; + goto out; + } + + /* This is only going to work if we've had a voltage configured. */ + if (!regulator->min_uV && !regulator->max_uV) { + ret = -EINVAL; + goto out; + } + + min_uV = regulator->min_uV; + max_uV = regulator->max_uV; + + /* This should be a paranoia check... 
*/ + ret = regulator_check_voltage(rdev, &min_uV, &max_uV); + if (ret < 0) + goto out; + + ret = regulator_check_consumers(rdev, &min_uV, &max_uV); + if (ret < 0) + goto out; + + ret = _regulator_do_set_voltage(rdev, min_uV, max_uV); + +out: + mutex_unlock(&rdev->mutex); + return ret; +} +EXPORT_SYMBOL_GPL(regulator_sync_voltage); + static int _regulator_get_voltage(struct regulator_dev *rdev) { int sel; diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index ebd747265294..7954f6bd7edb 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -154,6 +154,7 @@ int regulator_is_supported_voltage(struct regulator *regulator, int min_uV, int max_uV); int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV); int regulator_get_voltage(struct regulator *regulator); +int regulator_sync_voltage(struct regulator *regulator); int regulator_set_current_limit(struct regulator *regulator, int min_uA, int max_uA); int regulator_get_current_limit(struct regulator *regulator); -- cgit v1.2.3-71-gd317 From 1130e5b3ff4a7f3f54a48d46e9d0d81b47765bd8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 21 Dec 2010 23:49:31 +0000 Subject: regulator: Add initial per-regulator debugfs support We only expose the use and open counts to userspace, providing a tiny bit of insight into what the API is up to. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/regulator/driver.h | 8 ++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 6066a8f2d5bb..9fa20957847d 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -47,6 +48,10 @@ static LIST_HEAD(regulator_map_list); static bool has_full_constraints; static bool board_wants_dummy_regulator; +#ifdef CONFIG_DEBUG_FS +static struct dentry *debugfs_root; +#endif + /* * struct regulator_map * @@ -2404,6 +2409,23 @@ static int add_regulator_attributes(struct regulator_dev *rdev) return status; } +static void rdev_init_debugfs(struct regulator_dev *rdev) +{ +#ifdef CONFIG_DEBUG_FS + rdev->debugfs = debugfs_create_dir(rdev_get_name(rdev), debugfs_root); + if (IS_ERR(rdev->debugfs) || !rdev->debugfs) { + rdev_warn(rdev, "Failed to create debugfs directory\n"); + rdev->debugfs = NULL; + return; + } + + debugfs_create_u32("use_count", 0444, rdev->debugfs, + &rdev->use_count); + debugfs_create_u32("open_count", 0444, rdev->debugfs, + &rdev->open_count); +#endif +} + /** * regulator_register - register regulator * @regulator_desc: regulator to register @@ -2548,6 +2570,8 @@ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, } list_add(&rdev->list, ®ulator_list); + + rdev_init_debugfs(rdev); out: mutex_unlock(®ulator_list_mutex); return rdev; @@ -2580,6 +2604,9 @@ void regulator_unregister(struct regulator_dev *rdev) return; mutex_lock(®ulator_list_mutex); +#ifdef CONFIG_DEBUG_FS + debugfs_remove_recursive(rdev->debugfs); +#endif WARN_ON(rdev->open_count); unset_regulator_supplies(rdev); list_del(&rdev->list); @@ -2723,6 +2750,14 @@ static int __init regulator_init(void) ret = class_register(®ulator_class); +#ifdef CONFIG_DEBUG_FS + debugfs_root = debugfs_create_dir("regulator", NULL); + if (IS_ERR(debugfs_root) || !debugfs_root) { + pr_warn("regulator: Failed to create debugfs 
directory\n"); + debugfs_root = NULL; + } +#endif + regulator_dummy_init(); return ret; diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 975ae06cb634..b8ed16a33c47 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -175,9 +175,9 @@ struct regulator_desc { */ struct regulator_dev { struct regulator_desc *desc; - int use_count; - int open_count; int exclusive; + u32 use_count; + u32 open_count; /* lists we belong to */ struct list_head list; /* list of all regulators */ @@ -195,6 +195,10 @@ struct regulator_dev { struct regulator_dev *supply; /* for tree */ void *reg_data; /* regulator_dev data */ + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs; +#endif }; struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, -- cgit v1.2.3-71-gd317 From d247632c08c674864d438733280422ddb7130ff8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:34:59 -0500 Subject: cpuidle: delete NOP CPUIDLE_FLAG_POLL it serves no purpose Signed-off-by: Len Brown --- drivers/cpuidle/cpuidle.c | 2 +- include/linux/cpuidle.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 37e446041306..dd5f1eafd6f2 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -186,7 +186,7 @@ static void poll_idle_init(struct cpuidle_device *dev) state->exit_latency = 0; state->target_residency = 0; state->power_usage = -1; - state->flags = CPUIDLE_FLAG_POLL; + state->flags = 0; state->enter = poll_idle; } #else diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 1be416bbbb82..dee32853b0da 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -48,7 +48,6 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ #define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ -#define CPUIDLE_FLAG_POLL (0x10) /* no latency, no savings */ #define CPUIDLE_FLAG_SHALLOW (0x20) /* low latency, minimal savings */ #define CPUIDLE_FLAG_BALANCED (0x40) /* medium latency, moderate savings */ #define CPUIDLE_FLAG_DEEP (0x80) /* high latency, large savings */ -- cgit v1.2.3-71-gd317 From 642f11c53f17aee991d97d14e6922172849ef227 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:45:00 -0500 Subject: cpuidle: delete unused CPUIDLE_FLAG_SHALLOW, BALANCED, DEEP definitions Signed-off-by: Len Brown --- include/linux/cpuidle.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index dee32853b0da..c25295337382 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -48,9 +48,6 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? 
*/ #define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ -#define CPUIDLE_FLAG_SHALLOW (0x20) /* low latency, minimal savings */ -#define CPUIDLE_FLAG_BALANCED (0x40) /* medium latency, moderate savings */ -#define CPUIDLE_FLAG_DEEP (0x80) /* high latency, large savings */ #define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ #define CPUIDLE_FLAG_TLB_FLUSHED (0x200) /* tlb will be flushed */ -- cgit v1.2.3-71-gd317 From 956d033fb2eb3f8818260cdf01644bf4dc1a9e33 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:51:20 -0500 Subject: cpuidle: CPUIDLE_FLAG_TLB_FLUSHED is specific to intel_idle Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 8 ++++++++ include/linux/cpuidle.h | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 21d387132dbc..8256309deaad 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -81,6 +81,14 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state); static struct cpuidle_state *cpuidle_state_table; +/* + * Set this flag for states where the HW flushes the TLB for us + * and so we don't need cross-calls to keep it consistent. + * If this flag is set, SW flushes the TLB, so even if the + * HW doesn't do the flushing, this flag is safe to use. + */ +#define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 + /* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index c25295337382..6be722c725d5 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -49,7 +49,6 @@ struct cpuidle_state { #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ #define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ #define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ -#define CPUIDLE_FLAG_TLB_FLUSHED (0x200) /* tlb will be flushed */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) -- cgit v1.2.3-71-gd317 From 5392083748a340f68052c0b83a7ce28b10324972 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:56:15 -0500 Subject: cpuidle: CPUIDLE_FLAG_CHECK_BM is omap3_idle specific Signed-off-by: Len Brown --- arch/arm/mach-omap2/cpuidle34xx.c | 2 ++ include/linux/cpuidle.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c index 0d50b45d041c..f9be4e3078bc 100644 --- a/arch/arm/mach-omap2/cpuidle34xx.c +++ b/arch/arm/mach-omap2/cpuidle34xx.c @@ -47,6 +47,8 @@ #define OMAP3_STATE_MAX OMAP3_STATE_C7 +#define CPUIDLE_FLAG_CHECK_BM 0x10000 /* use omap3_enter_idle_bm() */ + struct omap3_processor_cx { u8 valid; u8 type; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 6be722c725d5..36719ead50e8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -47,7 +47,6 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? 
*/ -#define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ #define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) -- cgit v1.2.3-71-gd317 From f00c9e44ad1a9660fe8cd3ca15b6cd9497172eab Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Sep 2010 17:38:58 +0200 Subject: quota: Fix deadlock during path resolution As Al Viro pointed out path resolution during Q_QUOTAON calls to quotactl is prone to deadlocks. We hold s_umount semaphore for reading during the path resolution and resolution itself may need to acquire the semaphore for writing when e. g. autofs mountpoint is passed. Solve the problem by performing the resolution before we get hold of the superblock (and thus s_umount semaphore). The whole thing is complicated by the fact that some filesystems (OCFS2) ignore the path argument. So to distinguish between filesystem which want the path and which do not we introduce new .quota_on_meta callback which does not get the path. OCFS2 then uses this callback instead of old .quota_on. CC: Al Viro CC: Christoph Hellwig CC: Ted Ts'o CC: Joel Becker Signed-off-by: Jan Kara --- fs/ext3/super.c | 25 +++++++------------------ fs/ext4/super.c | 25 +++++++------------------ fs/ocfs2/super.c | 5 ++--- fs/quota/dquot.c | 18 ++---------------- fs/quota/quota.c | 41 +++++++++++++++++++++++++++-------------- fs/reiserfs/super.c | 17 ++++++----------- include/linux/quota.h | 5 ++++- include/linux/quotaops.h | 4 +--- 8 files changed, 56 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b7d0554631e4..0e0d391626be 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -755,7 +755,7 @@ static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -2885,27 +2885,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->mnt->mnt_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT3_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext3_msg(sb, KERN_WARNING, "warning: Quota file not on filesystem root. " "Journaled quota will not work."); @@ -2915,7 +2908,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(path.dentry->d_inode)) { + if (ext3_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
@@ -2923,15 +2916,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, journal_lock_updates(EXT3_SB(sb)->s_journal); err = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } /* Read data from quotafile - avoid pagecache and such because we cannot afford diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 29c80f6d8b27..0f10ccd6bfc0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1162,7 +1162,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -4566,27 +4566,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->mnt->mnt_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); @@ -4597,7 +4590,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * all updates to the file when we bypass pagecache... */ if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path.dentry->d_inode)) { + ext4_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
@@ -4605,15 +4598,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } static int ext4_quota_off(struct super_block *sb, int type) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 17ff46fa8a10..31c3ffd2f8d0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -993,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) } /* Handle quota on quotactl */ -static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, - char *path) +static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) { unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; @@ -1013,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type) } static const struct quotactl_ops ocfs2_quotactl_ops = { - .quota_on = ocfs2_quota_on, + .quota_on_meta = ocfs2_quota_on, .quota_off = ocfs2_quota_off, .quota_sync = dquot_quota_sync, .get_info = dquot_get_dqinfo, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 84becd3e4772..a2a622e079f0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2189,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type) } EXPORT_SYMBOL(dquot_resume); -int dquot_quota_on_path(struct super_block *sb, int type, int format_id, - struct path *path) +int dquot_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) { int error = security_quota_on(path->dentry); if (error) @@ -2204,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id, DQUOT_LIMITS_ENABLED); return error; } -EXPORT_SYMBOL(dquot_quota_on_path); - -int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name) -{ - struct path path; - int error; - - error = kern_path(name, LOOKUP_FOLLOW, &path); - if (!error) { - error = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - } - return error; -} EXPORT_SYMBOL(dquot_quota_on); /* diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b299961e1edb..b34bdb25490c 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -64,18 +64,15 @@ static int quota_sync_all(int type) } static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr) + struct path *path) { - char *pathname; - int ret = -ENOSYS; - - pathname = getname(addr); - if (IS_ERR(pathname)) - return PTR_ERR(pathname); - if (sb->s_qcop->quota_on) - ret = sb->s_qcop->quota_on(sb, type, id, pathname); - putname(pathname); - return ret; + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + return -ENOSYS; + if (sb->s_qcop->quota_on_meta) + return sb->s_qcop->quota_on_meta(sb, type, id); + if (IS_ERR(path)) + return PTR_ERR(path); + return sb->s_qcop->quota_on(sb, type, id, path); } static int quota_getfmt(struct super_block *sb, int type, void __user *addr) @@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr) + void __user *addr, struct path *path) { int ret; @@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int 
type, int cmd, qid_t id, switch (cmd) { case Q_QUOTAON: - return quota_quotaon(sb, type, cmd, id, addr); + return quota_quotaon(sb, type, cmd, id, path); case Q_QUOTAOFF: if (!sb->s_qcop->quota_off) return -ENOSYS; @@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, { uint cmds, type; struct super_block *sb = NULL; + struct path path, *pathp = NULL; int ret; cmds = cmd >> SUBCMDSHIFT; @@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, return -ENODEV; } + /* + * Path for quotaon has to be resolved before grabbing superblock + * because that gets s_umount sem which is also possibly needed by path + * resolution (think about autofs) and thus deadlocks could arise. + */ + if (cmds == Q_QUOTAON) { + ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); + if (ret) + pathp = ERR_PTR(ret); + else + pathp = &path; + } + sb = quotactl_block(special); if (IS_ERR(sb)) return PTR_ERR(sb); - ret = do_quotactl(sb, type, cmds, id, addr); + ret = do_quotactl(sb, type, cmds, id, addr, pathp); drop_super(sb); + if (pathp && !IS_ERR(pathp)) + path_put(pathp); return ret; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2575682a9ead..0aab04f46827 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -632,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, char *); +static int reiserfs_quota_on(struct super_block *, int, int, struct path *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -2048,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; struct inode *inode; struct reiserfs_transaction_handle th; if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { + if (path->mnt->mnt_sb != sb) { err = -EXDEV; goto out; } - inode = path.dentry->d_inode; + inode = path->dentry->d_inode; /* We must not pack tails for quota files on reiserfs for quota IO to work */ if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { err = reiserfs_unpack(inode, NULL); @@ -2082,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, /* Journaling quota? */ if (REISERFS_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) reiserfs_warning(sb, "super-6521", "Quota file not on filesystem root. 
" "Journalled quota will not work."); @@ -2101,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = dquot_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on(sb, type, format_id, path); out: - path_put(&path); return err; } diff --git a/include/linux/quota.h b/include/linux/quota.h index 94c1f03b50eb..9a85412e0db6 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -322,9 +322,12 @@ struct dquot_operations { qsize_t *(*get_reserved_space) (struct inode *); }; +struct path; + /* Operations handling requests from userspace */ struct quotactl_ops { - int (*quota_on)(struct super_block *, int, int, char *); + int (*quota_on)(struct super_block *, int, int, struct path *); + int (*quota_on_meta)(struct super_block *, int, int); int (*quota_off)(struct super_block *, int); int (*quota_sync)(struct super_block *, int, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 223b14cd129c..eb354f6f26b3 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -76,11 +76,9 @@ int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); -int dquot_quota_on(struct super_block *sb, int type, int format_id, - char *path); int dquot_enable(struct inode *inode, int type, int format_id, unsigned int flags); -int dquot_quota_on_path(struct super_block *sb, int type, int format_id, +int dquot_quota_on(struct super_block *sb, int type, int format_id, struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); -- cgit v1.2.3-71-gd317 From 6fed05c9c9812b5882bc708f4da4fa8d5df2875c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Jan 2011 22:03:20 +0100 Subject: ACPI / PM: Fix build problems for !CONFIG_ACPI related to NVS rework The recent rework of the NVS saving/restoring code introduced two build issues for !CONFIG_ACPI, a warning in drivers/acpi/internal.h and an error in arch/x86/kernel/e820.c. Fix them by providing suitable static inline definitions of the relevant functions. Signed-off-by: Rafael J. 
Wysocki Acked-by: Randy Dunlap Signed-off-by: Len Brown --- drivers/acpi/internal.h | 2 +- include/linux/acpi.h | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 7c23b76e8eca..6d69bbedf97a 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -90,7 +90,7 @@ void suspend_nvs_restore(void); static inline int acpi_sleep_proc_init(void) { return 0; } static inline int suspend_nvs_alloc(void) { return 0; } static inline void suspend_nvs_free(void) {} -static inline int suspend_nvs_save(void) {} +static inline int suspend_nvs_save(void) { return 0; } static inline void suspend_nvs_restore(void) {} #endif diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fa7ed6a983d0..eb176bb1b15b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -254,15 +254,6 @@ void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_ACPI_SLEEP -int suspend_nvs_register(unsigned long start, unsigned long size); -#else -static inline int suspend_nvs_register(unsigned long a, unsigned long b) -{ - return 0; -} -#endif - struct acpi_osc_context { char *uuid_str; /* uuid string */ int rev; @@ -361,4 +352,14 @@ static inline int acpi_table_parse(char *id, return -1; } #endif /* !CONFIG_ACPI */ + +#ifdef CONFIG_ACPI_SLEEP +int suspend_nvs_register(unsigned long start, unsigned long size); +#else +static inline int suspend_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +#endif + #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3-71-gd317 From da995a8aee044bc5d0847e19e351cd48a2cb8bcc Mon Sep 17 00:00:00 2001 From: Aleksey Senin Date: Thu, 2 Dec 2010 11:44:49 +0000 Subject: IB/mlx4: Handle protocol field in multicast table The newest device firmware stores the IB vs. Ethernet protocol in two bits of the members_count field of the multicast group table (0: InfiniBand, 1: Ethernet). When changing the QP members count for a multicast group, it is important not to reset this information. When calling multicast attach for the first time, the protocol type should be specified. In this patch we always set it to IB, but in the future we will handle Ethernet too. When looking for a QP, the protocol type should be checked too.
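For illustration only, not part of this patch: the encoding described above keeps the actual member count in the low 24 bits of the 32-bit members_count word and the protocol selector in the top two bits. A minimal sketch with hypothetical helper names follows; the patch itself open-codes these shifts and masks on values converted with be32_to_cpu()/cpu_to_be32().

#include <linux/types.h>	/* u32; assumed for this sketch */

#define MGM_MEMBERS_MASK	0xffffff	/* low 24 bits: member count */
#define MGM_PROT_SHIFT		30		/* top 2 bits: 0 = IB, 1 = Ethernet */

/* Pack a member count and protocol selector into one CPU-endian word. */
static inline u32 mgm_pack_members(u32 count, u32 protocol)
{
	return (count & MGM_MEMBERS_MASK) | (protocol << MGM_PROT_SHIFT);
}

/* Extract the member count from a CPU-endian members_count word. */
static inline u32 mgm_members(u32 word)
{
	return word & MGM_MEMBERS_MASK;
}

/* Extract the protocol selector from a CPU-endian members_count word. */
static inline u32 mgm_protocol(u32 word)
{
	return word >> MGM_PROT_SHIFT;
}

This mirrors the checks the patch adds in find_mgm(), where be32_to_cpu(mgm->members_count) >> 30 is compared against the requested protocol, and the updates in mlx4_multicast_attach()/detach(), which write cpu_to_be32(members_count | (u32) protocol << 30).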
Signed-off-by: Aleksey Senin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 9 +++++---- drivers/net/mlx4/mcg.c | 23 +++++++++++++---------- include/linux/mlx4/device.h | 10 ++++++++-- include/linux/mlx4/driver.h | 6 +----- 4 files changed, 27 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d68d849ab866..c7a6213c6996 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -623,8 +623,9 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); - err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, !!(mqp->flags & - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)); + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, + !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + MLX4_PROTOCOL_IB); if (err) return err; @@ -635,7 +636,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) return 0; err_add: - mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw); + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROTOCOL_IB); return err; } @@ -665,7 +666,7 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_gid_entry *ge; err = mlx4_multicast_detach(mdev->dev, - &mqp->mqp, gid->raw); + &mqp->mqp, gid->raw, MLX4_PROTOCOL_IB); if (err) return err; diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index c4f88b7ef7b6..79cf42db2ea9 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -95,7 +95,8 @@ static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox * entry in hash chain and *mgm holds end of hash chain. 
*/ static int find_mgm(struct mlx4_dev *dev, - u8 *gid, struct mlx4_cmd_mailbox *mgm_mailbox, + u8 *gid, enum mlx4_protocol protocol, + struct mlx4_cmd_mailbox *mgm_mailbox, u16 *hash, int *prev, int *index) { struct mlx4_cmd_mailbox *mailbox; @@ -134,7 +135,8 @@ static int find_mgm(struct mlx4_dev *dev, return err; } - if (!memcmp(mgm->gid, gid, 16)) + if (!memcmp(mgm->gid, gid, 16) && + be32_to_cpu(mgm->members_count) >> 30 == protocol) return err; *prev = *index; @@ -146,7 +148,7 @@ static int find_mgm(struct mlx4_dev *dev, } int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int block_mcast_loopback) + int block_mcast_loopback, enum mlx4_protocol protocol) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; @@ -165,7 +167,7 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mutex_lock(&priv->mcg_table.mutex); - err = find_mgm(dev, gid, mailbox, &hash, &prev, &index); + err = find_mgm(dev, gid, protocol, mailbox, &hash, &prev, &index); if (err) goto out; @@ -187,7 +189,7 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], memcpy(mgm->gid, gid, 16); } - members_count = be32_to_cpu(mgm->members_count); + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; if (members_count == MLX4_QP_PER_MGM) { mlx4_err(dev, "MGM at index %x is full.\n", index); err = -ENOMEM; @@ -207,7 +209,7 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], else mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); - mgm->members_count = cpu_to_be32(members_count); + mgm->members_count = cpu_to_be32(members_count | (u32) protocol << 30); err = mlx4_WRITE_MCG(dev, index, mailbox); if (err) @@ -242,7 +244,8 @@ out: } EXPORT_SYMBOL_GPL(mlx4_multicast_attach); -int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol protocol) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; @@ -260,7 +263,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) mutex_lock(&priv->mcg_table.mutex); - err = find_mgm(dev, gid, mailbox, &hash, &prev, &index); + err = find_mgm(dev, gid, protocol, mailbox, &hash, &prev, &index); if (err) goto out; @@ -270,7 +273,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) goto out; } - members_count = be32_to_cpu(mgm->members_count); + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; for (loc = -1, i = 0; i < members_count; ++i) if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) loc = i; @@ -282,7 +285,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) } - mgm->members_count = cpu_to_be32(--members_count); + mgm->members_count = cpu_to_be32(--members_count | (u32) protocol << 30); mgm->qp[loc] = mgm->qp[i - 1]; mgm->qp[i - 1] = 0; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a7b15bc7648e..049214642036 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -144,6 +144,11 @@ enum { MLX4_STAT_RATE_OFFSET = 5 }; +enum mlx4_protocol { + MLX4_PROTOCOL_IB, + MLX4_PROTOCOL_EN, +}; + enum { MLX4_MTT_FLAG_PRESENT = 1 }; @@ -500,8 +505,9 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int 
block_mcast_loopback); -int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); + int block_mcast_loopback, enum mlx4_protocol protocol); +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol protocol); int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index); void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index); diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index f407cd4bfb34..e1eebf78caba 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -34,6 +34,7 @@ #define MLX4_DRIVER_H #include +#include struct mlx4_dev; @@ -44,11 +45,6 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_PORT_REINIT, }; -enum mlx4_protocol { - MLX4_PROTOCOL_IB, - MLX4_PROTOCOL_EN, -}; - struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); -- cgit v1.2.3-71-gd317 From 6c0f3af72cb1622a66962a1180c36ef8c41be8e2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Nov 2010 11:14:34 -0800 Subject: ceph: add dir_layout to inode Add a ceph_dir_layout to the inode, and calculate dentry hash values based on the parent directory's specified dir_hash function. This is needed because the old default Linux dcache hash function is extremely week and leads to a poor distribution of files among dir fragments. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 20 ++++++++++++++++++++ fs/ceph/export.c | 2 +- fs/ceph/inode.c | 2 ++ fs/ceph/super.h | 2 ++ include/linux/ceph/ceph_fs.h | 16 +++++++++++++--- net/ceph/ceph_hash.c | 3 +++ 6 files changed, 41 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d902948a90d8..562f9884a4d9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1216,6 +1216,26 @@ void ceph_dentry_lru_del(struct dentry *dn) } } +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. 
+ */ +unsigned ceph_dentry_hash(struct dentry *dn) +{ + struct inode *dir = dn->d_parent->d_inode; + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, .readdir = ceph_readdir, diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 2297d9426992..e41056174bf8 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = parent->d_name.hash; + cfh->parent_name_hash = ceph_dentry_hash(parent); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bf1286588f26..045283ce4413 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_release_count = 0; ci->i_symlink = NULL; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7f01728a4657..6e0826695112 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -239,6 +239,7 @@ struct ceph_inode_info { unsigned i_ceph_flags; unsigned long i_release_count; + struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; char *i_symlink; @@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct dentry *dn); /* * our d_ops vary depending on whether the inode is live, diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c3c74aef289d..09dcc0c2ffd5 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -43,6 +43,10 @@ #define CEPH_FEATURE_NOSRCADDR (1<<1) #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) #define CEPH_FEATURE_FLOCK (1<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) +#define CEPH_FEATURE_MONNAMES (1<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* @@ -55,10 +59,10 @@ struct ceph_file_layout { __le32 fl_stripe_count; /* over this many objects */ __le32 fl_object_size; /* until objects are this big, then move to new objects */ - __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ + __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */ /* pg -> disk layout */ - __le32 fl_object_stripe_unit; /* for per-object parity, if any */ + __le32 fl_object_stripe_unit; /* UNUSED. 
for per-object parity, if any */ /* object -> pg layout */ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ @@ -69,6 +73,12 @@ struct ceph_file_layout { int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +struct ceph_dir_layout { + __u8 dl_dir_hash; /* see ceph_hash.h for ids */ + __u8 dl_unused1; + __u16 dl_unused2; + __u32 dl_unused3; +} __attribute__ ((packed)); /* crypto algorithms */ #define CEPH_CRYPTO_NONE 0x0 @@ -457,7 +467,7 @@ struct ceph_mds_reply_inode { struct ceph_timespec rctime; struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ } __attribute__ ((packed)); -/* followed by frag array, then symlink string, then xattr blob */ +/* followed by frag array, symlink string, dir layout, xattr blob */ /* reply_lease follows dname, and reply_inode */ struct ceph_mds_reply_lease { diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 815ef8826796..0a1b53bce76d 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -1,5 +1,6 @@ #include +#include /* * Robert Jenkin's hash function. @@ -104,6 +105,7 @@ unsigned ceph_str_hash(int type, const char *s, unsigned len) return -1; } } +EXPORT_SYMBOL(ceph_str_hash); const char *ceph_str_hash_name(int type) { @@ -116,3 +118,4 @@ const char *ceph_str_hash_name(int type) return "unknown"; } } +EXPORT_SYMBOL(ceph_str_hash_name); -- cgit v1.2.3-71-gd317 From f363e45fd1184219b472ea549cb7e192e24ef4d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jan 2011 14:49:46 +0100 Subject: net/ceph: make ceph_msgr_wq non-reentrant ceph messenger code does a rather complex dancing around multithread workqueue to make sure the same work item isn't executed concurrently on different CPUs. This restriction can be provided by workqueue with WQ_NON_REENTRANT. Make ceph_msgr_wq non-reentrant workqueue with the default concurrency level and remove the QUEUED/BUSY logic. * This removes backoff handling in con_work() but it couldn't reliably block execution of con_work() to begin with - queue_con() can be called after the work started but before BUSY is set. It seems that it was an optimization for a rather cold path and can be safely removed. * The number of concurrent work items is bound by the number of connections and connetions are independent from each other. With the default concurrency level, different connections will be executed independently. Signed-off-by: Tejun Heo Cc: Sage Weil Cc: ceph-devel@vger.kernel.org Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 5 ----- net/ceph/messenger.c | 46 ++---------------------------------------- 2 files changed, 2 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index a108b425fee2..c3011beac30d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -110,17 +110,12 @@ struct ceph_msg_pos { /* * ceph_connection state bit flags - * - * QUEUED and BUSY are used together to ensure that only a single - * thread is currently opening, reading or writing data to the socket. */ #define LOSSYTX 0 /* we can close channel or drop messages on errors */ #define CONNECTING 1 #define NEGOTIATING 2 #define KEEPALIVE_PENDING 3 #define WRITE_PENDING 4 /* we have data ready to send */ -#define QUEUED 5 /* there is work queued on this connection */ -#define BUSY 6 /* work is being done */ #define STANDBY 8 /* no outgoing messages, socket closed. we keep * the ceph_connection around to maintain shared * state with the peer. 
*/ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b6ff4a1519ab..dff633d62e5b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -96,7 +96,7 @@ struct workqueue_struct *ceph_msgr_wq; int ceph_msgr_init(void) { - ceph_msgr_wq = create_workqueue("ceph-msgr"); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (!ceph_msgr_wq) { pr_err("msgr_init failed to create workqueue\n"); return -ENOMEM; @@ -1920,20 +1920,6 @@ bad_tag: /* * Atomically queue work on a connection. Bump @con reference to * avoid races with connection teardown. - * - * There is some trickery going on with QUEUED and BUSY because we - * only want a _single_ thread operating on each connection at any - * point in time, but we want to use all available CPUs. - * - * The worker thread only proceeds if it can atomically set BUSY. It - * clears QUEUED and does it's thing. When it thinks it's done, it - * clears BUSY, then rechecks QUEUED.. if it's set again, it loops - * (tries again to set BUSY). - * - * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we - * try to queue work. If that fails (work is already queued, or BUSY) - * we give up (work also already being done or is queued) but leave QUEUED - * set so that the worker thread will loop if necessary. */ static void queue_con(struct ceph_connection *con) { @@ -1948,11 +1934,7 @@ static void queue_con(struct ceph_connection *con) return; } - set_bit(QUEUED, &con->state); - if (test_bit(BUSY, &con->state)) { - dout("queue_con %p - already BUSY\n", con); - con->ops->put(con); - } else if (!queue_work(ceph_msgr_wq, &con->work.work)) { + if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) { dout("queue_con %p - already queued\n", con); con->ops->put(con); } else { @@ -1967,15 +1949,6 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); - int backoff = 0; - -more: - if (test_and_set_bit(BUSY, &con->state) != 0) { - dout("con_work %p BUSY already set\n", con); - goto out; - } - dout("con_work %p start, clearing QUEUED\n", con); - clear_bit(QUEUED, &con->state); mutex_lock(&con->mutex); @@ -1994,28 +1967,13 @@ more: try_read(con) < 0 || try_write(con) < 0) { mutex_unlock(&con->mutex); - backoff = 1; ceph_fault(con); /* error/fault path */ goto done_unlocked; } done: mutex_unlock(&con->mutex); - done_unlocked: - clear_bit(BUSY, &con->state); - dout("con->state=%lu\n", con->state); - if (test_bit(QUEUED, &con->state)) { - if (!backoff || test_bit(OPENING, &con->state)) { - dout("con_work %p QUEUED reset, looping\n", con); - goto more; - } - dout("con_work %p QUEUED reset, but just faulted\n", con); - clear_bit(QUEUED, &con->state); - } - dout("con_work %p done\n", con); - -out: con->ops->put(con); } -- cgit v1.2.3-71-gd317 From c8aebb0c9f8c7471643d5f8ba68328de8013005f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Dec 2010 10:22:30 -0500 Subject: per-superblock default ->d_op Signed-off-by: Al Viro --- fs/dcache.c | 4 ++++ include/linux/fs.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 5699d4c027cb..5ec58267b5bb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1320,6 +1320,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) __dget_dlock(parent); dentry->d_parent = parent; dentry->d_sb = parent->d_sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); list_add(&dentry->d_u.d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); } @@ -1335,6 +1336,7 
@@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) struct dentry *dentry = d_alloc(NULL, name); if (dentry) { dentry->d_sb = sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); dentry->d_parent = dentry; dentry->d_flags |= DCACHE_DISCONNECTED; } @@ -1507,6 +1509,7 @@ struct dentry * d_alloc_root(struct inode * root_inode) res = d_alloc(NULL, &name); if (res) { res->d_sb = root_inode->i_sb; + d_set_d_op(res, res->d_sb->s_d_op); res->d_parent = res; d_instantiate(res, root_inode); } @@ -1567,6 +1570,7 @@ struct dentry *d_obtain_alias(struct inode *inode) /* attach a disconnected dentry */ spin_lock(&tmp->d_lock); tmp->d_sb = inode->i_sb; + d_set_d_op(tmp, tmp->d_sb->s_d_op); tmp->d_inode = inode; tmp->d_flags |= DCACHE_DISCONNECTED; list_add(&tmp->d_alias, &inode->i_dentry); diff --git a/include/linux/fs.h b/include/linux/fs.h index f84d9928bdb1..3e4c27486e74 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1423,6 +1423,7 @@ struct super_block { * generic_show_options() */ char __rcu *s_options; + const struct dentry_operations *s_d_op; /* default d_op for dentries */ }; extern struct timespec current_fs_time(struct super_block *sb); -- cgit v1.2.3-71-gd317 From 9501e4c48e1561443e1a16c2c9917ec6c63118a4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jan 2011 16:25:02 -0500 Subject: switch coda Coda ->d_revalidate() actually checks for root, ->d_delete() is irrelevant. So we can use the same d_op for all coda dentries Signed-off-by: Al Viro --- fs/coda/dir.c | 4 +--- fs/coda/inode.c | 1 + include/linux/coda_linux.h | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 29badd91360f..9df71f0eb218 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -61,7 +61,7 @@ static int coda_return_EIO(void) } #define CODA_EIO_ERROR ((void *) (coda_return_EIO)) -static const struct dentry_operations coda_dentry_operations = +const struct dentry_operations coda_dentry_operations = { .d_revalidate = coda_dentry_revalidate, .d_delete = coda_dentry_delete, @@ -126,8 +126,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc return ERR_PTR(error); exit: - d_set_d_op(entry, &coda_dentry_operations); - if (inode && (type & CODA_NOCACHE)) coda_flag_inode(inode, C_VATTR | C_PURGE); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 50dc7d189f56..bd7fde2721a8 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -193,6 +193,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; + sb->s_d_op = &coda_dentry_operations; sb->s_bdi = &vc->bdi; /* get root fid from Venus: this needs the root inode */ diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index 4ccc59c1ea82..490c56b40864 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -27,6 +27,8 @@ extern const struct inode_operations coda_dir_inode_operations; extern const struct inode_operations coda_file_inode_operations; extern const struct inode_operations coda_ioctl_inode_operations; +extern const struct dentry_operations coda_dentry_operations; + extern const struct address_space_operations coda_file_aops; extern const struct address_space_operations coda_symlink_aops; -- cgit v1.2.3-71-gd317 From 31a203df9c109480fc6d48ba0a68763e89199acb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jan 2011 16:36:09 -0500 Subject: take coda-private headers out of 
include/linux Signed-off-by: Al Viro --- Documentation/magic-number.txt | 2 +- fs/coda/cache.c | 5 +- fs/coda/cnode.c | 3 +- fs/coda/coda_cache.h | 22 +++++++++ fs/coda/coda_fs_i.h | 58 +++++++++++++++++++++++ fs/coda/coda_linux.c | 3 +- fs/coda/coda_linux.h | 101 +++++++++++++++++++++++++++++++++++++++++ fs/coda/dir.c | 5 +- fs/coda/file.c | 3 +- fs/coda/inode.c | 5 +- fs/coda/pioctl.c | 4 +- fs/coda/psdev.c | 4 +- fs/coda/symlink.c | 4 +- fs/coda/upcall.c | 5 +- include/linux/coda_cache.h | 22 --------- include/linux/coda_fs_i.h | 58 ----------------------- include/linux/coda_linux.h | 101 ----------------------------------------- 17 files changed, 199 insertions(+), 206 deletions(-) create mode 100644 fs/coda/coda_cache.h create mode 100644 fs/coda/coda_fs_i.h create mode 100644 fs/coda/coda_linux.h delete mode 100644 include/linux/coda_cache.h delete mode 100644 include/linux/coda_fs_i.h delete mode 100644 include/linux/coda_linux.h (limited to 'include/linux') diff --git a/Documentation/magic-number.txt b/Documentation/magic-number.txt index 505f19607542..4b12abcb2ad3 100644 --- a/Documentation/magic-number.txt +++ b/Documentation/magic-number.txt @@ -150,7 +150,7 @@ NBD_REPLY_MAGIC 0x96744668 nbd_reply include/linux/nbd.h STL_BOARDMAGIC 0xa2267f52 stlbrd include/linux/stallion.h ENI155_MAGIC 0xa54b872d midway_eprom drivers/atm/eni.h SCI_MAGIC 0xbabeface gs_port drivers/char/sh-sci.h -CODA_MAGIC 0xC0DAC0DA coda_file_info include/linux/coda_fs_i.h +CODA_MAGIC 0xC0DAC0DA coda_file_info fs/coda/coda_fs_i.h DPMEM_MAGIC 0xc0ffee11 gdt_pci_sram drivers/scsi/gdth.h STLI_PORTMAGIC 0xe671c7a1 stliport include/linux/istallion.h YAM_MAGIC 0xF10A7654 yam_port drivers/net/hamradio/yam.c diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 5525e1c660fd..690157876184 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -20,10 +20,9 @@ #include #include -#include #include -#include -#include +#include "coda_linux.h" +#include "coda_cache.h" static atomic_t permission_epoch = ATOMIC_INIT(0); diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 602240569c89..6475877b0763 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -7,9 +7,8 @@ #include #include -#include -#include #include +#include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) { diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h new file mode 100644 index 000000000000..c910b5eb1ceb --- /dev/null +++ b/fs/coda/coda_cache.h @@ -0,0 +1,22 @@ +/* Coda filesystem -- Linux Minicache + * + * Copyright (C) 1989 - 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. 
Contact Peter Braam + * + */ + +#ifndef _CFSNC_HEADER_ +#define _CFSNC_HEADER_ + +/* credential cache */ +void coda_cache_enter(struct inode *inode, int mask); +void coda_cache_clear_inode(struct inode *); +void coda_cache_clear_all(struct super_block *sb); +int coda_cache_check(struct inode *inode, int mask); + +/* for downcalls and attributes and lookups */ +void coda_flag_inode_children(struct inode *inode, int flag); + +#endif /* _CFSNC_HEADER_ */ diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h new file mode 100644 index 000000000000..e35071b1de0e --- /dev/null +++ b/fs/coda/coda_fs_i.h @@ -0,0 +1,58 @@ +/* + * coda_fs_i.h + * + * Copyright (C) 1998 Carnegie Mellon University + * + */ + +#ifndef _LINUX_CODA_FS_I +#define _LINUX_CODA_FS_I + +#include +#include +#include +#include + +/* + * coda fs inode data + * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and + * c_cached_perm. + * vfs_inode is set only when the inode is created and never changes. + * c_fid is set when the inode is created and should be considered immutable. + */ +struct coda_inode_info { + struct CodaFid c_fid; /* Coda identifier */ + u_short c_flags; /* flags (see below) */ + unsigned int c_mapcount; /* nr of times this inode is mapped */ + unsigned int c_cached_epoch; /* epoch for cached permissions */ + vuid_t c_uid; /* fsuid for cached permissions */ + unsigned int c_cached_perm; /* cached access permissions */ + spinlock_t c_lock; + struct inode vfs_inode; +}; + +/* + * coda fs file private data + */ +#define CODA_MAGIC 0xC0DAC0DA +struct coda_file_info { + int cfi_magic; /* magic number */ + struct file *cfi_container; /* container file for this cnode */ + unsigned int cfi_mapcount; /* nr of times this file is mapped */ +}; + +#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) + +/* flags */ +#define C_VATTR 0x1 /* Validity of vattr in inode */ +#define C_FLUSH 0x2 /* used after a flush */ +#define C_DYING 0x4 /* from venus (which died) */ +#define C_PURGE 0x8 + +int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); +int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); +void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); + +#endif diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index bf4a3fd3c8e3..2bdbcc11b373 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -17,9 +17,8 @@ #include #include -#include #include -#include +#include "coda_linux.h" /* initialize the debugging variables */ int coda_fake_statfs; diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h new file mode 100644 index 000000000000..9b0c5323890b --- /dev/null +++ b/fs/coda/coda_linux.h @@ -0,0 +1,101 @@ +/* + * Coda File System, Linux Kernel module + * + * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University + * Linux modifications (C) 1996, Peter J. Braam + * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. 
+ */ + +#ifndef _LINUX_CODA_FS +#define _LINUX_CODA_FS + +#include +#include +#include +#include +#include +#include +#include +#include +#include "coda_fs_i.h" + +/* operations */ +extern const struct inode_operations coda_dir_inode_operations; +extern const struct inode_operations coda_file_inode_operations; +extern const struct inode_operations coda_ioctl_inode_operations; + +extern const struct dentry_operations coda_dentry_operations; + +extern const struct address_space_operations coda_file_aops; +extern const struct address_space_operations coda_symlink_aops; + +extern const struct file_operations coda_dir_operations; +extern const struct file_operations coda_file_operations; +extern const struct file_operations coda_ioctl_operations; + +/* operations shared over more than one file */ +int coda_open(struct inode *i, struct file *f); +int coda_release(struct inode *i, struct file *f); +int coda_permission(struct inode *inode, int mask, unsigned int flags); +int coda_revalidate_inode(struct dentry *); +int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_setattr(struct dentry *, struct iattr *); + +/* this file: heloers */ +char *coda_f2s(struct CodaFid *f); +int coda_isroot(struct inode *i); +int coda_iscontrol(const char *name, size_t length); + +void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); +void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); +unsigned short coda_flags_to_cflags(unsigned short); + +/* sysctl.h */ +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#define CODA_ALLOC(ptr, cast, size) do { \ + if (size < PAGE_SIZE) \ + ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + else \ + ptr = (cast)vmalloc((unsigned long) size); \ + if (!ptr) \ + printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ + else memset( ptr, 0, size ); \ +} while (0) + + +#define CODA_FREE(ptr,size) \ + do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) + +/* inode to cnode access functions */ + +static inline struct coda_inode_info *ITOC(struct inode *inode) +{ + return list_entry(inode, struct coda_inode_info, vfs_inode); +} + +static __inline__ struct CodaFid *coda_i2f(struct inode *inode) +{ + return &(ITOC(inode)->c_fid); +} + +static __inline__ char *coda_i2s(struct inode *inode) +{ + return coda_f2s(&(ITOC(inode)->c_fid)); +} + +/* this will not zap the inode away */ +static __inline__ void coda_flag_inode(struct inode *inode, int flag) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_flags |= flag; + spin_unlock(&cii->c_lock); +} + +#endif diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 9df71f0eb218..2b8dae4d121e 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -23,10 +23,9 @@ #include #include -#include #include -#include -#include +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" diff --git a/fs/coda/file.c b/fs/coda/file.c index c8b50ba4366a..0433057be330 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -21,10 +21,9 @@ #include #include -#include -#include #include +#include "coda_linux.h" #include "coda_int.h" static ssize_t diff --git a/fs/coda/inode.c b/fs/coda/inode.c index bd7fde2721a8..261d86f9caec 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -28,10 +28,9 @@ #include #include -#include #include -#include -#include +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 741f0bd03918..6cbb3afb36dc 100644 --- a/fs/coda/pioctl.c +++ 
b/fs/coda/pioctl.c @@ -19,10 +19,10 @@ #include #include -#include -#include #include +#include "coda_linux.h" + /* pioctl ops */ static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); static long coda_pioctl(struct file *filp, unsigned int cmd, diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 62647a8595e4..8f616e0e252c 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -43,10 +43,10 @@ #include #include -#include -#include #include +#include "coda_linux.h" + #include "coda_int.h" /* statistics */ diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index af78f007a2b0..ab94ef63caef 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -16,9 +16,9 @@ #include #include -#include #include -#include + +#include "coda_linux.h" static int coda_symlink_filler(struct file *file, struct page *page) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index c3563cab9758..9727e0c52579 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -33,10 +33,9 @@ #include #include -#include #include -#include -#include +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" diff --git a/include/linux/coda_cache.h b/include/linux/coda_cache.h deleted file mode 100644 index c910b5eb1ceb..000000000000 --- a/include/linux/coda_cache.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Coda filesystem -- Linux Minicache - * - * Copyright (C) 1989 - 1997 Carnegie Mellon University - * - * Carnegie Mellon University encourages users of this software to - * contribute improvements to the Coda project. Contact Peter Braam - * - */ - -#ifndef _CFSNC_HEADER_ -#define _CFSNC_HEADER_ - -/* credential cache */ -void coda_cache_enter(struct inode *inode, int mask); -void coda_cache_clear_inode(struct inode *); -void coda_cache_clear_all(struct super_block *sb); -int coda_cache_check(struct inode *inode, int mask); - -/* for downcalls and attributes and lookups */ -void coda_flag_inode_children(struct inode *inode, int flag); - -#endif /* _CFSNC_HEADER_ */ diff --git a/include/linux/coda_fs_i.h b/include/linux/coda_fs_i.h deleted file mode 100644 index e35071b1de0e..000000000000 --- a/include/linux/coda_fs_i.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * coda_fs_i.h - * - * Copyright (C) 1998 Carnegie Mellon University - * - */ - -#ifndef _LINUX_CODA_FS_I -#define _LINUX_CODA_FS_I - -#include -#include -#include -#include - -/* - * coda fs inode data - * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and - * c_cached_perm. - * vfs_inode is set only when the inode is created and never changes. - * c_fid is set when the inode is created and should be considered immutable. 
- */ -struct coda_inode_info { - struct CodaFid c_fid; /* Coda identifier */ - u_short c_flags; /* flags (see below) */ - unsigned int c_mapcount; /* nr of times this inode is mapped */ - unsigned int c_cached_epoch; /* epoch for cached permissions */ - vuid_t c_uid; /* fsuid for cached permissions */ - unsigned int c_cached_perm; /* cached access permissions */ - spinlock_t c_lock; - struct inode vfs_inode; -}; - -/* - * coda fs file private data - */ -#define CODA_MAGIC 0xC0DAC0DA -struct coda_file_info { - int cfi_magic; /* magic number */ - struct file *cfi_container; /* container file for this cnode */ - unsigned int cfi_mapcount; /* nr of times this file is mapped */ -}; - -#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) - -/* flags */ -#define C_VATTR 0x1 /* Validity of vattr in inode */ -#define C_FLUSH 0x2 /* used after a flush */ -#define C_DYING 0x4 /* from venus (which died) */ -#define C_PURGE 0x8 - -int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); -struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); -int coda_cnode_makectl(struct inode **inode, struct super_block *sb); -struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); -void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); - -#endif diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h deleted file mode 100644 index 490c56b40864..000000000000 --- a/include/linux/coda_linux.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Coda File System, Linux Kernel module - * - * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University - * Linux modifications (C) 1996, Peter J. Braam - * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University - * - * Carnegie Mellon University encourages users of this software to - * contribute improvements to the Coda project. 
- */ - -#ifndef _LINUX_CODA_FS -#define _LINUX_CODA_FS - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* operations */ -extern const struct inode_operations coda_dir_inode_operations; -extern const struct inode_operations coda_file_inode_operations; -extern const struct inode_operations coda_ioctl_inode_operations; - -extern const struct dentry_operations coda_dentry_operations; - -extern const struct address_space_operations coda_file_aops; -extern const struct address_space_operations coda_symlink_aops; - -extern const struct file_operations coda_dir_operations; -extern const struct file_operations coda_file_operations; -extern const struct file_operations coda_ioctl_operations; - -/* operations shared over more than one file */ -int coda_open(struct inode *i, struct file *f); -int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask, unsigned int flags); -int coda_revalidate_inode(struct dentry *); -int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int coda_setattr(struct dentry *, struct iattr *); - -/* this file: heloers */ -char *coda_f2s(struct CodaFid *f); -int coda_isroot(struct inode *i); -int coda_iscontrol(const char *name, size_t length); - -void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); -void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); -unsigned short coda_flags_to_cflags(unsigned short); - -/* sysctl.h */ -void coda_sysctl_init(void); -void coda_sysctl_clean(void); - -#define CODA_ALLOC(ptr, cast, size) do { \ - if (size < PAGE_SIZE) \ - ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ - else \ - ptr = (cast)vmalloc((unsigned long) size); \ - if (!ptr) \ - printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ - else memset( ptr, 0, size ); \ -} while (0) - - -#define CODA_FREE(ptr,size) \ - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) - -/* inode to cnode access functions */ - -static inline struct coda_inode_info *ITOC(struct inode *inode) -{ - return list_entry(inode, struct coda_inode_info, vfs_inode); -} - -static __inline__ struct CodaFid *coda_i2f(struct inode *inode) -{ - return &(ITOC(inode)->c_fid); -} - -static __inline__ char *coda_i2s(struct inode *inode) -{ - return coda_f2s(&(ITOC(inode)->c_fid)); -} - -/* this will not zap the inode away */ -static __inline__ void coda_flag_inode(struct inode *inode, int flag) -{ - struct coda_inode_info *cii = ITOC(inode); - - spin_lock(&cii->c_lock); - cii->c_flags |= flag; - spin_unlock(&cii->c_lock); -} - -#endif -- cgit v1.2.3-71-gd317 From c74a1cbb3cac348f276fabc381758f5b0b4713b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jan 2011 16:59:34 -0500 Subject: pass default dentry_operations to mount_pseudo() Signed-off-by: Al Viro --- arch/ia64/kernel/perfmon.c | 6 ++++-- drivers/mtd/mtdchar.c | 2 +- fs/anon_inodes.c | 21 +++++++++++---------- fs/block_dev.c | 2 +- fs/libfs.c | 4 +++- fs/pipe.c | 4 ++-- include/linux/fs.h | 4 +++- net/socket.c | 30 +++++++++++++++--------------- 8 files changed, 40 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d92d5b5161fc..ac76da099a6d 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -617,11 +617,14 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return get_unmapped_area(file, addr, len, pgoff, flags); } +/* forward declaration */ +static const
struct dentry_operations pfmfs_dentry_operations; static struct dentry * pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC); + return mount_pseudo(fs_type, "pfm:", NULL, &pfmfs_dentry_operations, + PFMFS_MAGIC); } static struct file_system_type pfm_fs_type = { @@ -2232,7 +2235,6 @@ pfm_alloc_file(pfm_context_t *ctx) } path.mnt = mntget(pfmfs_mnt); - d_set_d_op(path.dentry, &pfmfs_dentry_operations); d_add(path.dentry, inode); file = alloc_file(&path, FMODE_READ, &pfm_file_ops); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index f511dd15fd31..ee4bb3330bdf 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1134,7 +1134,7 @@ static const struct file_operations mtd_fops = { static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "mtd_inode:", NULL, MTD_INODE_FS_MAGIC); + return mount_pseudo(fs_type, "mtd_inode:", NULL, NULL, MTD_INODE_FS_MAGIC); } static struct file_system_type mtd_inodefs_type = { diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 5fd38112a6ca..549a53cc0283 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly; static struct inode *anon_inode_inode; static const struct file_operations anon_inode_fops; -static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC); -} - /* * anon_inodefs_dname() is called from d_path(). */ @@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) dentry->d_name.name); } +static const struct dentry_operations anon_inodefs_dentry_operations = { + .d_dname = anon_inodefs_dname, +}; + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); +} + static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .mount = anon_inodefs_mount, .kill_sb = kill_anon_super, }; -static const struct dentry_operations anon_inodefs_dentry_operations = { - .d_dname = anon_inodefs_dname, -}; /* * nop .set_page_dirty method so that people can use .page_mkwrite on @@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name, */ ihold(anon_inode_inode); - d_set_d_op(path.dentry, &anon_inodefs_dentry_operations); d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; diff --git a/fs/block_dev.c b/fs/block_dev.c index 771f23527010..88da70355aa3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -473,7 +473,7 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); } static struct file_system_type bd_type = { diff --git a/fs/libfs.c b/fs/libfs.c index 889311e3d06b..c88eab55aec9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -217,7 +217,8 @@ static const struct super_operations simple_super_operations = { * will never be mountable) */ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, unsigned 
long magic) + const struct super_operations *ops, + const struct dentry_operations *dops, unsigned long magic) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); struct dentry *dentry; @@ -254,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, dentry->d_parent = dentry; d_instantiate(dentry, root); s->s_root = dentry; + s->s_d_op = dops; s->s_flags |= MS_ACTIVE; return dget(s->s_root); diff --git a/fs/pipe.c b/fs/pipe.c index 68f1f8e4e23b..6b0255a74f36 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1004,7 +1004,6 @@ struct file *create_write_pipe(int flags) goto err_inode; path.mnt = mntget(pipe_mnt); - d_set_d_op(path.dentry, &pipefs_dentry_operations); d_instantiate(path.dentry, inode); err = -ENFILE; @@ -1266,7 +1265,8 @@ static const struct super_operations pipefs_ops = { static struct dentry *pipefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC); + return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + &pipefs_dentry_operations, PIPEFS_MAGIC); } static struct file_system_type pipe_fs_type = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e4c27486e74..c0701288d204 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1835,7 +1835,9 @@ struct super_block *sget(struct file_system_type *type, int (*set)(struct super_block *,void *), void *data); extern struct dentry *mount_pseudo(struct file_system_type *, char *, - const struct super_operations *ops, unsigned long); + const struct super_operations *ops, + const struct dentry_operations *dops, + unsigned long); extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); static inline void sb_mark_dirty(struct super_block *sb) diff --git a/net/socket.c b/net/socket.c index ccc576a6a508..ac2219f90d5d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -306,20 +306,6 @@ static const struct super_operations sockfs_ops = { .statfs = simple_statfs, }; -static struct dentry *sockfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); -} - -static struct vfsmount *sock_mnt __read_mostly; - -static struct file_system_type sock_fs_type = { - .name = "sockfs", - .mount = sockfs_mount, - .kill_sb = kill_anon_super, -}; - /* * sockfs_dname() is called from d_path(). */ @@ -333,6 +319,21 @@ static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, }; +static struct dentry *sockfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "socket:", &sockfs_ops, + &sockfs_dentry_operations, SOCKFS_MAGIC); +} + +static struct vfsmount *sock_mnt __read_mostly; + +static struct file_system_type sock_fs_type = { + .name = "sockfs", + .mount = sockfs_mount, + .kill_sb = kill_anon_super, +}; + /* * Obtains the first available file descriptor and sets it up for use. 
* @@ -368,7 +369,6 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags) } path.mnt = mntget(sock_mnt); - d_set_d_op(path.dentry, &sockfs_dentry_operations); d_instantiate(path.dentry, SOCK_INODE(sock)); SOCK_INODE(sock)->i_fop = &socket_file_ops; -- cgit v1.2.3-71-gd317 From 0378c4051a621303ae919f1cee832206a4c1aa68 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jan 2011 17:25:03 -0500 Subject: switch ncpfs merge dentry_operations for root and non-root Signed-off-by: Al Viro --- fs/ncpfs/dir.c | 15 ++++----------- fs/ncpfs/inode.c | 2 +- include/linux/ncp_fs.h | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 28f136d4aaec..119accd07dd5 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -82,7 +82,7 @@ static int ncp_compare_dentry(const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); -static const struct dentry_operations ncp_dentry_operations = +const struct dentry_operations ncp_dentry_operations = { .d_revalidate = ncp_lookup_validate, .d_hash = ncp_hash_dentry, @@ -90,14 +90,6 @@ static const struct dentry_operations ncp_dentry_operations = .d_delete = ncp_delete_dentry, }; -const struct dentry_operations ncp_root_dentry_operations = -{ - .d_hash = ncp_hash_dentry, - .d_compare = ncp_compare_dentry, - .d_delete = ncp_delete_dentry, -}; - - #define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) @@ -309,6 +301,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) int res, val = 0, len; __u8 __name[NCP_MAXPATHLEN + 1]; + if (dentry == dentry->d_sb->s_root) + return 1; + if (nd->flags & LOOKUP_RCU) return -ECHILD; @@ -637,7 +632,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, entry->ino = iunique(dir->i_sb, 2); inode = ncp_iget(dir->i_sb, entry); if (inode) { - d_set_d_op(newdent, &ncp_dentry_operations); d_instantiate(newdent, inode); if (!hashed) d_rehash(newdent); @@ -893,7 +887,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc if (inode) { ncp_new_dentry(dentry); add_entry: - d_set_d_op(dentry, &ncp_dentry_operations); d_add(dentry, inode); error = 0; } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 9b39a5dd4131..8b8bebbb9601 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -544,6 +544,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_blocksize_bits = 10; sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; + sb->s_d_op = &ncp_dentry_operations; sb->s_bdi = &server->bdi; server = NCP_SBP(sb); @@ -723,7 +724,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) goto out_no_root; - d_set_d_op(sb->s_root, &ncp_root_dentry_operations); return 0; out_no_root: diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h index 1c27f201c856..2d79c8a58598 100644 --- a/include/linux/ncp_fs.h +++ b/include/linux/ncp_fs.h @@ -204,7 +204,7 @@ void ncp_update_inode2(struct inode *, struct ncp_entry_info *); /* linux/fs/ncpfs/dir.c */ extern const struct inode_operations ncp_dir_inode_operations; extern const struct file_operations ncp_dir_operations; -extern const struct dentry_operations ncp_root_dentry_operations; +extern const struct dentry_operations ncp_dentry_operations; int 
ncp_conn_logged_in(struct super_block *); int ncp_date_dos2unix(__le16 time, __le16 date); void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); -- cgit v1.2.3-71-gd317 From 32c419d95f3d1da891ab9bd032a214ee05b94ed4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jan 2011 17:37:47 -0500 Subject: move internal-only parts of ncpfs headers to fs/ncpfs Signed-off-by: Al Viro --- fs/ncpfs/dir.c | 4 +- fs/ncpfs/file.c | 3 +- fs/ncpfs/inode.c | 4 +- fs/ncpfs/ioctl.c | 4 +- fs/ncpfs/mmap.c | 4 +- fs/ncpfs/ncp_fs.h | 98 ++++++++++++++++++++++++++ fs/ncpfs/ncp_fs_i.h | 29 ++++++++ fs/ncpfs/ncp_fs_sb.h | 176 ++++++++++++++++++++++++++++++++++++++++++++++ fs/ncpfs/ncplib_kernel.c | 2 +- fs/ncpfs/ncplib_kernel.h | 2 - fs/ncpfs/ncpsign_kernel.c | 1 + fs/ncpfs/ncpsign_kernel.h | 2 - fs/ncpfs/sock.c | 2 +- fs/ncpfs/symlink.c | 4 +- include/linux/ncp_fs.h | 100 -------------------------- include/linux/ncp_fs_i.h | 29 -------- include/linux/ncp_fs_sb.h | 164 ------------------------------------------ include/linux/ncp_mount.h | 22 ------ 18 files changed, 313 insertions(+), 337 deletions(-) create mode 100644 fs/ncpfs/ncp_fs.h create mode 100644 fs/ncpfs/ncp_fs_i.h create mode 100644 fs/ncpfs/ncp_fs_sb.h delete mode 100644 include/linux/ncp_fs_i.h delete mode 100644 include/linux/ncp_fs_sb.h (limited to 'include/linux') diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 119accd07dd5..f6946bb5cb55 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -21,9 +21,7 @@ #include #include -#include - -#include "ncplib_kernel.h" +#include "ncp_fs.h" static void ncp_read_volume_list(struct file *, void *, filldir_t, struct ncp_cache_control *); diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index cb50aaf981df..0ed65e0c3dfe 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,8 +18,7 @@ #include #include -#include -#include "ncplib_kernel.h" +#include "ncp_fs.h" static int ncp_fsync(struct file *file, int datasync) { diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 8b8bebbb9601..00a1d1c3d3a4 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -31,11 +31,9 @@ #include #include -#include - #include -#include "ncplib_kernel.h" +#include "ncp_fs.h" #include "getopt.h" #define NCP_DEFAULT_FILE_MODE 0600 diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index d40a547e3377..790e92a9ec63 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,11 +20,9 @@ #include #include -#include - #include -#include "ncplib_kernel.h" +#include "ncp_fs.h" /* maximum limit for ncp_objectname_ioctl */ #define NCP_OBJECT_NAME_MAX_LEN 4096 diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 56f5b3a0e1ee..a7c07b44b100 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -16,12 +16,12 @@ #include #include #include -#include -#include "ncplib_kernel.h" #include #include +#include "ncp_fs.h" + /* * Fill in the supplied page for mmap * XXX: how are we excluding truncate/invalidate here? Maybe need to lock diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h new file mode 100644 index 000000000000..31831afe1c3b --- /dev/null +++ b/fs/ncpfs/ncp_fs.h @@ -0,0 +1,98 @@ +#include +#include "ncp_fs_i.h" +#include "ncp_fs_sb.h" + +/* define because it is easy to change PRINTK to {*}PRINTK */ +#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) + +#undef NCPFS_PARANOIA +#ifdef NCPFS_PARANOIA +#define PPRINTK(format, args...) PRINTK(format , ## args) +#else +#define PPRINTK(format, args...) +#endif + +#ifndef DEBUG_NCP +#define DEBUG_NCP 0 +#endif +#if DEBUG_NCP > 0 +#define DPRINTK(format, args...) 
PRINTK(format , ## args) +#else +#define DPRINTK(format, args...) +#endif +#if DEBUG_NCP > 1 +#define DDPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DDPRINTK(format, args...) +#endif + +#define NCP_MAX_RPC_TIMEOUT (6*HZ) + + +struct ncp_entry_info { + struct nw_info_struct i; + ino_t ino; + int opened; + int access; + unsigned int volume; + __u8 file_handle[6]; +}; + +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) +{ + return container_of(inode, struct ncp_inode_info, vfs_inode); +} + +/* linux/fs/ncpfs/inode.c */ +int ncp_notify_change(struct dentry *, struct iattr *); +struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); +void ncp_update_inode(struct inode *, struct ncp_entry_info *); +void ncp_update_inode2(struct inode *, struct ncp_entry_info *); + +/* linux/fs/ncpfs/dir.c */ +extern const struct inode_operations ncp_dir_inode_operations; +extern const struct file_operations ncp_dir_operations; +extern const struct dentry_operations ncp_dentry_operations; +int ncp_conn_logged_in(struct super_block *); +int ncp_date_dos2unix(__le16 time, __le16 date); +void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); + +/* linux/fs/ncpfs/ioctl.c */ +long ncp_ioctl(struct file *, unsigned int, unsigned long); +long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* linux/fs/ncpfs/sock.c */ +int ncp_request2(struct ncp_server *server, int function, + void* reply, int max_reply_size); +static inline int ncp_request(struct ncp_server *server, int function) { + return ncp_request2(server, function, server->packet, server->packet_size); +} +int ncp_connect(struct ncp_server *server); +int ncp_disconnect(struct ncp_server *server); +void ncp_lock_server(struct ncp_server *server); +void ncp_unlock_server(struct ncp_server *server); + +/* linux/fs/ncpfs/symlink.c */ +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern const struct address_space_operations ncp_symlink_aops; +int ncp_symlink(struct inode*, struct dentry*, const char*); +#endif + +/* linux/fs/ncpfs/file.c */ +extern const struct inode_operations ncp_file_inode_operations; +extern const struct file_operations ncp_file_operations; +int ncp_make_open(struct inode *, int); + +/* linux/fs/ncpfs/mmap.c */ +int ncp_mmap(struct file *, struct vm_area_struct *); + +/* linux/fs/ncpfs/ncplib_kernel.c */ +int ncp_make_closed(struct inode *); + +#include "ncplib_kernel.h" diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h new file mode 100644 index 000000000000..4b0bec477846 --- /dev/null +++ b/fs/ncpfs/ncp_fs_i.h @@ -0,0 +1,29 @@ +/* + * ncp_fs_i.h + * + * Copyright (C) 1995 Volker Lendecke + * + */ + +#ifndef _LINUX_NCP_FS_I +#define _LINUX_NCP_FS_I + +/* + * This is the ncpfs part of the inode structure. This must contain + * all the information we need to work with an inode after creation. 
+ */ +struct ncp_inode_info { + __le32 dirEntNum; + __le32 DosDirNum; + __u8 volNumber; + __le32 nwattr; + struct mutex open_mutex; + atomic_t opened; + int access; + int flags; +#define NCPI_KLUDGE_SYMLINK 0x0001 + __u8 file_handle[6]; + struct inode vfs_inode; +}; + +#endif /* _LINUX_NCP_FS_I */ diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h new file mode 100644 index 000000000000..4af803f13516 --- /dev/null +++ b/fs/ncpfs/ncp_fs_sb.h @@ -0,0 +1,176 @@ +/* + * ncp_fs_sb.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * + */ + +#ifndef _NCP_FS_SB +#define _NCP_FS_SB + +#include +#include +#include +#include +#include +#include + +#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ + +struct sock; + +struct ncp_mount_data_kernel { + unsigned long flags; /* NCP_MOUNT_* flags */ + unsigned int int_flags; /* internal flags */ +#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 + __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */ + struct pid *wdog_pid; /* Who cares for our watchdog packets? */ + unsigned int ncp_fd; /* The socket to the ncp port */ + unsigned int time_out; /* How long should I wait after + sending a NCP request? */ + unsigned int retry_count; /* And how often should I retry? */ + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_mode_t file_mode; + __kernel_mode_t dir_mode; + int info_fd; +}; + +struct ncp_server { + + struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of + interest for us later, so we store + it completely. */ + + __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; + + struct file *ncp_filp; /* File pointer to ncp socket */ + struct socket *ncp_sock;/* ncp socket */ + struct file *info_filp; + struct socket *info_sock; + + u8 sequence; + u8 task; + u16 connection; /* Remote connection number */ + + u8 completion; /* Status message from server */ + u8 conn_status; /* Bit 4 = 1 ==> Server going down, no + requests allowed anymore. + Bit 0 = 1 ==> Server is down. */ + + int buffer_size; /* Negotiated bufsize */ + + int reply_size; /* Size of last reply */ + + int packet_size; + unsigned char *packet; /* Here we prepare requests and + receive replies */ + unsigned char *txbuf; /* Storage for current request */ + unsigned char *rxbuf; /* Storage for reply to current request */ + + int lock; /* To prevent mismatch in protocols. */ + struct mutex mutex; + + int current_size; /* for packet preparation */ + int has_subfunction; + int ncp_reply_size; + + int root_setuped; + struct mutex root_setup_lock; + + /* info for packet signing */ + int sign_wanted; /* 1=Server needs signed packets */ + int sign_active; /* 0=don't do signing, 1=do */ + char sign_root[8]; /* generated from password and encr. 
key */ + char sign_last[16]; + + /* Authentication info: NDS or BINDERY, username */ + struct { + int auth_type; + size_t object_name_len; + void* object_name; + int object_type; + } auth; + /* Password info */ + struct { + size_t len; + void* data; + } priv; + struct rw_semaphore auth_rwsem; + + /* nls info: codepage for volume and charset for I/O */ + struct nls_table *nls_vol; + struct nls_table *nls_io; + + /* maximum age in jiffies */ + atomic_t dentry_ttl; + + /* miscellaneous */ + unsigned int flags; + + spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ + + void (*data_ready)(struct sock* sk, int len); + void (*error_report)(struct sock* sk); + void (*write_space)(struct sock* sk); /* STREAM mode only */ + struct { + struct work_struct tq; /* STREAM/DGRAM: data/error ready */ + struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ + struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ + + unsigned int state; /* STREAM only: receiver state */ + struct { + __u32 magic __packed; + __u32 len __packed; + __u16 type __packed; + __u16 p1 __packed; + __u16 p2 __packed; + __u16 p3 __packed; + __u16 type2 __packed; + } buf; /* STREAM only: temporary buffer */ + unsigned char* ptr; /* STREAM only: pointer to data */ + size_t len; /* STREAM only: length of data to receive */ + } rcv; + struct { + struct list_head requests; /* STREAM only: queued requests */ + struct work_struct tq; /* STREAM only: transmitter ready */ + struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ + } tx; + struct timer_list timeout_tm; /* DGRAM only: timeout timer */ + struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ + int timeout_last; /* DGRAM only: current timeout length */ + int timeout_retries; /* DGRAM only: retries left */ + struct { + size_t len; + __u8 data[128]; + } unexpected_packet; + struct backing_dev_info bdi; +}; + +extern void ncp_tcp_rcv_proc(struct work_struct *work); +extern void ncp_tcp_tx_proc(struct work_struct *work); +extern void ncpdgram_rcv_proc(struct work_struct *work); +extern void ncpdgram_timeout_proc(struct work_struct *work); +extern void ncpdgram_timeout_call(unsigned long server); +extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_write_space(struct sock* sk); +extern void ncp_tcp_error_report(struct sock* sk); + +#define NCP_FLAG_UTF8 1 + +#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) +#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) +#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) + +static inline int ncp_conn_valid(struct ncp_server *server) +{ + return ((server->conn_status & 0x11) == 0); +} + +static inline void ncp_invalidate_conn(struct ncp_server *server) +{ + server->conn_status |= 0x01; +} + +#endif diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index a95615a0b6ac..981a95617fc9 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -11,7 +11,7 @@ -#include "ncplib_kernel.h" +#include "ncp_fs.h" static inline void assert_server_locked(struct ncp_server *server) { diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 1220df75ff22..09881e6aa5ad 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -32,8 +32,6 @@ #include #endif /* CONFIG_NCPFS_NLS */ -#include - #define NCP_MIN_SYMLINK_SIZE 8 #define NCP_MAX_SYMLINK_SIZE 512 diff --git a/fs/ncpfs/ncpsign_kernel.c 
b/fs/ncpfs/ncpsign_kernel.c index d8b2d7e6910b..08907599dcd2 100644 --- a/fs/ncpfs/ncpsign_kernel.c +++ b/fs/ncpfs/ncpsign_kernel.c @@ -11,6 +11,7 @@ #include #include #include +#include "ncp_fs.h" #include "ncpsign_kernel.h" /* i386: 32-bit, little endian, handles mis-alignment */ diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h index 6451a68381cc..d9a1438bb1f6 100644 --- a/fs/ncpfs/ncpsign_kernel.h +++ b/fs/ncpfs/ncpsign_kernel.h @@ -8,8 +8,6 @@ #ifndef _NCPSIGN_KERNEL_H #define _NCPSIGN_KERNEL_H -#include - #ifdef CONFIG_NCPFS_PACKET_SIGNING void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 668bd267346e..3a1587222c8a 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -28,7 +28,7 @@ #include #include -#include +#include "ncp_fs.h" #include "ncpsign_kernel.h" diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index c634fd17b337..661f861d80c6 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -25,13 +25,11 @@ #include #include -#include #include #include #include #include -#include "ncplib_kernel.h" - +#include "ncp_fs.h" /* these magic numbers must appear in the symlink file -- this makes it a bit more resilient against the magic attributes being set on random files. */ diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h index 2d79c8a58598..e13eefef0653 100644 --- a/include/linux/ncp_fs.h +++ b/include/linux/ncp_fs.h @@ -143,104 +143,4 @@ struct ncp_nls_ioctl #define NCP_MAXPATHLEN 255 #define NCP_MAXNAMELEN 14 -#ifdef __KERNEL__ - -#include -#include - -/* define because it is easy to change PRINTK to {*}PRINTK */ -#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) - -#undef NCPFS_PARANOIA -#ifdef NCPFS_PARANOIA -#define PPRINTK(format, args...) PRINTK(format , ## args) -#else -#define PPRINTK(format, args...) -#endif - -#ifndef DEBUG_NCP -#define DEBUG_NCP 0 -#endif -#if DEBUG_NCP > 0 -#define DPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DPRINTK(format, args...) -#endif -#if DEBUG_NCP > 1 -#define DDPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DDPRINTK(format, args...) 
-#endif - -#define NCP_MAX_RPC_TIMEOUT (6*HZ) - - -struct ncp_entry_info { - struct nw_info_struct i; - ino_t ino; - int opened; - int access; - unsigned int volume; - __u8 file_handle[6]; -}; - -static inline struct ncp_server *NCP_SBP(const struct super_block *sb) -{ - return sb->s_fs_info; -} - -#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) -static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) -{ - return container_of(inode, struct ncp_inode_info, vfs_inode); -} - -/* linux/fs/ncpfs/inode.c */ -int ncp_notify_change(struct dentry *, struct iattr *); -struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); -void ncp_update_inode(struct inode *, struct ncp_entry_info *); -void ncp_update_inode2(struct inode *, struct ncp_entry_info *); - -/* linux/fs/ncpfs/dir.c */ -extern const struct inode_operations ncp_dir_inode_operations; -extern const struct file_operations ncp_dir_operations; -extern const struct dentry_operations ncp_dentry_operations; -int ncp_conn_logged_in(struct super_block *); -int ncp_date_dos2unix(__le16 time, __le16 date); -void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); - -/* linux/fs/ncpfs/ioctl.c */ -long ncp_ioctl(struct file *, unsigned int, unsigned long); -long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); - -/* linux/fs/ncpfs/sock.c */ -int ncp_request2(struct ncp_server *server, int function, - void* reply, int max_reply_size); -static inline int ncp_request(struct ncp_server *server, int function) { - return ncp_request2(server, function, server->packet, server->packet_size); -} -int ncp_connect(struct ncp_server *server); -int ncp_disconnect(struct ncp_server *server); -void ncp_lock_server(struct ncp_server *server); -void ncp_unlock_server(struct ncp_server *server); - -/* linux/fs/ncpfs/symlink.c */ -#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) -extern const struct address_space_operations ncp_symlink_aops; -int ncp_symlink(struct inode*, struct dentry*, const char*); -#endif - -/* linux/fs/ncpfs/file.c */ -extern const struct inode_operations ncp_file_inode_operations; -extern const struct file_operations ncp_file_operations; -int ncp_make_open(struct inode *, int); - -/* linux/fs/ncpfs/mmap.c */ -int ncp_mmap(struct file *, struct vm_area_struct *); - -/* linux/fs/ncpfs/ncplib_kernel.c */ -int ncp_make_closed(struct inode *); - -#endif /* __KERNEL__ */ - #endif /* _LINUX_NCP_FS_H */ diff --git a/include/linux/ncp_fs_i.h b/include/linux/ncp_fs_i.h deleted file mode 100644 index 4b0bec477846..000000000000 --- a/include/linux/ncp_fs_i.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * ncp_fs_i.h - * - * Copyright (C) 1995 Volker Lendecke - * - */ - -#ifndef _LINUX_NCP_FS_I -#define _LINUX_NCP_FS_I - -/* - * This is the ncpfs part of the inode structure. This must contain - * all the information we need to work with an inode after creation. 
- */ -struct ncp_inode_info { - __le32 dirEntNum; - __le32 DosDirNum; - __u8 volNumber; - __le32 nwattr; - struct mutex open_mutex; - atomic_t opened; - int access; - int flags; -#define NCPI_KLUDGE_SYMLINK 0x0001 - __u8 file_handle[6]; - struct inode vfs_inode; -}; - -#endif /* _LINUX_NCP_FS_I */ diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h deleted file mode 100644 index d64b0e894336..000000000000 --- a/include/linux/ncp_fs_sb.h +++ /dev/null @@ -1,164 +0,0 @@ -/* - * ncp_fs_sb.h - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * - */ - -#ifndef _NCP_FS_SB -#define _NCP_FS_SB - -#include -#include -#include -#include -#include - -#ifdef __KERNEL__ - -#include - -#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ - -struct sock; - -struct ncp_server { - - struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of - interest for us later, so we store - it completely. */ - - __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; - - struct file *ncp_filp; /* File pointer to ncp socket */ - struct socket *ncp_sock;/* ncp socket */ - struct file *info_filp; - struct socket *info_sock; - - u8 sequence; - u8 task; - u16 connection; /* Remote connection number */ - - u8 completion; /* Status message from server */ - u8 conn_status; /* Bit 4 = 1 ==> Server going down, no - requests allowed anymore. - Bit 0 = 1 ==> Server is down. */ - - int buffer_size; /* Negotiated bufsize */ - - int reply_size; /* Size of last reply */ - - int packet_size; - unsigned char *packet; /* Here we prepare requests and - receive replies */ - unsigned char *txbuf; /* Storage for current request */ - unsigned char *rxbuf; /* Storage for reply to current request */ - - int lock; /* To prevent mismatch in protocols. */ - struct mutex mutex; - - int current_size; /* for packet preparation */ - int has_subfunction; - int ncp_reply_size; - - int root_setuped; - struct mutex root_setup_lock; - - /* info for packet signing */ - int sign_wanted; /* 1=Server needs signed packets */ - int sign_active; /* 0=don't do signing, 1=do */ - char sign_root[8]; /* generated from password and encr. 
key */ - char sign_last[16]; - - /* Authentication info: NDS or BINDERY, username */ - struct { - int auth_type; - size_t object_name_len; - void* object_name; - int object_type; - } auth; - /* Password info */ - struct { - size_t len; - void* data; - } priv; - struct rw_semaphore auth_rwsem; - - /* nls info: codepage for volume and charset for I/O */ - struct nls_table *nls_vol; - struct nls_table *nls_io; - - /* maximum age in jiffies */ - atomic_t dentry_ttl; - - /* miscellaneous */ - unsigned int flags; - - spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ - - void (*data_ready)(struct sock* sk, int len); - void (*error_report)(struct sock* sk); - void (*write_space)(struct sock* sk); /* STREAM mode only */ - struct { - struct work_struct tq; /* STREAM/DGRAM: data/error ready */ - struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ - struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ - - unsigned int state; /* STREAM only: receiver state */ - struct { - __u32 magic __packed; - __u32 len __packed; - __u16 type __packed; - __u16 p1 __packed; - __u16 p2 __packed; - __u16 p3 __packed; - __u16 type2 __packed; - } buf; /* STREAM only: temporary buffer */ - unsigned char* ptr; /* STREAM only: pointer to data */ - size_t len; /* STREAM only: length of data to receive */ - } rcv; - struct { - struct list_head requests; /* STREAM only: queued requests */ - struct work_struct tq; /* STREAM only: transmitter ready */ - struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ - } tx; - struct timer_list timeout_tm; /* DGRAM only: timeout timer */ - struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ - int timeout_last; /* DGRAM only: current timeout length */ - int timeout_retries; /* DGRAM only: retries left */ - struct { - size_t len; - __u8 data[128]; - } unexpected_packet; - struct backing_dev_info bdi; -}; - -extern void ncp_tcp_rcv_proc(struct work_struct *work); -extern void ncp_tcp_tx_proc(struct work_struct *work); -extern void ncpdgram_rcv_proc(struct work_struct *work); -extern void ncpdgram_timeout_proc(struct work_struct *work); -extern void ncpdgram_timeout_call(unsigned long server); -extern void ncp_tcp_data_ready(struct sock* sk, int len); -extern void ncp_tcp_write_space(struct sock* sk); -extern void ncp_tcp_error_report(struct sock* sk); - -#define NCP_FLAG_UTF8 1 - -#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) -#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) -#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) - -static inline int ncp_conn_valid(struct ncp_server *server) -{ - return ((server->conn_status & 0x11) == 0); -} - -static inline void ncp_invalidate_conn(struct ncp_server *server) -{ - server->conn_status |= 0x01; -} - -#endif /* __KERNEL__ */ - -#endif - diff --git a/include/linux/ncp_mount.h b/include/linux/ncp_mount.h index a2b549eb1eca..dfcbea2d889f 100644 --- a/include/linux/ncp_mount.h +++ b/include/linux/ncp_mount.h @@ -68,26 +68,4 @@ struct ncp_mount_data_v4 { #define NCP_MOUNT_VERSION_V5 (5) /* Text only */ -#ifdef __KERNEL__ - -struct ncp_mount_data_kernel { - unsigned long flags; /* NCP_MOUNT_* flags */ - unsigned int int_flags; /* internal flags */ -#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 - __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */ - struct pid *wdog_pid; /* Who cares for our watchdog packets? 
*/ - unsigned int ncp_fd; /* The socket to the ncp port */ - unsigned int time_out; /* How long should I wait after - sending a NCP request? */ - unsigned int retry_count; /* And how often should I retry? */ - unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_mode_t file_mode; - __kernel_mode_t dir_mode; - int info_fd; -}; - -#endif /* __KERNEL__ */ - #endif -- cgit v1.2.3-71-gd317 From 79124f18b335172e1916075c633745e12dae1dac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 17 Nov 2010 20:46:15 -0500 Subject: fs: add hole punching to fallocate Hole punching has already been implemented by XFS and OCFS2, and has the potential to be implemented on both BTRFS and EXT4 so we need a generic way to get to this feature. The simplest way in my mind is to add FALLOC_FL_PUNCH_HOLE to fallocate() since it already looks like the normal fallocate() operation. I've tested this patch with XFS and BTRFS to make sure XFS did what it's supposed to do and that BTRFS failed like it was supposed to. Thank you, Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- fs/open.c | 7 ++++++- include/linux/falloc.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/open.c b/fs/open.c index 4197b9ed023d..5b6ef7e2859e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; if (!(file->f_mode & FMODE_WRITE)) diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 3c155107d61f..73e0b628e058 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -2,6 +2,7 @@ #define _FALLOC_H_ #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ +#define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ #ifdef __KERNEL__ -- cgit v1.2.3-71-gd317 From 278ad4fd0e7517f1d74b8fc1c8347cad34a09b02 Mon Sep 17 00:00:00 2001 From: Samu Onkalo Date: Wed, 12 Jan 2011 16:59:17 -0800 Subject: leds: leds-lp5523: modify the way of setting led device name Currently all leds channels begins with string lp5523. Patch adds a possibility to provide name via platform data. This makes it possible to have several chips without overlapping sysfs names. 
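As an illustration of how the new field is intended to be wired up, a board file can hand each chip its own label through the platform data. This is only a sketch; the label string, I2C bus layout and address below are invented for the example and are not part of this patch:

	#include <linux/i2c.h>
	#include <linux/leds-lp5523.h>

	static struct lp5523_platform_data cover_lp5523_pdata = {
		/* led_config, num_channels, clock_mode etc. set up as before */
		.label	= "cover",		/* illustrative name */
	};

	static struct i2c_board_info board_i2c2_devs[] __initdata = {
		{
			I2C_BOARD_INFO("lp5523", 0x32),	/* address is illustrative */
			.platform_data = &cover_lp5523_pdata,
		},
	};

Registered with i2c_register_board_info() as usual, the chip's LED class devices then show up as "cover:channel0", "cover:channel1" and so on, while a chip whose platform data leaves .label NULL keeps the old "lp5523:channelN" names.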
Signed-off-by: Samu Onkalo Cc: Arun Murthy Cc: Ilkka Koskinen Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-lp5523.c | 3 ++- include/linux/leds-lp5523.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/leds/leds-lp5523.c b/drivers/leds/leds-lp5523.c index 0cc4ead2fd8b..6cb41607519d 100644 --- a/drivers/leds/leds-lp5523.c +++ b/drivers/leds/leds-lp5523.c @@ -870,7 +870,8 @@ static int __init lp5523_init_led(struct lp5523_led *led, struct device *dev, return -EINVAL; } - snprintf(name, 32, "lp5523:channel%d", chan); + snprintf(name, sizeof(name), "%s:channel%d", + pdata->label ?: "lp5523", chan); led->cdev.name = name; led->cdev.brightness_set = lp5523_set_brightness; diff --git a/include/linux/leds-lp5523.h b/include/linux/leds-lp5523.h index 796747637b80..2694289babd0 100644 --- a/include/linux/leds-lp5523.h +++ b/include/linux/leds-lp5523.h @@ -42,6 +42,7 @@ struct lp5523_platform_data { int (*setup_resources)(void); void (*release_resources)(void); void (*enable)(bool state); + const char *label; }; #endif /* __LINUX_LP5523_H */ -- cgit v1.2.3-71-gd317 From 61a83932b8b3f1624d753e06aab2fdf0a119e100 Mon Sep 17 00:00:00 2001 From: Arun Murthy Date: Wed, 12 Jan 2011 16:59:20 -0800 Subject: leds-lp5521: modify the way of setting led device name Currently the led device name is fetched from the device_type in I2C_BOARD_INFO which comes from the platform data. This name is in turn used to create an entry in sysfs. If there exists two or more lp5521 on a particular platform, the device_type in I2C_BOARD_INFO has to be the same, else lp5521 driver probe wont be called and if used so, results in run time warning "cannot create sysfs with same name" and hence a failure. The name that is used to create sysfs entry is to be passed by the struct led_platform_data. Hence adding an element of type const char * and change in lp5521 driver to use this name in creating the led device if present else use the name obtained by I2C_BOARD_INFO. 
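The intended use is easiest to see with two chips on one bus. In the sketch below the bus layout, addresses and label strings are made up for illustration; only the new .label field comes from this patch:

	#include <linux/i2c.h>
	#include <linux/leds-lp5521.h>

	static struct lp5521_platform_data keypad_lp5521_pdata = {
		/* led_config, num_channels, clock_mode etc. omitted */
		.label = "keypad",		/* illustrative name */
	};

	static struct lp5521_platform_data status_lp5521_pdata = {
		.label = "status",		/* illustrative name */
	};

	static struct i2c_board_info board_i2c1_devs[] __initdata = {
		{ I2C_BOARD_INFO("lp5521", 0x32), .platform_data = &keypad_lp5521_pdata, },
		{ I2C_BOARD_INFO("lp5521", 0x33), .platform_data = &status_lp5521_pdata, },
	};

Both entries can share the "lp5521" device_type, and the channels are registered as "keypad:channel0", "status:channel0" and so on; if .label is left NULL the driver falls back to client->name as before.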
Signed-off-by: Arun Murthy Acked-by: Samu Onkalo Cc: Ilkka Koskinen Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-lp5521.c | 3 ++- include/linux/leds-lp5521.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/leds/leds-lp5521.c b/drivers/leds/leds-lp5521.c index 4d91c08656c5..80a3ae3c00b9 100644 --- a/drivers/leds/leds-lp5521.c +++ b/drivers/leds/leds-lp5521.c @@ -617,7 +617,8 @@ static int __init lp5521_init_led(struct lp5521_led *led, return -EINVAL; } - snprintf(name, sizeof(name), "%s:channel%d", client->name, chan); + snprintf(name, sizeof(name), "%s:channel%d", + pdata->label ?: client->name, chan); led->cdev.brightness_set = lp5521_set_brightness; led->cdev.name = name; res = led_classdev_register(dev, &led->cdev); diff --git a/include/linux/leds-lp5521.h b/include/linux/leds-lp5521.h index 38368d785f08..fd548d2a8775 100644 --- a/include/linux/leds-lp5521.h +++ b/include/linux/leds-lp5521.h @@ -42,6 +42,7 @@ struct lp5521_platform_data { int (*setup_resources)(void); void (*release_resources)(void); void (*enable)(bool state); + const char *label; }; #endif /* __LINUX_LP5521_H */ -- cgit v1.2.3-71-gd317 From 04c6862c055fb687c90d9652f32c11a063df15cf Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Wed, 12 Jan 2011 16:59:30 -0800 Subject: kmsg_dump: add kmsg_dump() calls to the reboot, halt, poweroff and emergency_restart paths We need to know the reason why system rebooted in support service. However, we can't inform our customers of the reason because final messages are lost on current Linux kernel. This patch improves the situation above because the final messages are saved by adding kmsg_dump() to reboot, halt, poweroff and emergency_restart path. 
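A dumper that wants to save these final messages only has to handle the new reason codes. The sketch below is illustrative only: the module and function names are invented, and it assumes the two-segment dump() callback signature used by kmsg_dump in this tree:

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/kmsg_dump.h>

	static void shutdown_dump(struct kmsg_dumper *dumper,
				  enum kmsg_dump_reason reason,
				  const char *s1, unsigned long l1,
				  const char *s2, unsigned long l2)
	{
		switch (reason) {
		case KMSG_DUMP_RESTART:
		case KMSG_DUMP_HALT:
		case KMSG_DUMP_POWEROFF:
		case KMSG_DUMP_EMERG:
			/* persist s1/l1 and s2/l2 (the two log_buf segments),
			 * e.g. to flash or a service processor */
			break;
		default:
			/* oops/panic/kexec are handled by other dumpers */
			break;
		}
	}

	static struct kmsg_dumper shutdown_dumper = {
		.dump = shutdown_dump,
	};

	static int __init shutdown_dump_init(void)
	{
		return kmsg_dump_register(&shutdown_dumper);
	}

	static void __exit shutdown_dump_exit(void)
	{
		kmsg_dump_unregister(&shutdown_dumper);
	}

	module_init(shutdown_dump_init);
	module_exit(shutdown_dump_exit);
	MODULE_LICENSE("GPL");

Existing dumpers that check the reason code are unaffected, since the new values are only passed on the reboot, halt, poweroff and emergency_restart paths.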
Signed-off-by: Seiji Aguchi Cc: David Woodhouse Cc: Marco Stornelli Reviewed-by: Artem Bityutskiy Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmsg_dump.h | 4 ++++ kernel/printk.c | 4 ++++ kernel/sys.c | 6 ++++++ 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 24b44145a886..2a0d7d651dc3 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -18,6 +18,10 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, KMSG_DUMP_KEXEC, + KMSG_DUMP_RESTART, + KMSG_DUMP_HALT, + KMSG_DUMP_POWEROFF, + KMSG_DUMP_EMERG, }; /** diff --git a/kernel/printk.c b/kernel/printk.c index f64b8997fc76..0b0c9aa71e89 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1539,6 +1539,10 @@ static const char * const kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", [KMSG_DUMP_KEXEC] = "kexec", + [KMSG_DUMP_RESTART] = "restart", + [KMSG_DUMP_HALT] = "halt", + [KMSG_DUMP_POWEROFF] = "poweroff", + [KMSG_DUMP_EMERG] = "emergency_restart", }; static const char *kmsg_to_str(enum kmsg_dump_reason reason) diff --git a/kernel/sys.c b/kernel/sys.c index 2745dcdb6c6c..31b71a276b40 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -43,6 +43,8 @@ #include #include +#include + #include #include #include @@ -285,6 +287,7 @@ out_unlock: */ void emergency_restart(void) { + kmsg_dump(KMSG_DUMP_EMERG); machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); @@ -312,6 +315,7 @@ void kernel_restart(char *cmd) printk(KERN_EMERG "Restarting system.\n"); else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -333,6 +337,7 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); sysdev_shutdown(); printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } @@ -351,6 +356,7 @@ void kernel_power_off(void) disable_nonboot_cpus(); sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); -- cgit v1.2.3-71-gd317 From 71a9048448de302d1e968f336de01060d02fae71 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Jan 2011 16:59:35 -0800 Subject: include/linux/kernel.h: abs(): fix handling of 32-bit unsigneds on 64-bit Michal reports: In the framebuffer subsystem the abs() macro is often used as a part of the calculation of a Manhattan metric, which in turn is used as a measure of similarity between video modes. The arguments of abs() are sometimes unsigned numbers. This worked fine until commit a49c59c0 ("Make sure the value in abs() does not get truncated if it is greater than 2^32:) , which changed the definition of abs() to prevent truncation. As a result of this change, in the following piece of code: u32 a = 0, b = 1; u32 c = abs(a - b); 'c' will end up with a value of 0xffffffff instead of the expected 0x1. A problem caused by this change and visible by the end user is that framebuffer drivers relying on functions from modedb.c will fail to find high resolution video modes similar to that explicitly requested by the user if an exact match cannot be found (see e.g. Fix this by special-casing `long' types within abs(). This patch reduces x86_64 code size a bit - drivers/video/uvesafb.o shrunk by 15 bytes, presumably because it is doing abs() on 4-byte quantities, and expanding those to 8-byte longs adds code. 
testcase: #define oldabs(x) ({ \ long __x = (x); \ (__x < 0) ? -__x : __x; \ }) #define newabs(x) ({ \ long ret; \ if (sizeof(x) == sizeof(long)) { \ long __x = (x); \ ret = (__x < 0) ? -__x : __x; \ } else { \ int __x = (x); \ ret = (__x < 0) ? -__x : __x; \ } \ ret; \ }) typedef unsigned int u32; main() { u32 a = 0; u32 b = 1; u32 oldc = oldabs(a - b); u32 newc = newabs(a - b); printf("%u %u\n", oldc, newc); } akpm:/home/akpm> gcc t.c akpm:/home/akpm> ./a.out 4294967295 1 Reported-by: Michal Januszewski Cc: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0fbc043de60..57dac7022b63 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -143,9 +143,22 @@ extern int _cond_resched(void); #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) -#define abs(x) ({ \ - long __x = (x); \ - (__x < 0) ? -__x : __x; \ +/* + * abs() handles unsigned and signed longs, ints, shorts and chars. For all + * input types abs() returns a signed long. + * abs() should not be used for 64-bit types (s64, u64, long long) - use abs64() + * for those. + */ +#define abs(x) ({ \ + long ret; \ + if (sizeof(x) == sizeof(long)) { \ + long __x = (x); \ + ret = (__x < 0) ? -__x : __x; \ + } else { \ + int __x = (x); \ + ret = (__x < 0) ? -__x : __x; \ + } \ + ret; \ }) #define abs64(x) ({ \ -- cgit v1.2.3-71-gd317 From 049763db6cd002cb447a5684b5a543a69d647d42 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 12 Jan 2011 16:59:35 -0800 Subject: toshiba.h: hide a function prototypes behind __KERNEL__ macro Currently, tosh_smm() prototype is present in a header file exported to userland. This patch fixes it. 
Signed-off-by: Alexander Shishkin Cc: Jonathan Buzzard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/toshiba.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/toshiba.h b/include/linux/toshiba.h index 6a7c4edf0e13..772dedbc3a22 100644 --- a/include/linux/toshiba.h +++ b/include/linux/toshiba.h @@ -33,6 +33,8 @@ typedef struct { unsigned int edi __attribute__ ((packed)); } SMMRegisters; +#ifdef __KERNEL__ int tosh_smm(SMMRegisters *regs); +#endif /* __KERNEL__ */ #endif -- cgit v1.2.3-71-gd317 From 1725310324163f56fea71bb77c5d6e88f9ce4192 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Jan 2011 16:59:37 -0800 Subject: include/linux/unaligned/packed_struct.h: use __packed Cc: Will Newton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/unaligned/packed_struct.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unaligned/packed_struct.h b/include/linux/unaligned/packed_struct.h index c9a6abd972a1..c0d817de4df2 100644 --- a/include/linux/unaligned/packed_struct.h +++ b/include/linux/unaligned/packed_struct.h @@ -3,9 +3,9 @@ #include -struct __una_u16 { u16 x; } __attribute__((packed)); -struct __una_u32 { u32 x; } __attribute__((packed)); -struct __una_u64 { u64 x; } __attribute__((packed)); +struct __una_u16 { u16 x; } __packed; +struct __una_u32 { u32 x; } __packed; +struct __una_u64 { u64 x; } __packed; static inline u16 __get_unaligned_cpu16(const void *p) { -- cgit v1.2.3-71-gd317 From 455cd5ab305c90ffc422dd2e0fb634730942b257 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Wed, 12 Jan 2011 16:59:41 -0800 Subject: kptr_restrict for hiding kernel pointers from unprivileged users Add the %pK printk format specifier and the /proc/sys/kernel/kptr_restrict sysctl. The %pK format specifier is designed to hide exposed kernel pointers, specifically via /proc interfaces. Exposing these pointers provides an easy target for kernel write vulnerabilities, since they reveal the locations of writable structures containing easily triggerable function pointers. The behavior of %pK depends on the kptr_restrict sysctl. If kptr_restrict is set to 0, no deviation from the standard %p behavior occurs. If kptr_restrict is set to 1, the default, if the current user (intended to be a reader via seq_printf(), etc.) does not have CAP_SYSLOG (currently in the LSM tree), kernel pointers using %pK are printed as 0's. If kptr_restrict is set to 2, kernel pointers using %pK are printed as 0's regardless of privileges. Replacing with 0's was chosen over the default "(null)", which cannot be parsed by userland %p, which expects "(nil)". [akpm@linux-foundation.org: check for IRQ context when !kptr_restrict, save an indent level, s/WARN/WARN_ONCE/] [akpm@linux-foundation.org: coding-style fixup] [randy.dunlap@oracle.com: fix kernel/sysctl.c warning] Signed-off-by: Dan Rosenberg Signed-off-by: Randy Dunlap Cc: James Morris Cc: Eric Dumazet Cc: Thomas Graf Cc: Eugene Teo Cc: Kees Cook Cc: Ingo Molnar Cc: David S. 
Miller Cc: Peter Zijlstra Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 14 ++++++++++++++ include/linux/printk.h | 1 + kernel/sysctl.c | 10 ++++++++++ lib/vsprintf.c | 22 ++++++++++++++++++++++ 4 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 574067194f38..11d5ceda5bb0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -34,6 +34,7 @@ show up in /proc/sys/kernel: - hotplug - java-appletviewer [ binfmt_java, obsolete ] - java-interpreter [ binfmt_java, obsolete ] +- kptr_restrict - kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] - modprobe ==> Documentation/debugging-modules.txt @@ -261,6 +262,19 @@ This flag controls the L2 cache of G3 processor boards. If ============================================================== +kptr_restrict: + +This toggle indicates whether restrictions are placed on +exposing kernel addresses via /proc and other interfaces. When +kptr_restrict is set to (0), there are no restrictions. When +kptr_restrict is set to (1), the default, kernel pointers +printed using the %pK format specifier will be replaced with 0's +unless the user has CAP_SYSLOG. When kptr_restrict is set to +(2), kernel pointers printed using %pK will be replaced with 0's +regardless of privileges. + +============================================================== + kstack_depth_to_print: (X86 only) Controls the number of words to print when dumping the raw diff --git a/include/linux/printk.h b/include/linux/printk.h index b772ca5fbdf0..9adfba6ec28a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -83,6 +83,7 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; +extern int kptr_restrict; /* * Print a one-time message (analogous to WARN_ONCE() et al): diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1e3ced..c6811ee2092b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -710,6 +711,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, #endif { .procname = "ngroups_max", diff --git a/lib/vsprintf.c b/lib/vsprintf.c index c150d3dafff4..6ff38524ec16 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -936,6 +936,8 @@ char *uuid_string(char *buf, char *end, const u8 *addr, return string(buf, end, uuid, spec); } +int kptr_restrict = 1; + /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format @@ -979,6 +981,7 @@ char *uuid_string(char *buf, char *end, const u8 *addr, * Implements a "recursive vsnprintf". * Do not use this feature without some mechanism to verify the * correctness of the format string and va_list arguments. 
+ * - 'K' For a kernel pointer that should be hidden from unprivileged users * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a @@ -1035,6 +1038,25 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return buf + vsnprintf(buf, end - buf, ((struct va_format *)ptr)->fmt, *(((struct va_format *)ptr)->va)); + case 'K': + /* + * %pK cannot be used in IRQ context because its test + * for CAP_SYSLOG would be meaningless. + */ + if (in_irq() || in_serving_softirq() || in_nmi()) { + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(void *); + return string(buf, end, "pK-error", spec); + } else if ((kptr_restrict == 0) || + (kptr_restrict == 1 && + has_capability_noaudit(current, CAP_SYSLOG))) + break; + + if (spec.field_width == -1) { + spec.field_width = 2 * sizeof(void *); + spec.flags |= ZEROPAD; + } + return number(buf, end, 0, spec); } spec.flags |= SMALL; if (spec.field_width == -1) { -- cgit v1.2.3-71-gd317 From a9747cc3addf81e56ea9b8a4374fbdb9d1737eae Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 16:59:43 -0800 Subject: include/linux/printk.h: move console functions and variables together There are many uses of printk_once(KERN_, so add pr__once macros to avoid printk_once(KERN_ pr_fmt(fmt). Add an #ifdef CONFIG_PRINTK for print_hex_dump and static inline void functions for the #else cases to reduce embedded code size. Neaten and organize the rest of the code. This patch: Move console functions and variables together. Signed-off-by: Joe Perches Cc: Matt Mackall Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9adfba6ec28a..bd43c727c0a0 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -29,6 +29,17 @@ extern int console_printk[]; #define minimum_console_loglevel (console_printk[2]) #define default_console_loglevel (console_printk[3]) +static inline void console_silent(void) +{ + console_loglevel = 0; +} + +static inline void console_verbose(void) +{ + if (console_loglevel) + console_loglevel = 15; +} + struct va_format { const char *fmt; va_list *va; @@ -131,17 +142,6 @@ extern void printk_tick(void); extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); -static inline void console_silent(void) -{ - console_loglevel = 0; -} - -static inline void console_verbose(void) -{ - if (console_loglevel) - console_loglevel = 15; -} - extern void dump_stack(void) __cold; enum { -- cgit v1.2.3-71-gd317 From 7d1e91aeb317c3c747369d8594fc1b742c265a07 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 16:59:45 -0800 Subject: include/linux/printk.h: use space after #define Signed-off-by: Joe Perches Cc: Matt Mackall Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index bd43c727c0a0..b71131116878 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -4,14 +4,14 @@ extern const char linux_banner[]; extern const char linux_proc_banner[]; -#define KERN_EMERG "<0>" /* system is unusable */ -#define KERN_ALERT "<1>" /* action must be taken immediately */ 
-#define KERN_CRIT "<2>" /* critical conditions */ -#define KERN_ERR "<3>" /* error conditions */ -#define KERN_WARNING "<4>" /* warning conditions */ -#define KERN_NOTICE "<5>" /* normal but significant condition */ -#define KERN_INFO "<6>" /* informational */ -#define KERN_DEBUG "<7>" /* debug-level messages */ +#define KERN_EMERG "<0>" /* system is unusable */ +#define KERN_ALERT "<1>" /* action must be taken immediately */ +#define KERN_CRIT "<2>" /* critical conditions */ +#define KERN_ERR "<3>" /* error conditions */ +#define KERN_WARNING "<4>" /* warning conditions */ +#define KERN_NOTICE "<5>" /* normal but significant condition */ +#define KERN_INFO "<6>" /* informational */ +#define KERN_DEBUG "<7>" /* debug-level messages */ /* Use the default kernel loglevel */ #define KERN_DEFAULT "" @@ -20,7 +20,7 @@ extern const char linux_proc_banner[]; * line that had no enclosing \n). Only to be used by core/arch code * during early bootup (a continued line is not SMP-safe otherwise). */ -#define KERN_CONT "" +#define KERN_CONT "" extern int console_printk[]; -- cgit v1.2.3-71-gd317 From 5264f2f75d8678f3e9683042a4639baa5884bda9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 16:59:45 -0800 Subject: include/linux/printk.h: use and neaten no_printk - Move no_printk above first CONFIG_PRINTK block so it can be used by printk_once. - Convert statement expression if (0) printk macros to no_printk. - Convert printk_once(x...) to more normally used (fmt, ...) fmt, ##__VA_ARGS__. - Standardize __attribute__ use. - Expand single line inline functions. - Remove space before pointer. Signed-off-by: Joe Perches Cc: Matt Mackall Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 75 +++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index b71131116878..4788c2887e65 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -76,11 +76,27 @@ struct va_format { */ #define HW_ERR "[Hardware Error]: " +/* + * Dummy printk for disabled debugging statements to use whilst maintaining + * gcc's format and side-effect checking. + */ +static inline __attribute__ ((format (printf, 1, 2))) +int no_printk(const char *fmt, ...) +{ + return 0; +} + +extern asmlinkage __attribute__ ((format (printf, 1, 2))) +void early_printk(const char *fmt, ...); + +extern int printk_needs_cpu(int cpu); +extern void printk_tick(void); + #ifdef CONFIG_PRINTK -asmlinkage int vprintk(const char *fmt, va_list args) - __attribute__ ((format (printf, 1, 0))); -asmlinkage int printk(const char * fmt, ...) - __attribute__ ((format (printf, 1, 2))) __cold; +asmlinkage __attribute__ ((format (printf, 1, 0))) +int vprintk(const char *fmt, va_list args); +asmlinkage __attribute__ ((format (printf, 1, 2))) __cold +int printk(const char *fmt, ...); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -110,38 +126,34 @@ extern int kptr_restrict; void log_buf_kexec_setup(void); #else -static inline int vprintk(const char *s, va_list args) - __attribute__ ((format (printf, 1, 0))); -static inline int vprintk(const char *s, va_list args) { return 0; } -static inline int printk(const char *s, ...) - __attribute__ ((format (printf, 1, 2))); -static inline int __cold printk(const char *s, ...) 
{ return 0; } -static inline int printk_ratelimit(void) { return 0; } -static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ - unsigned int interval_msec) \ - { return false; } +static inline __attribute__ ((format (printf, 1, 0))) +int vprintk(const char *s, va_list args) +{ + return 0; +} +static inline __attribute__ ((format (printf, 1, 2))) __cold +int printk(const char *s, ...) +{ + return 0; +} +static inline int printk_ratelimit(void) +{ + return 0; +} +static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msec) +{ + return false; +} /* No effect, but we still get type checking even in the !PRINTK case: */ -#define printk_once(x...) printk(x) +#define printk_once(fmt, ...) no_printk(fmt, ##__VA_ARGS__) static inline void log_buf_kexec_setup(void) { } #endif -/* - * Dummy printk for disabled debugging statements to use whilst maintaining - * gcc's format and side-effect checking. - */ -static inline __attribute__ ((format (printf, 1, 2))) -int no_printk(const char *s, ...) { return 0; } - -extern int printk_needs_cpu(int cpu); -extern void printk_tick(void); - -extern void asmlinkage __attribute__((format(printf, 1, 2))) - early_printk(const char *fmt, ...); - extern void dump_stack(void) __cold; enum { @@ -186,7 +198,7 @@ extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel(fmt, ...) \ - ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ @@ -199,7 +211,7 @@ extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, dynamic_pr_debug(fmt, ##__VA_ARGS__) #else #define pr_debug(fmt, ...) \ - ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* @@ -242,8 +254,7 @@ extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_ratelimited(fmt, ...) \ - ({ if (0) printk_ratelimited(KERN_DEBUG pr_fmt(fmt), \ - ##__VA_ARGS__); 0; }) + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif #endif -- cgit v1.2.3-71-gd317 From 16cb839f13324978bd58082e69de81a711802b11 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 16:59:46 -0800 Subject: include/linux/printk.h: add pr__once macros - Move printk_once definitions and add an #ifdef CONFIG_PRINTK - Add pr__once so printks can use pr_fmt Signed-off-by: Joe Perches Cc: Matt Mackall Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 59 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 4788c2887e65..6442156707c9 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -112,18 +112,6 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; -/* - * Print a one-time message (analogous to WARN_ONCE() et al): - */ -#define printk_once(x...) 
({ \ - static bool __print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - printk(x); \ - } \ -}) - void log_buf_kexec_setup(void); #else static inline __attribute__ ((format (printf, 1, 0))) @@ -146,9 +134,6 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } -/* No effect, but we still get type checking even in the !PRINTK case: */ -#define printk_once(fmt, ...) no_printk(fmt, ##__VA_ARGS__) - static inline void log_buf_kexec_setup(void) { } @@ -214,6 +199,50 @@ extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif +/* + * Print a one-time message (analogous to WARN_ONCE() et al): + */ + +#ifdef CONFIG_PRINTK +#define printk_once(fmt, ...) \ +({ \ + static bool __print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + printk(fmt, ##__VA_ARGS__); \ + } \ +}) +#else +#define printk_once(fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) +#endif + +#define pr_emerg_once(fmt, ...) \ + printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert_once(fmt, ...) \ + printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit_once(fmt, ...) \ + printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err_once(fmt, ...) \ + printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn_once(fmt, ...) \ + printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_notice_once(fmt, ...) \ + printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info_once(fmt, ...) \ + printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +#define pr_cont_once(fmt, ...) \ + printk_once(KERN_CONT pr_fmt(fmt), ##__VA_ARGS__) +/* If you are writing a driver, please use dev_dbg instead */ +#if defined(DEBUG) +#define pr_debug_once(fmt, ...) \ + printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#else +#define pr_debug_once(fmt, ...) \ + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#endif + /* * ratelimited messages with local ratelimit_state, * no local ratelimit_state used in the !PRINTK case -- cgit v1.2.3-71-gd317 From ac83ed687837a44c6e46fb16411fb0d299fffd80 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 16:59:47 -0800 Subject: include/linux/printk.h lib/hexdump.c: neatening and add CONFIG_PRINTK guard - Move prototypes and align arguments. 
- Add CONFIG_PRINTK guard for print_hex functions Signed-off-by: Joe Perches Cc: Matt Mackall Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 42 ++++++++++++++++++++++++++++-------------- lib/hexdump.c | 2 ++ 2 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 6442156707c9..e96bd41491bd 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -141,20 +141,6 @@ static inline void log_buf_kexec_setup(void) extern void dump_stack(void) __cold; -enum { - DUMP_PREFIX_NONE, - DUMP_PREFIX_ADDRESS, - DUMP_PREFIX_OFFSET -}; -extern void hex_dump_to_buffer(const void *buf, size_t len, - int rowsize, int groupsize, - char *linebuf, size_t linebuflen, bool ascii); -extern void print_hex_dump(const char *level, const char *prefix_str, - int prefix_type, int rowsize, int groupsize, - const void *buf, size_t len, bool ascii); -extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len); - #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif @@ -286,4 +272,32 @@ extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif + +enum { + DUMP_PREFIX_NONE, + DUMP_PREFIX_ADDRESS, + DUMP_PREFIX_OFFSET +}; +extern void hex_dump_to_buffer(const void *buf, size_t len, + int rowsize, int groupsize, + char *linebuf, size_t linebuflen, bool ascii); +#ifdef CONFIG_PRINTK +extern void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); +extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, + const void *buf, size_t len); +#else +static inline void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ +} +static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, + const void *buf, size_t len) +{ +} + +#endif + #endif diff --git a/lib/hexdump.c b/lib/hexdump.c index b66b2bd67952..f5fe6ba7a3ab 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -154,6 +154,7 @@ nil: } EXPORT_SYMBOL(hex_dump_to_buffer); +#ifdef CONFIG_PRINTK /** * print_hex_dump - print a text hex dump to syslog for a binary blob of data * @level: kernel log level (e.g. KERN_DEBUG) @@ -238,3 +239,4 @@ void print_hex_dump_bytes(const char *prefix_str, int prefix_type, buf, len, true); } EXPORT_SYMBOL(print_hex_dump_bytes); +#endif -- cgit v1.2.3-71-gd317 From 6ec42a56e258462cda510421aecc2680d3642e21 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 16:59:47 -0800 Subject: include/linux/printk.h: organize printk_ratelimited macros - Use no_printk for !CONFIG_PRINTK printk_ratelimited. - Whitespace cleanup. Signed-off-by: Joe Perches Cc: Matt Mackall Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index e96bd41491bd..d30f579c6f97 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -234,7 +234,8 @@ extern void dump_stack(void) __cold; * no local ratelimit_state used in the !PRINTK case */ #ifdef CONFIG_PRINTK -#define printk_ratelimited(fmt, ...) ({ \ +#define printk_ratelimited(fmt, ...) 
\ +({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ @@ -243,36 +244,34 @@ extern void dump_stack(void) __cold; printk(fmt, ##__VA_ARGS__); \ }) #else -/* No effect, but we still get type checking even in the !PRINTK case: */ -#define printk_ratelimited printk +#define printk_ratelimited(fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) #endif -#define pr_emerg_ratelimited(fmt, ...) \ +#define pr_emerg_ratelimited(fmt, ...) \ printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert_ratelimited(fmt, ...) \ +#define pr_alert_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit_ratelimited(fmt, ...) \ +#define pr_crit_ratelimited(fmt, ...) \ printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err_ratelimited(fmt, ...) \ +#define pr_err_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warning_ratelimited(fmt, ...) \ +#define pr_warn_ratelimited(fmt, ...) \ printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn_ratelimited pr_warning_ratelimited -#define pr_notice_ratelimited(fmt, ...) \ +#define pr_notice_ratelimited(fmt, ...) \ printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info_ratelimited(fmt, ...) \ +#define pr_info_ratelimited(fmt, ...) \ printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_ratelimited, don't do that... */ /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) -#define pr_debug_ratelimited(fmt, ...) \ +#define pr_debug_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif - enum { DUMP_PREFIX_NONE, DUMP_PREFIX_ADDRESS, -- cgit v1.2.3-71-gd317 From a3f938bf6f5746d39e013d03ba13118a393fee96 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 16:59:48 -0800 Subject: include/linux/printk.h: use tab not spaces for indent Signed-off-by: Joe Perches Cc: Matt Mackall Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index d30f579c6f97..ee048e77e1ae 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -146,20 +146,20 @@ extern void dump_stack(void) __cold; #endif #define pr_emerg(fmt, ...) \ - printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert(fmt, ...) \ - printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit(fmt, ...) \ - printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err(fmt, ...) \ - printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warning(fmt, ...) \ - printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn pr_warning #define pr_notice(fmt, ...) \ - printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ - printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) #define pr_cont(fmt, ...) 
\ printk(KERN_CONT fmt, ##__VA_ARGS__) -- cgit v1.2.3-71-gd317 From 52bd19f7691b2ea6433aef0ef94c08c57efd7e79 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Wed, 12 Jan 2011 17:00:01 -0800 Subject: epoll: convert max_user_watches to long On a 16TB machine, max_user_watches has an integer overflow. Convert it to use a long and handle the associated fallout. Signed-off-by: Robin Holt Cc: "Eric W. Biederman" Acked-by: Davide Libenzi Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 20 ++++++++++++-------- include/linux/sched.h | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8cf07242067d..cc8a9b7d6064 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -217,7 +217,7 @@ struct ep_send_events_data { * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ -static int max_user_watches __read_mostly; +static long max_user_watches __read_mostly; /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). @@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly; #include -static int zero; +static long zero; +static long long_max = LONG_MAX; ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, - .maxlen = sizeof(int), + .maxlen = sizeof(max_user_watches), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, + .extra2 = &long_max, }, { } }; @@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); - atomic_dec(&ep->user->epoll_watches); + atomic_long_dec(&ep->user->epoll_watches); return 0; } @@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, { int error, revents, pwake = 0; unsigned long flags; + long user_watches; struct epitem *epi; struct ep_pqueue epq; - if (unlikely(atomic_read(&ep->user->epoll_watches) >= - max_user_watches)) + user_watches = atomic_long_read(&ep->user->epoll_watches); + if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; @@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, spin_unlock_irqrestore(&ep->lock, flags); - atomic_inc(&ep->user->epoll_watches); + atomic_long_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) @@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void) */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; + BUG_ON(max_user_watches < 0); /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); diff --git a/include/linux/sched.h b/include/linux/sched.h index abc527aa8550..96e23215e276 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -683,7 +683,7 @@ struct user_struct { atomic_t fanotify_listeners; #endif #ifdef CONFIG_EPOLL - atomic_t epoll_watches; /* The number of file descriptors currently watched */ + atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ #endif #ifdef CONFIG_POSIX_MQUEUE /* protected by mq_lock */ -- cgit v1.2.3-71-gd317 From f670d0ecda73b7438eec9ed108680bc5f5362ad8 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Wed, 12 Jan 2011 17:00:02 -0800 
Subject: binfmt_elf: cleanups This cleans up a few bits in binfmt_elf.c and binfmts.h: - the hasvdso field in struct linux_binfmt is unused, so remove it and the only initialization of it - the elf_map CPP symbol is not defined anywhere in the kernel, so remove an unnecessary #ifndef elf_map - reduce excessive indentation in elf_format's initializer - add missing spaces, remove extraneous spaces No functional changes, but tested on x86 (32 and 64 bit), powerpc (32 and 64 bit), sparc64, arm, and alpha. Signed-off-by: Mikael Pettersson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 23 +++++++++-------------- include/linux/binfmts.h | 5 ++--- 2 files changed, 11 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6884e198e0c7..d5b640ba6cb1 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) static struct linux_binfmt elf_format = { - .module = THIS_MODULE, - .load_binary = load_elf_binary, - .load_shlib = load_elf_library, - .core_dump = elf_core_dump, - .min_coredump = ELF_EXEC_PAGESIZE, - .hasvdso = 1 + .module = THIS_MODULE, + .load_binary = load_elf_binary, + .load_shlib = load_elf_library, + .core_dump = elf_core_dump, + .min_coredump = ELF_EXEC_PAGESIZE, }; #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) @@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return 0; } -#ifndef elf_map - static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -#endif /* !elf_map */ - static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, goto out; retval = kernel_read(interpreter, interp_elf_ex->e_phoff, - (char *)elf_phdata,size); + (char *)elf_phdata, size); error = -EIO; if (retval != size) { if (retval < 0) @@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out; if (!elf_check_arch(&loc->elf_ex)) goto out; - if (!bprm->file->f_op||!bprm->file->f_op->mmap) + if (!bprm->file->f_op || !bprm->file->f_op->mmap) goto out; /* Now read in all of the header information */ @@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. */ - retval = set_brk (elf_bss + load_bias, - elf_brk + load_bias); + retval = set_brk(elf_bss + load_bias, + elf_brk + load_bias); if (retval) { send_sig(SIGKILL, current, 0); goto out_free_dentry; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 64a7114a9394..c3d6512eded1 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,7 +25,7 @@ struct pt_regs; /* * This structure is used to hold the arguments that are used when loading binaries. 
*/ -struct linux_binprm{ +struct linux_binprm { char buf[BINPRM_BUF_SIZE]; #ifdef CONFIG_MMU struct vm_area_struct *vma; @@ -93,7 +93,6 @@ struct linux_binfmt { int (*load_shlib)(struct file *); int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ - int hasvdso; }; extern int __register_binfmt(struct linux_binfmt *fmt, int insert); @@ -113,7 +112,7 @@ extern void unregister_binfmt(struct linux_binfmt *); extern int prepare_binprm(struct linux_binprm *); extern int __must_check remove_arg_zero(struct linux_binprm *); -extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); +extern int search_binary_handler(struct linux_binprm *, struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); -- cgit v1.2.3-71-gd317 From 1b912c1bca5c162e611384fe7d39c916e081701a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 12 Jan 2011 17:00:10 -0800 Subject: drivers/gpio/cs5535-gpio.c: add some additional cs5535-specific GPIO functionality This adds (well, re-adds actually) handling for events/IRQs through cs5535 GPIOs. In the wild and wooly world of CS5535, setup_event() is for assigning an IRQ to a GPIO filter/event pair, and set_irq() sets up the pair to trigger IRQs. These should really only be used in highly platform-specific drivers (such as OLPC's DCON driver). Sadly, because set_irq() uses MSRs, this causes the driver to become X86-specific. Signed-off-by: Andres Salomon Signed-off-by: Daniel Drake Cc: Grant Likely Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpio/Kconfig | 2 +- drivers/gpio/cs5535-gpio.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/cs5535.h | 2 ++ 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 082495bb08a7..bfa276a95411 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -295,7 +295,7 @@ comment "PCI GPIO expanders:" config GPIO_CS5535 tristate "AMD CS5535/CS5536 GPIO support" - depends on PCI && !CS5535_GPIO + depends on PCI && X86 && !CS5535_GPIO help The AMD CS5535 and CS5536 southbridges support 28 GPIO pins that can be used for quite a number of things. 
The CS5535/6 is found on diff --git a/drivers/gpio/cs5535-gpio.c b/drivers/gpio/cs5535-gpio.c index d3e55a0ae92b..815d98b2c1ba 100644 --- a/drivers/gpio/cs5535-gpio.c +++ b/drivers/gpio/cs5535-gpio.c @@ -15,6 +15,7 @@ #include #include #include +#include #define DRV_NAME "cs5535-gpio" #define GPIO_BAR 1 @@ -144,6 +145,57 @@ int cs5535_gpio_isset(unsigned offset, unsigned int reg) } EXPORT_SYMBOL_GPL(cs5535_gpio_isset); +int cs5535_gpio_set_irq(unsigned group, unsigned irq) +{ + uint32_t lo, hi; + + if (group > 7 || irq > 15) + return -EINVAL; + + rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); + + lo &= ~(0xF << (group * 4)); + lo |= (irq & 0xF) << (group * 4); + + wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); + return 0; +} +EXPORT_SYMBOL_GPL(cs5535_gpio_set_irq); + +void cs5535_gpio_setup_event(unsigned offset, int pair, int pme) +{ + struct cs5535_gpio_chip *chip = &cs5535_gpio_chip; + uint32_t shift = (offset % 8) * 4; + unsigned long flags; + uint32_t val; + + if (offset >= 24) + offset = GPIO_MAP_W; + else if (offset >= 16) + offset = GPIO_MAP_Z; + else if (offset >= 8) + offset = GPIO_MAP_Y; + else + offset = GPIO_MAP_X; + + spin_lock_irqsave(&chip->lock, flags); + val = inl(chip->base + offset); + + /* Clear whatever was there before */ + val &= ~(0xF << shift); + + /* Set the new value */ + val |= ((pair & 7) << shift); + + /* Set the PME bit if this is a PME event */ + if (pme) + val |= (1 << (shift + 3)); + + outl(val, chip->base + offset); + spin_unlock_irqrestore(&chip->lock, flags); +} +EXPORT_SYMBOL_GPL(cs5535_gpio_setup_event); + /* * Generic gpio_chip API support. */ diff --git a/include/linux/cs5535.h b/include/linux/cs5535.h index d5a1d4810b80..213cc50b5809 100644 --- a/include/linux/cs5535.h +++ b/include/linux/cs5535.h @@ -111,6 +111,8 @@ static inline int cs5535_has_vsa2(void) void cs5535_gpio_set(unsigned offset, unsigned int reg); void cs5535_gpio_clear(unsigned offset, unsigned int reg); int cs5535_gpio_isset(unsigned offset, unsigned int reg); +int cs5535_gpio_set_irq(unsigned group, unsigned irq); +void cs5535_gpio_setup_event(unsigned offset, int pair, int pme); /* MFGPTs */ -- cgit v1.2.3-71-gd317 From 7637c9259f7b6dd841471ccf1120d484b7364f99 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 12 Jan 2011 17:00:11 -0800 Subject: drivers/staging/olpc_dcon: convert to new cs5535 gpio API Drop the old geode_gpio crud, as well as the raw outl() calls; instead, use the Linux GPIO API where possible, and the cs5535_gpio API in other places. Note that we don't actually clean up the driver properly yet (once loaded, it always remains loaded). That'll come later.. This patch is necessary for building the driver. 
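For illustration only, here is a minimal sketch of how a platform driver might combine gpiolib with the cs5535 event helpers added above. None of this is taken from the OLPC code: the pin, filter/event pair, PIC group and IRQ numbers are made-up placeholders, and the example assumes only the declarations visible in the patches (gpio_request/gpio_direction_input, cs5535_gpio_set_irq, cs5535_gpio_setup_event, cs5535_gpio_set and the GPIO_* register names).

#include <linux/gpio.h>
#include <linux/interrupt.h>
#include <linux/cs5535.h>

#define EXAMPLE_GPIO	7	/* hypothetical board-specific input pin */
#define EXAMPLE_PAIR	2	/* filter/event pair fed by that pin */
#define EXAMPLE_GROUP	2	/* PIC group routed to the IRQ below */
#define EXAMPLE_IRQ	9	/* hypothetical free interrupt line */

static irqreturn_t example_isr(int irq, void *dev_id)
{
	/* clear the latched negative-edge status, mirroring what
	 * dcon_read_status_xo_1() does for its IRQ pin */
	cs5535_gpio_set(EXAMPLE_GPIO, GPIO_NEGATIVE_EDGE_STS);
	return IRQ_HANDLED;
}

static int example_setup(void)
{
	int err;

	err = gpio_request(EXAMPLE_GPIO, "example");
	if (err)
		return err;
	err = gpio_direction_input(EXAMPLE_GPIO);
	if (err)
		goto err_free;

	/* route the filter/event pair group to an IRQ at the PIC ... */
	err = cs5535_gpio_set_irq(EXAMPLE_GROUP, EXAMPLE_IRQ);
	if (err)
		goto err_free;
	/* ... then bind the pin to the pair (0: not a PME event) and
	 * enable event generation for it */
	cs5535_gpio_setup_event(EXAMPLE_GPIO, EXAMPLE_PAIR, 0);
	cs5535_gpio_set(EXAMPLE_GPIO, GPIO_EVENTS_ENABLE);

	err = request_irq(EXAMPLE_IRQ, example_isr, 0, "example", NULL);
	if (err)
		goto err_free;
	return 0;

err_free:
	gpio_free(EXAMPLE_GPIO);
	return err;
}

The DCON conversion below follows the same split: plain pin ownership and direction go through gpiolib, while the event/filter plumbing that gpiolib cannot express stays with the cs5535_gpio_* helpers.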
Signed-off-by: Andres Salomon Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/olpc.h | 10 +- drivers/staging/olpc_dcon/TODO | 1 - drivers/staging/olpc_dcon/olpc_dcon.c | 3 +- drivers/staging/olpc_dcon/olpc_dcon.h | 20 ---- drivers/staging/olpc_dcon/olpc_dcon_xo_1.c | 168 ++++++++++++++++----------- drivers/staging/olpc_dcon/olpc_dcon_xo_1_5.c | 4 +- include/linux/cs5535.h | 4 + 7 files changed, 114 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 42a978c0c1b3..f482010350fb 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -107,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits); /* GPIO assignments */ #define OLPC_GPIO_MIC_AC 1 -#define OLPC_GPIO_DCON_IRQ geode_gpio(7) +#define OLPC_GPIO_DCON_STAT0 5 +#define OLPC_GPIO_DCON_STAT1 6 +#define OLPC_GPIO_DCON_IRQ 7 #define OLPC_GPIO_THRM_ALRM geode_gpio(10) -#define OLPC_GPIO_SMB_CLK geode_gpio(14) -#define OLPC_GPIO_SMB_DATA geode_gpio(15) +#define OLPC_GPIO_DCON_LOAD 11 +#define OLPC_GPIO_DCON_BLANK 12 +#define OLPC_GPIO_SMB_CLK 14 +#define OLPC_GPIO_SMB_DATA 15 #define OLPC_GPIO_WORKAUX geode_gpio(24) #define OLPC_GPIO_LID geode_gpio(26) #define OLPC_GPIO_ECSCI geode_gpio(27) diff --git a/drivers/staging/olpc_dcon/TODO b/drivers/staging/olpc_dcon/TODO index ac2d3d023715..35f9cda7be11 100644 --- a/drivers/staging/olpc_dcon/TODO +++ b/drivers/staging/olpc_dcon/TODO @@ -1,6 +1,5 @@ TODO: - checkpatch.pl cleanups - - port geode gpio calls to newer cs5535 API - see if vx855 gpio API can be made similar enough to cs5535 so we can share more code - allow simultaneous XO-1 and XO-1.5 support diff --git a/drivers/staging/olpc_dcon/olpc_dcon.c b/drivers/staging/olpc_dcon/olpc_dcon.c index 4ca45ec7fd84..da040c12fbf9 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon.c +++ b/drivers/staging/olpc_dcon/olpc_dcon.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -49,7 +48,7 @@ struct dcon_platform_data { int (*init)(void); void (*bus_stabilize_wiggle)(void); void (*set_dconload)(int); - int (*read_status)(void); + u8 (*read_status)(void); }; static struct dcon_platform_data *pdata; diff --git a/drivers/staging/olpc_dcon/olpc_dcon.h b/drivers/staging/olpc_dcon/olpc_dcon.h index 6453ca4ba0ee..e566d213da2a 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon.h +++ b/drivers/staging/olpc_dcon/olpc_dcon.h @@ -29,26 +29,6 @@ #define DCON_REG_SCAN_INT 9 #define DCON_REG_BRIGHT 10 -/* GPIO registers (CS5536) */ - -#define MSR_LBAR_GPIO 0x5140000C - -#define GPIOx_OUT_VAL 0x00 -#define GPIOx_OUT_EN 0x04 -#define GPIOx_IN_EN 0x20 -#define GPIOx_INV_EN 0x24 -#define GPIOx_IN_FLTR_EN 0x28 -#define GPIOx_EVNTCNT_EN 0x2C -#define GPIOx_READ_BACK 0x30 -#define GPIOx_EVNT_EN 0x38 -#define GPIOx_NEGEDGE_EN 0x44 -#define GPIOx_NEGEDGE_STS 0x4C -#define GPIO_FLT7_AMNT 0xD8 -#define GPIO_MAP_X 0xE0 -#define GPIO_MAP_Y 0xE4 -#define GPIO_FE7_SEL 0xF7 - - /* Status values */ #define DCONSTAT_SCANINT 0 diff --git a/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c b/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c index 779fb7d7b30c..043198dc6ff7 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c +++ b/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c @@ -10,54 +10,70 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. 
*/ - +#include +#include #include #include "olpc_dcon.h" -/* Base address of the GPIO registers */ -static unsigned long gpio_base; - -/* - * List of GPIOs that we care about: - * (in) GPIO12 -- DCONBLANK - * (in) GPIO[56] -- DCONSTAT[01] - * (out) GPIO11 -- DCONLOAD - */ - -#define IN_GPIOS ((1<<5) | (1<<6) | (1<<7) | (1<<12)) -#define OUT_GPIOS (1<<11) - static int dcon_init_xo_1(void) { - unsigned long lo, hi; unsigned char lob; - rdmsr(MSR_LBAR_GPIO, lo, hi); - - /* Check the mask and whether GPIO is enabled (sanity check) */ - if (hi != 0x0000f001) { - printk(KERN_ERR "GPIO not enabled -- cannot use DCON\n"); - return -ENODEV; + if (gpio_request(OLPC_GPIO_DCON_STAT0, "OLPC-DCON")) { + printk(KERN_ERR "olpc-dcon: failed to request STAT0 GPIO\n"); + return -EIO; + } + if (gpio_request(OLPC_GPIO_DCON_STAT1, "OLPC-DCON")) { + printk(KERN_ERR "olpc-dcon: failed to request STAT1 GPIO\n"); + goto err_gp_stat1; + } + if (gpio_request(OLPC_GPIO_DCON_IRQ, "OLPC-DCON")) { + printk(KERN_ERR "olpc-dcon: failed to request IRQ GPIO\n"); + goto err_gp_irq; + } + if (gpio_request(OLPC_GPIO_DCON_LOAD, "OLPC-DCON")) { + printk(KERN_ERR "olpc-dcon: failed to request LOAD GPIO\n"); + goto err_gp_load; + } + if (gpio_request(OLPC_GPIO_DCON_BLANK, "OLPC-DCON")) { + printk(KERN_ERR "olpc-dcon: failed to request BLANK GPIO\n"); + goto err_gp_blank; } - - /* Mask off the IO base address */ - gpio_base = lo & 0x0000ff00; /* Turn off the event enable for GPIO7 just to be safe */ - outl(1 << (16+7), gpio_base + GPIOx_EVNT_EN); + cs5535_gpio_clear(OLPC_GPIO_DCON_IRQ, GPIO_EVENTS_ENABLE); + + /* + * Determine the current state by reading the GPIO bit; earlier + * stages of the boot process have established the state. + * + * Note that we read GPIO_OUPUT_VAL rather than GPIO_READ_BACK here; + * this is because OFW will disable input for the pin and set a value.. + * READ_BACK will only contain a valid value if input is enabled and + * then a value is set. So, future readings of the pin can use + * READ_BACK, but the first one cannot. Awesome, huh? + */ + dcon_source = cs5535_gpio_isset(OLPC_GPIO_DCON_LOAD, GPIO_OUTPUT_VAL) + ? 
DCON_SOURCE_CPU + : DCON_SOURCE_DCON; + dcon_pending = dcon_source; /* Set the directions for the GPIO pins */ - outl(OUT_GPIOS | (IN_GPIOS << 16), gpio_base + GPIOx_OUT_EN); - outl(IN_GPIOS | (OUT_GPIOS << 16), gpio_base + GPIOx_IN_EN); + gpio_direction_input(OLPC_GPIO_DCON_STAT0); + gpio_direction_input(OLPC_GPIO_DCON_STAT1); + gpio_direction_input(OLPC_GPIO_DCON_IRQ); + gpio_direction_input(OLPC_GPIO_DCON_BLANK); + gpio_direction_output(OLPC_GPIO_DCON_LOAD, + dcon_source == DCON_SOURCE_CPU); /* Set up the interrupt mappings */ /* Set the IRQ to pair 2 */ - geode_gpio_event_irq(OLPC_GPIO_DCON_IRQ, 2); + cs5535_gpio_setup_event(OLPC_GPIO_DCON_IRQ, 2, 0); /* Enable group 2 to trigger the DCON interrupt */ - geode_gpio_set_irq(2, DCON_IRQ); + cs5535_gpio_set_irq(2, DCON_IRQ); /* Select edge level for interrupt (in PIC) */ lob = inb(0x4d0); @@ -65,52 +81,61 @@ static int dcon_init_xo_1(void) outb(lob, 0x4d0); /* Register the interupt handler */ - if (request_irq(DCON_IRQ, &dcon_interrupt, 0, "DCON", &dcon_driver)) - return -EIO; + if (request_irq(DCON_IRQ, &dcon_interrupt, 0, "DCON", &dcon_driver)) { + printk(KERN_ERR "olpc-dcon: failed to request DCON's irq\n"); + goto err_req_irq; + } /* Clear INV_EN for GPIO7 (DCONIRQ) */ - outl((1<<(16+7)), gpio_base + GPIOx_INV_EN); + cs5535_gpio_clear(OLPC_GPIO_DCON_IRQ, GPIO_INPUT_INVERT); /* Enable filter for GPIO12 (DCONBLANK) */ - outl(1<<(12), gpio_base + GPIOx_IN_FLTR_EN); + cs5535_gpio_set(OLPC_GPIO_DCON_BLANK, GPIO_INPUT_FILTER); /* Disable filter for GPIO7 */ - outl(1<<(16+7), gpio_base + GPIOx_IN_FLTR_EN); + cs5535_gpio_clear(OLPC_GPIO_DCON_IRQ, GPIO_INPUT_FILTER); /* Disable event counter for GPIO7 (DCONIRQ) and GPIO12 (DCONBLANK) */ - - outl(1<<(16+7), gpio_base + GPIOx_EVNTCNT_EN); - outl(1<<(16+12), gpio_base + GPIOx_EVNTCNT_EN); + cs5535_gpio_clear(OLPC_GPIO_DCON_IRQ, GPIO_INPUT_EVENT_COUNT); + cs5535_gpio_clear(OLPC_GPIO_DCON_BLANK, GPIO_INPUT_EVENT_COUNT); /* Add GPIO12 to the Filter Event Pair #7 */ - outb(12, gpio_base + GPIO_FE7_SEL); + cs5535_gpio_set(OLPC_GPIO_DCON_BLANK, GPIO_FE7_SEL); /* Turn off negative Edge Enable for GPIO12 */ - outl(1<<(16+12), gpio_base + GPIOx_NEGEDGE_EN); + cs5535_gpio_clear(OLPC_GPIO_DCON_BLANK, GPIO_NEGATIVE_EDGE_EN); /* Enable negative Edge Enable for GPIO7 */ - outl(1<<7, gpio_base + GPIOx_NEGEDGE_EN); + cs5535_gpio_set(OLPC_GPIO_DCON_IRQ, GPIO_NEGATIVE_EDGE_EN); /* Zero the filter amount for Filter Event Pair #7 */ - outw(0, gpio_base + GPIO_FLT7_AMNT); + cs5535_gpio_set(0, GPIO_FLTR7_AMOUNT); /* Clear the negative edge status for GPIO7 and GPIO12 */ - outl((1<<7) | (1<<12), gpio_base+0x4c); + cs5535_gpio_set(OLPC_GPIO_DCON_IRQ, GPIO_NEGATIVE_EDGE_STS); + cs5535_gpio_set(OLPC_GPIO_DCON_BLANK, GPIO_NEGATIVE_EDGE_STS); /* FIXME: Clear the posiitive status as well, just to be sure */ - outl((1<<7) | (1<<12), gpio_base+0x48); + cs5535_gpio_set(OLPC_GPIO_DCON_IRQ, GPIO_POSITIVE_EDGE_STS); + cs5535_gpio_set(OLPC_GPIO_DCON_BLANK, GPIO_POSITIVE_EDGE_STS); /* Enable events for GPIO7 (DCONIRQ) and GPIO12 (DCONBLANK) */ - outl((1<<(7))|(1<<12), gpio_base + GPIOx_EVNT_EN); - - /* Determine the current state by reading the GPIO bit */ - /* Earlier stages of the boot process have established the state */ - dcon_source = inl(gpio_base + GPIOx_OUT_VAL) & (1<<11) - ? 
DCON_SOURCE_CPU - : DCON_SOURCE_DCON; - dcon_pending = dcon_source; + cs5535_gpio_set(OLPC_GPIO_DCON_IRQ, GPIO_EVENTS_ENABLE); + cs5535_gpio_set(OLPC_GPIO_DCON_BLANK, GPIO_EVENTS_ENABLE); return 0; + +err_req_irq: + gpio_free(OLPC_GPIO_DCON_BLANK); +err_gp_blank: + gpio_free(OLPC_GPIO_DCON_LOAD); +err_gp_load: + gpio_free(OLPC_GPIO_DCON_IRQ); +err_gp_irq: + gpio_free(OLPC_GPIO_DCON_STAT1); +err_gp_stat1: + gpio_free(OLPC_GPIO_DCON_STAT0); + return -EIO; } static void dcon_wiggle_xo_1(void) @@ -128,37 +153,44 @@ static void dcon_wiggle_xo_1(void) * simultaneously set AUX1 IN/OUT to GPIO14; ditto for SMB_DATA and * GPIO15. */ - geode_gpio_set(OLPC_GPIO_SMB_CLK|OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_VAL); - geode_gpio_set(OLPC_GPIO_SMB_CLK|OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_ENABLE); - geode_gpio_clear(OLPC_GPIO_SMB_CLK|OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_AUX1); - geode_gpio_clear(OLPC_GPIO_SMB_CLK|OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_AUX2); - geode_gpio_clear(OLPC_GPIO_SMB_CLK|OLPC_GPIO_SMB_DATA, GPIO_INPUT_AUX1); + cs5535_gpio_set(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_VAL); + cs5535_gpio_set(OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_VAL); + cs5535_gpio_set(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_ENABLE); + cs5535_gpio_set(OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_ENABLE); + cs5535_gpio_clear(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_AUX1); + cs5535_gpio_clear(OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_AUX1); + cs5535_gpio_clear(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_AUX2); + cs5535_gpio_clear(OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_AUX2); + cs5535_gpio_clear(OLPC_GPIO_SMB_CLK, GPIO_INPUT_AUX1); + cs5535_gpio_clear(OLPC_GPIO_SMB_DATA, GPIO_INPUT_AUX1); for (x = 0; x < 16; x++) { udelay(5); - geode_gpio_clear(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_VAL); + cs5535_gpio_clear(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_VAL); udelay(5); - geode_gpio_set(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_VAL); + cs5535_gpio_set(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_VAL); } udelay(5); - geode_gpio_set(OLPC_GPIO_SMB_CLK|OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_AUX1); - geode_gpio_set(OLPC_GPIO_SMB_CLK|OLPC_GPIO_SMB_DATA, GPIO_INPUT_AUX1); + cs5535_gpio_set(OLPC_GPIO_SMB_CLK, GPIO_OUTPUT_AUX1); + cs5535_gpio_set(OLPC_GPIO_SMB_DATA, GPIO_OUTPUT_AUX1); + cs5535_gpio_set(OLPC_GPIO_SMB_CLK, GPIO_INPUT_AUX1); + cs5535_gpio_set(OLPC_GPIO_SMB_DATA, GPIO_INPUT_AUX1); } static void dcon_set_dconload_1(int val) { - if (val) - outl(1<<11, gpio_base + GPIOx_OUT_VAL); - else - outl(1<<(11 + 16), gpio_base + GPIOx_OUT_VAL); + gpio_set_value(OLPC_GPIO_DCON_LOAD, val); } -static int dcon_read_status_xo_1(void) +static u8 dcon_read_status_xo_1(void) { - int status = inl(gpio_base + GPIOx_READ_BACK) >> 5; - + u8 status; + + status = gpio_get_value(OLPC_GPIO_DCON_STAT0); + status |= gpio_get_value(OLPC_GPIO_DCON_STAT1) << 1; + /* Clear the negative edge status for GPIO7 */ - outl(1 << 7, gpio_base + GPIOx_NEGEDGE_STS); + cs5535_gpio_set(OLPC_GPIO_DCON_IRQ, GPIO_NEGATIVE_EDGE_STS); return status; } diff --git a/drivers/staging/olpc_dcon/olpc_dcon_xo_1_5.c b/drivers/staging/olpc_dcon/olpc_dcon_xo_1_5.c index cca6a235ef96..4f56098bb366 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon_xo_1_5.c +++ b/drivers/staging/olpc_dcon/olpc_dcon_xo_1_5.c @@ -195,9 +195,9 @@ static void dcon_set_dconload_xo_1_5(int val) } } -static int dcon_read_status_xo_1_5(void) +static u8 dcon_read_status_xo_1_5(void) { - int status; + u8 status; if (!dcon_was_irq()) return -1; diff --git a/include/linux/cs5535.h b/include/linux/cs5535.h index 213cc50b5809..6fe2114f8ad2 100644 --- a/include/linux/cs5535.h +++ b/include/linux/cs5535.h @@ -103,11 +103,15 @@ static inline int cs5535_has_vsa2(void) 
#define GPIO_POSITIVE_EDGE_STS 0x48 #define GPIO_NEGATIVE_EDGE_STS 0x4C +#define GPIO_FLTR7_AMOUNT 0xD8 + #define GPIO_MAP_X 0xE0 #define GPIO_MAP_Y 0xE4 #define GPIO_MAP_Z 0xE8 #define GPIO_MAP_W 0xEC +#define GPIO_FE7_SEL 0xF7 + void cs5535_gpio_set(unsigned offset, unsigned int reg); void cs5535_gpio_clear(unsigned offset, unsigned int reg); int cs5535_gpio_isset(unsigned offset, unsigned int reg); -- cgit v1.2.3-71-gd317 From 0fdae42d361bbb431ca0ab0efed5126a94821177 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 12 Jan 2011 17:00:23 -0800 Subject: gpiolib: annotate gpio-intialization with __must_check Because GPIOs can have crucial functions especially in embedded systems, we are better safe than sorry regarding their configuration. For gpio_request, the documentation is simply enforced: "The return value of gpio_request() must be checked." For gpio_direction_* and gpio_request_*, we now act accordingly. Signed-off-by: Wolfram Sang Cc: David Brownell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/gpio.txt | 2 +- include/asm-generic/gpio.h | 10 +++++----- include/linux/gpio.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index 792faa3c06cf..a492d92bb098 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction: int gpio_direction_input(unsigned gpio); int gpio_direction_output(unsigned gpio, int value); -The return value is zero for success, else a negative errno. It should +The return value is zero for success, else a negative errno. It must be checked, since the get/set calls don't have error returns and since misconfiguration is possible. You should normally issue these calls from a task context. However, for spinlock-safe GPIOs it's OK to use them diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index ff5c66080c8c..6098cae2af8e 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data, /* Always use the library code for GPIO management calls, * or when sleeping may be involved. 
*/ -extern int gpio_request(unsigned gpio, const char *label); +extern int __must_check gpio_request(unsigned gpio, const char *label); extern void gpio_free(unsigned gpio); -extern int gpio_direction_input(unsigned gpio); -extern int gpio_direction_output(unsigned gpio, int value); +extern int __must_check gpio_direction_input(unsigned gpio); +extern int __must_check gpio_direction_output(unsigned gpio, int value); extern int gpio_set_debounce(unsigned gpio, unsigned debounce); @@ -192,8 +192,8 @@ struct gpio { const char *label; }; -extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -extern int gpio_request_array(struct gpio *array, size_t num); +extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); +extern int __must_check gpio_request_array(struct gpio *array, size_t num); extern void gpio_free_array(struct gpio *array, size_t num); #ifdef CONFIG_GPIO_SYSFS diff --git a/include/linux/gpio.h b/include/linux/gpio.h index e41f7dd1ae67..1d5214a89110 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -29,7 +29,7 @@ static inline int gpio_is_valid(int number) return 0; } -static inline int gpio_request(unsigned gpio, const char *label) +static inline int __must_check gpio_request(unsigned gpio, const char *label) { return -ENOSYS; } @@ -42,12 +42,12 @@ static inline void gpio_free(unsigned gpio) WARN_ON(1); } -static inline int gpio_direction_input(unsigned gpio) +static inline int __must_check gpio_direction_input(unsigned gpio) { return -ENOSYS; } -static inline int gpio_direction_output(unsigned gpio, int value) +static inline int __must_check gpio_direction_output(unsigned gpio, int value) { return -ENOSYS; } -- cgit v1.2.3-71-gd317 From 5f829e405ec4e96f711165a4a7b55c271d4363e2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 12 Jan 2011 17:00:24 -0800 Subject: gpiolib: add missing functions to generic fallback Signed-off-by: Wolfram Sang Cc: David Brownell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gpio.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 1d5214a89110..f79d67f413e4 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -13,6 +13,7 @@ #include struct device; +struct gpio; struct gpio_chip; /* @@ -34,6 +35,17 @@ static inline int __must_check gpio_request(unsigned gpio, const char *label) return -ENOSYS; } +static inline int __must_check gpio_request_one(unsigned gpio, + unsigned long flags, const char *label) +{ + return -ENOSYS; +} + +static inline int __must_check gpio_request_array(struct gpio *array, size_t num) +{ + return -ENOSYS; +} + static inline void gpio_free(unsigned gpio) { might_sleep(); @@ -42,6 +54,14 @@ static inline void gpio_free(unsigned gpio) WARN_ON(1); } +static inline void gpio_free_array(struct gpio *array, size_t num) +{ + might_sleep(); + + /* GPIO can never have been requested */ + WARN_ON(1); +} + static inline int __must_check gpio_direction_input(unsigned gpio) { return -ENOSYS; -- cgit v1.2.3-71-gd317 From e6d7202b66d99bf514c8e901db68386b1fcd6d56 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 12 Jan 2011 17:00:37 -0800 Subject: fs/char_dev.c: remove unused cdev_index() Commit 66fa12c571d3 ("ieee1394: remove the old IEEE 1394 driver stack") eliminated the only user of cdev_index(). So it can be removed too. 
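Stepping back to the gpiolib __must_check annotations above, the following is a minimal sketch, not taken from any of these patches, of the calling pattern they are meant to enforce; the GPIO number and label are hypothetical and error handling is reduced to the essentials.

#include <linux/gpio.h>

static int example_claim_output(unsigned gpio)
{
	int err;

	err = gpio_request(gpio, "example-output");	/* return value must be checked */
	if (err)
		return err;

	err = gpio_direction_output(gpio, 1);		/* now __must_check as well */
	if (err) {
		gpio_free(gpio);
		return err;
	}

	gpio_set_value(gpio, 0);	/* the get/set calls have no error return */
	return 0;
}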
Signed-off-by: Namhyung Kim Cc: Stefan Richter Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/char_dev.c | 13 ------------- include/linux/cdev.h | 2 -- 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/fs/char_dev.c b/fs/char_dev.c index e5b9df993b93..6e99b9ddd4e9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp) return ret; } -int cdev_index(struct inode *inode) -{ - int idx; - struct kobject *kobj; - - kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); - if (!kobj) - return -1; - kobject_put(kobj); - return idx; -} - void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); @@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); -EXPORT_SYMBOL(cdev_index); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/include/linux/cdev.h b/include/linux/cdev.h index f389e319a454..fb4591977b03 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -28,8 +28,6 @@ int cdev_add(struct cdev *, dev_t, unsigned); void cdev_del(struct cdev *); -int cdev_index(struct inode *inode); - void cd_forget(struct inode *); extern struct backing_dev_info directly_mappable_cdev_bdi; -- cgit v1.2.3-71-gd317 From a93192a5d245a262dc52fa426de5b20467308a77 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 12 Jan 2011 17:00:38 -0800 Subject: rapidio: use common destid storage for endpoints and switches Change code to use one storage location common for switches and endpoints. This eliminates unnecessary device type checks during basic access operations. Logic that assigns destid to RIO devices stays unchanged - as before, switches use an associated destid because they do not have their own. 
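To make the consolidation concrete, here is a hedged before/after sketch. The two wrapper functions are invented for illustration; the fields and accessors are the ones visible in the diff that follows, with the "before" form matching the code being removed and the "after" form the code being added.

#include <linux/rio.h>
#include <linux/rio_drv.h>

/* Before: callers picked destid/hopcount based on the device type. */
static int old_style_read(struct rio_dev *rdev, u32 *val)
{
	u16 destid = rdev->destid;
	u8 hopcount = 0xff;

	if (rdev->rswitch) {	/* switches kept their own copy */
		destid = rdev->rswitch->destid;
		hopcount = rdev->rswitch->hopcount;
	}
	return rio_mport_read_config_32(rdev->net->hport, destid, hopcount,
					rdev->phys_efptr, val);
}

/* After: rdev->destid and rdev->hopcount are valid for endpoints and
 * switches alike, so the same read needs no device-type check. */
static int new_style_read(struct rio_dev *rdev, u32 *val)
{
	return rio_read_config_32(rdev, rdev->phys_efptr, val);
}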
Signed-off-by: Alexandre Bounine Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: Thomas Moll Cc: Micha Nelissen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 84 +++++++++++++++++---------------- drivers/rapidio/rio.c | 74 +++++++++-------------------- drivers/rapidio/switches/idt_gen2.c | 93 +++++++++++++------------------------ drivers/rapidio/switches/idtcps.c | 6 +-- drivers/rapidio/switches/tsi568.c | 13 ++---- drivers/rapidio/switches/tsi57x.c | 56 ++++++++++------------ include/linux/rio.h | 8 ++-- include/linux/rio_drv.h | 72 +++++----------------------- 8 files changed, 141 insertions(+), 265 deletions(-) (limited to 'include/linux') diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 1eb82c4c712e..51f0af241eb7 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -437,9 +437,15 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, next_destid++; } else rdev->destid = rio_get_device_id(port, destid, hopcount); - } else - /* Switch device has an associated destID */ - rdev->destid = RIO_INVALID_DESTID; + + rdev->hopcount = 0xff; + } else { + /* Switch device has an associated destID which + * will be adjusted later + */ + rdev->destid = destid; + rdev->hopcount = hopcount; + } /* If a PE has both switch and other functions, show it as a switch */ if (rio_is_switch(rdev)) { @@ -450,8 +456,6 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, if (!rswitch) goto cleanup; rswitch->switchid = next_switchid; - rswitch->hopcount = hopcount; - rswitch->destid = destid; rswitch->port_ok = 0; rswitch->route_table = kzalloc(sizeof(u8)* RIO_MAX_ROUTE_ENTRIES(port->sys_size), @@ -632,8 +636,7 @@ rio_unlock_device(struct rio_mport *port, u16 destid, u8 hopcount) /** * rio_route_add_entry- Add a route entry to a switch routing table - * @mport: Master port to send transaction - * @rswitch: Switch device + * @rdev: RIO device * @table: Routing table ID * @route_destid: Destination ID to be routed * @route_port: Port number to be routed @@ -647,31 +650,31 @@ rio_unlock_device(struct rio_mport *port, u16 destid, u8 hopcount) * on failure. */ static int -rio_route_add_entry(struct rio_mport *mport, struct rio_switch *rswitch, +rio_route_add_entry(struct rio_dev *rdev, u16 table, u16 route_destid, u8 route_port, int lock) { int rc; if (lock) { - rc = rio_lock_device(mport, rswitch->destid, - rswitch->hopcount, 1000); + rc = rio_lock_device(rdev->net->hport, rdev->destid, + rdev->hopcount, 1000); if (rc) return rc; } - rc = rswitch->add_entry(mport, rswitch->destid, - rswitch->hopcount, table, - route_destid, route_port); + rc = rdev->rswitch->add_entry(rdev->net->hport, rdev->destid, + rdev->hopcount, table, + route_destid, route_port); if (lock) - rio_unlock_device(mport, rswitch->destid, rswitch->hopcount); + rio_unlock_device(rdev->net->hport, rdev->destid, + rdev->hopcount); return rc; } /** * rio_route_get_entry- Read a route entry in a switch routing table - * @mport: Master port to send transaction - * @rswitch: Switch device + * @rdev: RIO device * @table: Routing table ID * @route_destid: Destination ID to be routed * @route_port: Pointer to read port number into @@ -685,23 +688,24 @@ rio_route_add_entry(struct rio_mport *mport, struct rio_switch *rswitch, * on failure. 
*/ static int -rio_route_get_entry(struct rio_mport *mport, struct rio_switch *rswitch, u16 table, +rio_route_get_entry(struct rio_dev *rdev, u16 table, u16 route_destid, u8 *route_port, int lock) { int rc; if (lock) { - rc = rio_lock_device(mport, rswitch->destid, - rswitch->hopcount, 1000); + rc = rio_lock_device(rdev->net->hport, rdev->destid, + rdev->hopcount, 1000); if (rc) return rc; } - rc = rswitch->get_entry(mport, rswitch->destid, - rswitch->hopcount, table, - route_destid, route_port); + rc = rdev->rswitch->get_entry(rdev->net->hport, rdev->destid, + rdev->hopcount, table, + route_destid, route_port); if (lock) - rio_unlock_device(mport, rswitch->destid, rswitch->hopcount); + rio_unlock_device(rdev->net->hport, rdev->destid, + rdev->hopcount); return rc; } @@ -811,14 +815,14 @@ static int __devinit rio_enum_peer(struct rio_net *net, struct rio_mport *port, if (rio_is_switch(rdev)) { next_switchid++; sw_inport = RIO_GET_PORT_NUM(rdev->swpinfo); - rio_route_add_entry(port, rdev->rswitch, RIO_GLOBAL_TABLE, + rio_route_add_entry(rdev, RIO_GLOBAL_TABLE, port->host_deviceid, sw_inport, 0); rdev->rswitch->route_table[port->host_deviceid] = sw_inport; for (destid = 0; destid < next_destid; destid++) { if (destid == port->host_deviceid) continue; - rio_route_add_entry(port, rdev->rswitch, RIO_GLOBAL_TABLE, + rio_route_add_entry(rdev, RIO_GLOBAL_TABLE, destid, sw_inport, 0); rdev->rswitch->route_table[destid] = sw_inport; } @@ -850,8 +854,7 @@ static int __devinit rio_enum_peer(struct rio_net *net, struct rio_mport *port, "RIO: scanning device on port %d\n", port_num); rdev->rswitch->port_ok |= (1 << port_num); - rio_route_add_entry(port, rdev->rswitch, - RIO_GLOBAL_TABLE, + rio_route_add_entry(rdev, RIO_GLOBAL_TABLE, RIO_ANY_DESTID(port->sys_size), port_num, 0); @@ -865,7 +868,7 @@ static int __devinit rio_enum_peer(struct rio_net *net, struct rio_mport *port, destid < next_destid; destid++) { if (destid == port->host_deviceid) continue; - rio_route_add_entry(port, rdev->rswitch, + rio_route_add_entry(rdev, RIO_GLOBAL_TABLE, destid, port_num, @@ -904,7 +907,7 @@ static int __devinit rio_enum_peer(struct rio_net *net, struct rio_mport *port, next_destid++; } - rdev->rswitch->destid = sw_destid; + rdev->destid = sw_destid; } else pr_debug("RIO: found %s (vid %4.4x did %4.4x)\n", rio_name(rdev), rdev->vid, rdev->did); @@ -958,7 +961,7 @@ rio_disc_peer(struct rio_net *net, struct rio_mport *port, u16 destid, next_switchid++; /* Associated destid is how we accessed this switch */ - rdev->rswitch->destid = destid; + rdev->destid = destid; pr_debug( "RIO: found %s (vid %4.4x did %4.4x) with %d ports\n", @@ -981,7 +984,7 @@ rio_disc_peer(struct rio_net *net, struct rio_mport *port, u16 destid, for (ndestid = 0; ndestid < RIO_ANY_DESTID(port->sys_size); ndestid++) { - rio_route_get_entry(port, rdev->rswitch, + rio_route_get_entry(rdev, RIO_GLOBAL_TABLE, ndestid, &route_port, 0); @@ -1076,7 +1079,7 @@ static void rio_update_route_tables(struct rio_mport *port) list_for_each_entry(rdev, &rio_devices, global_list) { - destid = (rio_is_switch(rdev))?rdev->rswitch->destid:rdev->destid; + destid = rdev->destid; list_for_each_entry(rswitch, &rio_switches, node) { @@ -1085,13 +1088,13 @@ static void rio_update_route_tables(struct rio_mport *port) if (RIO_INVALID_ROUTE == rswitch->route_table[destid]) { /* Skip if destid ends in empty switch*/ - if (rswitch->destid == destid) + if (rswitch->rdev->destid == destid) continue; sport = RIO_GET_PORT_NUM(rswitch->rdev->swpinfo); if (rswitch->add_entry) { - 
rio_route_add_entry(port, rswitch, + rio_route_add_entry(rswitch->rdev, RIO_GLOBAL_TABLE, destid, sport, 0); rswitch->route_table[destid] = sport; @@ -1203,21 +1206,20 @@ static void rio_build_route_tables(void) list_for_each_entry(rdev, &rio_devices, global_list) if (rio_is_switch(rdev)) { - rio_lock_device(rdev->net->hport, rdev->rswitch->destid, - rdev->rswitch->hopcount, 1000); + rio_lock_device(rdev->net->hport, rdev->destid, + rdev->hopcount, 1000); for (i = 0; i < RIO_MAX_ROUTE_ENTRIES(rdev->net->hport->sys_size); i++) { - if (rio_route_get_entry - (rdev->net->hport, rdev->rswitch, - RIO_GLOBAL_TABLE, i, &sport, 0) < 0) + if (rio_route_get_entry(rdev, + RIO_GLOBAL_TABLE, i, &sport, 0) < 0) continue; rdev->rswitch->route_table[i] = sport; } rio_unlock_device(rdev->net->hport, - rdev->rswitch->destid, - rdev->rswitch->hopcount); + rdev->destid, + rdev->hopcount); } } diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 7b5080c45569..c13289e10bb1 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -471,16 +471,9 @@ exit: */ int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock) { - u8 hopcount = 0xff; - u16 destid = rdev->destid; u32 regval; - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - rio_mport_read_config_32(rdev->net->hport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_CTL_CSR(pnum), ®val); if (lock) @@ -488,7 +481,7 @@ int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock) else regval &= ~RIO_PORT_N_CTL_LOCKOUT; - rio_mport_write_config_32(rdev->net->hport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_CTL_CSR(pnum), regval); return 0; @@ -507,7 +500,7 @@ static int rio_chk_dev_route(struct rio_dev *rdev, struct rio_dev **nrdev, int *npnum) { u32 result; - int p_port, dstid, rc = -EIO; + int p_port, rc = -EIO; struct rio_dev *prev = NULL; /* Find switch with failed RIO link */ @@ -522,9 +515,7 @@ rio_chk_dev_route(struct rio_dev *rdev, struct rio_dev **nrdev, int *npnum) if (prev == NULL) goto err_out; - dstid = (rdev->pef & RIO_PEF_SWITCH) ? 
- rdev->rswitch->destid : rdev->destid; - p_port = prev->rswitch->route_table[dstid]; + p_port = prev->rswitch->route_table[rdev->destid]; if (p_port != RIO_INVALID_ROUTE) { pr_debug("RIO: link failed on [%s]-P%d\n", @@ -567,15 +558,8 @@ rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid, u8 hopcount) */ static int rio_chk_dev_access(struct rio_dev *rdev) { - u8 hopcount = 0xff; - u16 destid = rdev->destid; - - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - return rio_mport_chk_dev_access(rdev->net->hport, destid, hopcount); + return rio_mport_chk_dev_access(rdev->net->hport, + rdev->destid, rdev->hopcount); } /** @@ -588,23 +572,20 @@ static int rio_chk_dev_access(struct rio_dev *rdev) static int rio_get_input_status(struct rio_dev *rdev, int pnum, u32 *lnkresp) { - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; u32 regval; int checkcount; if (lnkresp) { /* Read from link maintenance response register * to clear valid bit */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_MNT_RSP_CSR(pnum), ®val); udelay(50); } /* Issue Input-status command */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_MNT_REQ_CSR(pnum), RIO_MNT_REQ_CMD_IS); @@ -615,7 +596,7 @@ rio_get_input_status(struct rio_dev *rdev, int pnum, u32 *lnkresp) checkcount = 3; while (checkcount--) { udelay(50); - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_MNT_RSP_CSR(pnum), ®val); if (regval & RIO_PORT_N_MNT_RSP_RVAL) { @@ -635,15 +616,12 @@ rio_get_input_status(struct rio_dev *rdev, int pnum, u32 *lnkresp) */ static int rio_clr_err_stopped(struct rio_dev *rdev, u32 pnum, u32 err_status) { - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; struct rio_dev *nextdev = rdev->rswitch->nextdev[pnum]; u32 regval; u32 far_ackid, far_linkstat, near_ackid; if (err_status == 0) - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(pnum), &err_status); @@ -661,7 +639,7 @@ static int rio_clr_err_stopped(struct rio_dev *rdev, u32 pnum, u32 err_status) pnum, regval); far_ackid = (regval & RIO_PORT_N_MNT_RSP_ASTAT) >> 5; far_linkstat = regval & RIO_PORT_N_MNT_RSP_LSTAT; - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ACK_STS_CSR(pnum), ®val); pr_debug("RIO_EM: SP%d_ACK_STS_CSR=0x%08x\n", pnum, regval); @@ -679,9 +657,8 @@ static int rio_clr_err_stopped(struct rio_dev *rdev, u32 pnum, u32 err_status) /* Align near outstanding/outbound ackIDs with * far inbound. 
*/ - rio_mport_write_config_32(mport, destid, - hopcount, rdev->phys_efptr + - RIO_PORT_N_ACK_STS_CSR(pnum), + rio_write_config_32(rdev, + rdev->phys_efptr + RIO_PORT_N_ACK_STS_CSR(pnum), (near_ackid << 24) | (far_ackid << 8) | far_ackid); /* Align far outstanding/outbound ackIDs with @@ -698,7 +675,7 @@ static int rio_clr_err_stopped(struct rio_dev *rdev, u32 pnum, u32 err_status) pr_debug("RIO_EM: Invalid nextdev pointer (NULL)\n"); } rd_err: - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(pnum), &err_status); pr_debug("RIO_EM: SP%d_ERR_STS_CSR=0x%08x\n", pnum, err_status); @@ -710,7 +687,7 @@ rd_err: RIO_GET_PORT_NUM(nextdev->swpinfo), NULL); udelay(50); - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(pnum), &err_status); pr_debug("RIO_EM: SP%d_ERR_STS_CSR=0x%08x\n", pnum, err_status); @@ -730,9 +707,6 @@ rd_err: int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg) { struct rio_dev *rdev; - struct rio_mport *mport; - u8 hopcount; - u16 destid; u32 err_status, em_perrdet, em_ltlerrdet; int rc, portnum; @@ -800,17 +774,13 @@ int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg) return 0; } - mport = rdev->net->hport; - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - /* * Process the port-write notification from switch */ if (rdev->rswitch->em_handle) rdev->rswitch->em_handle(rdev, portnum); - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(portnum), &err_status); pr_debug("RIO_PW: SP%d_ERR_STS_CSR=0x%08x\n", portnum, err_status); @@ -840,7 +810,7 @@ int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg) rdev->rswitch->port_ok &= ~(1 << portnum); rio_set_port_lockout(rdev, portnum, 1); - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ACK_STS_CSR(portnum), RIO_PORT_N_ACK_CLEAR); @@ -851,28 +821,28 @@ int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg) } } - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->em_efptr + RIO_EM_PN_ERR_DETECT(portnum), &em_perrdet); if (em_perrdet) { pr_debug("RIO_PW: RIO_EM_P%d_ERR_DETECT=0x%08x\n", portnum, em_perrdet); /* Clear EM Port N Error Detect CSR */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->em_efptr + RIO_EM_PN_ERR_DETECT(portnum), 0); } - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->em_efptr + RIO_EM_LTL_ERR_DETECT, &em_ltlerrdet); if (em_ltlerrdet) { pr_debug("RIO_PW: RIO_EM_LTL_ERR_DETECT=0x%08x\n", em_ltlerrdet); /* Clear EM L/T Layer Error Detect CSR */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->em_efptr + RIO_EM_LTL_ERR_DETECT, 0); } /* Clear remaining error bits and Port-Write Pending bit */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(portnum), err_status); diff --git a/drivers/rapidio/switches/idt_gen2.c b/drivers/rapidio/switches/idt_gen2.c index 0bb871cb5c40..dd4b2b7112c0 100644 --- a/drivers/rapidio/switches/idt_gen2.c +++ b/drivers/rapidio/switches/idt_gen2.c @@ -209,9 +209,6 @@ idtg2_get_domain(struct rio_mport *mport, u16 destid, u8 hopcount, static int idtg2_em_init(struct rio_dev *rdev) { - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = 
rdev->rswitch->hopcount; u32 regval; int i, tmp; @@ -220,29 +217,25 @@ idtg2_em_init(struct rio_dev *rdev) * All standard EM configuration should be performed at upper level. */ - pr_debug("RIO: %s [%d:%d]\n", __func__, destid, hopcount); + pr_debug("RIO: %s [%d:%d]\n", __func__, rdev->destid, rdev->hopcount); /* Set Port-Write info CSR: PRIO=3 and CRF=1 */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_PW_INFO_CSR, 0x0000e000); + rio_write_config_32(rdev, IDT_PW_INFO_CSR, 0x0000e000); /* * Configure LT LAYER error reporting. */ /* Enable standard (RIO.p8) error reporting */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_LT_ERR_REPORT_EN, + rio_write_config_32(rdev, IDT_LT_ERR_REPORT_EN, REM_LTL_ERR_ILLTRAN | REM_LTL_ERR_UNSOLR | REM_LTL_ERR_UNSUPTR); /* Use Port-Writes for LT layer error reporting. * Enable per-port reset */ - rio_mport_read_config_32(mport, destid, hopcount, - IDT_DEV_CTRL_1, ®val); - rio_mport_write_config_32(mport, destid, hopcount, - IDT_DEV_CTRL_1, + rio_read_config_32(rdev, IDT_DEV_CTRL_1, ®val); + rio_write_config_32(rdev, IDT_DEV_CTRL_1, regval | IDT_DEV_CTRL_1_GENPW | IDT_DEV_CTRL_1_PRSTBEH); /* @@ -250,45 +243,40 @@ idtg2_em_init(struct rio_dev *rdev) */ /* Report all RIO.p8 errors supported by device */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_PORT_ERR_REPORT_EN_BC, 0x807e8037); + rio_write_config_32(rdev, IDT_PORT_ERR_REPORT_EN_BC, 0x807e8037); /* Configure reporting of implementation specific errors/events */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_PORT_ISERR_REPORT_EN_BC, IDT_PORT_INIT_TX_ACQUIRED); + rio_write_config_32(rdev, IDT_PORT_ISERR_REPORT_EN_BC, + IDT_PORT_INIT_TX_ACQUIRED); /* Use Port-Writes for port error reporting and enable error logging */ tmp = RIO_GET_TOTAL_PORTS(rdev->swpinfo); for (i = 0; i < tmp; i++) { - rio_mport_read_config_32(mport, destid, hopcount, - IDT_PORT_OPS(i), ®val); - rio_mport_write_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, IDT_PORT_OPS(i), ®val); + rio_write_config_32(rdev, IDT_PORT_OPS(i), regval | IDT_PORT_OPS_GENPW | IDT_PORT_OPS_PL_ELOG | IDT_PORT_OPS_LL_ELOG | IDT_PORT_OPS_LT_ELOG); } /* Overwrite error log if full */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_ERR_CAP, IDT_ERR_CAP_LOG_OVERWR); + rio_write_config_32(rdev, IDT_ERR_CAP, IDT_ERR_CAP_LOG_OVERWR); /* * Configure LANE error reporting. */ /* Disable line error reporting */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_LANE_ERR_REPORT_EN_BC, 0); + rio_write_config_32(rdev, IDT_LANE_ERR_REPORT_EN_BC, 0); /* Use Port-Writes for lane error reporting (when enabled) * (do per-lane update because lanes may have different configuration) */ tmp = (rdev->did == RIO_DID_IDTCPS1848) ? 
48 : 16; for (i = 0; i < tmp; i++) { - rio_mport_read_config_32(mport, destid, hopcount, - IDT_LANE_CTRL(i), ®val); - rio_mport_write_config_32(mport, destid, hopcount, - IDT_LANE_CTRL(i), regval | IDT_LANE_CTRL_GENPW); + rio_read_config_32(rdev, IDT_LANE_CTRL(i), ®val); + rio_write_config_32(rdev, IDT_LANE_CTRL(i), + regval | IDT_LANE_CTRL_GENPW); } /* @@ -296,41 +284,32 @@ idtg2_em_init(struct rio_dev *rdev) */ /* Disable JTAG and I2C Error capture */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_AUX_PORT_ERR_CAP_EN, 0); + rio_write_config_32(rdev, IDT_AUX_PORT_ERR_CAP_EN, 0); /* Disable JTAG and I2C Error reporting/logging */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_AUX_ERR_REPORT_EN, 0); + rio_write_config_32(rdev, IDT_AUX_ERR_REPORT_EN, 0); /* Disable Port-Write notification from JTAG */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_JTAG_CTRL, 0); + rio_write_config_32(rdev, IDT_JTAG_CTRL, 0); /* Disable Port-Write notification from I2C */ - rio_mport_read_config_32(mport, destid, hopcount, - IDT_I2C_MCTRL, ®val); - rio_mport_write_config_32(mport, destid, hopcount, - IDT_I2C_MCTRL, - regval & ~IDT_I2C_MCTRL_GENPW); + rio_read_config_32(rdev, IDT_I2C_MCTRL, ®val); + rio_write_config_32(rdev, IDT_I2C_MCTRL, regval & ~IDT_I2C_MCTRL_GENPW); /* * Configure CFG_BLK error reporting. */ /* Disable Configuration Block error capture */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_CFGBLK_ERR_CAPTURE_EN, 0); + rio_write_config_32(rdev, IDT_CFGBLK_ERR_CAPTURE_EN, 0); /* Disable Port-Writes for Configuration Block error reporting */ - rio_mport_read_config_32(mport, destid, hopcount, - IDT_CFGBLK_ERR_REPORT, ®val); - rio_mport_write_config_32(mport, destid, hopcount, - IDT_CFGBLK_ERR_REPORT, - regval & ~IDT_CFGBLK_ERR_REPORT_GENPW); + rio_read_config_32(rdev, IDT_CFGBLK_ERR_REPORT, ®val); + rio_write_config_32(rdev, IDT_CFGBLK_ERR_REPORT, + regval & ~IDT_CFGBLK_ERR_REPORT_GENPW); /* set TVAL = ~50us */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_LINKTO_CTL_CSR, 0x8e << 8); return 0; @@ -339,18 +318,15 @@ idtg2_em_init(struct rio_dev *rdev) static int idtg2_em_handler(struct rio_dev *rdev, u8 portnum) { - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; u32 regval, em_perrdet, em_ltlerrdet; - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->em_efptr + RIO_EM_LTL_ERR_DETECT, &em_ltlerrdet); if (em_ltlerrdet) { /* Service Logical/Transport Layer Error(s) */ if (em_ltlerrdet & REM_LTL_ERR_IMPSPEC) { /* Implementation specific error reported */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, IDT_ISLTL_ADDRESS_CAP, ®val); pr_debug("RIO: %s Implementation Specific LTL errors" \ @@ -358,13 +334,12 @@ idtg2_em_handler(struct rio_dev *rdev, u8 portnum) rio_name(rdev), em_ltlerrdet, regval); /* Clear implementation specific address capture CSR */ - rio_mport_write_config_32(mport, destid, hopcount, - IDT_ISLTL_ADDRESS_CAP, 0); + rio_write_config_32(rdev, IDT_ISLTL_ADDRESS_CAP, 0); } } - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->em_efptr + RIO_EM_PN_ERR_DETECT(portnum), &em_perrdet); if (em_perrdet) { /* Service Port-Level Error(s) */ @@ -372,14 +347,14 @@ idtg2_em_handler(struct rio_dev *rdev, u8 portnum) /* Implementation Specific port error reported */ /* Get IS errors reported */ - 
rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, IDT_PORT_ISERR_DET(portnum), ®val); pr_debug("RIO: %s Implementation Specific Port" \ " errors 0x%x\n", rio_name(rdev), regval); /* Clear all implementation specific events */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, IDT_PORT_ISERR_DET(portnum), 0); } } @@ -391,14 +366,10 @@ static ssize_t idtg2_show_errlog(struct device *dev, struct device_attribute *attr, char *buf) { struct rio_dev *rdev = to_rio_dev(dev); - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; ssize_t len = 0; u32 regval; - while (!rio_mport_read_config_32(mport, destid, hopcount, - IDT_ERR_RD, ®val)) { + while (!rio_read_config_32(rdev, IDT_ERR_RD, ®val)) { if (!regval) /* 0 = end of log */ break; len += snprintf(buf + len, PAGE_SIZE - len, diff --git a/drivers/rapidio/switches/idtcps.c b/drivers/rapidio/switches/idtcps.c index fc9f6374f759..3a971077e7bf 100644 --- a/drivers/rapidio/switches/idtcps.c +++ b/drivers/rapidio/switches/idtcps.c @@ -117,10 +117,6 @@ idtcps_get_domain(struct rio_mport *mport, u16 destid, u8 hopcount, static int idtcps_switch_init(struct rio_dev *rdev, int do_enum) { - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; - pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev)); rdev->rswitch->add_entry = idtcps_route_add_entry; rdev->rswitch->get_entry = idtcps_route_get_entry; @@ -132,7 +128,7 @@ static int idtcps_switch_init(struct rio_dev *rdev, int do_enum) if (do_enum) { /* set TVAL = ~50us */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_LINKTO_CTL_CSR, 0x8e << 8); } diff --git a/drivers/rapidio/switches/tsi568.c b/drivers/rapidio/switches/tsi568.c index b9a389b9f812..3994c00aa01f 100644 --- a/drivers/rapidio/switches/tsi568.c +++ b/drivers/rapidio/switches/tsi568.c @@ -113,22 +113,17 @@ tsi568_route_clr_table(struct rio_mport *mport, u16 destid, u8 hopcount, static int tsi568_em_init(struct rio_dev *rdev) { - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; u32 regval; int portnum; - pr_debug("TSI568 %s [%d:%d]\n", __func__, destid, hopcount); + pr_debug("TSI568 %s [%d:%d]\n", __func__, rdev->destid, rdev->hopcount); /* Make sure that Port-Writes are disabled (for all ports) */ for (portnum = 0; portnum < RIO_GET_TOTAL_PORTS(rdev->swpinfo); portnum++) { - rio_mport_read_config_32(mport, destid, hopcount, - TSI568_SP_MODE(portnum), ®val); - rio_mport_write_config_32(mport, destid, hopcount, - TSI568_SP_MODE(portnum), - regval | TSI568_SP_MODE_PW_DIS); + rio_read_config_32(rdev, TSI568_SP_MODE(portnum), ®val); + rio_write_config_32(rdev, TSI568_SP_MODE(portnum), + regval | TSI568_SP_MODE_PW_DIS); } return 0; diff --git a/drivers/rapidio/switches/tsi57x.c b/drivers/rapidio/switches/tsi57x.c index 2003fb63c404..1a62934bfebc 100644 --- a/drivers/rapidio/switches/tsi57x.c +++ b/drivers/rapidio/switches/tsi57x.c @@ -158,48 +158,45 @@ tsi57x_get_domain(struct rio_mport *mport, u16 destid, u8 hopcount, static int tsi57x_em_init(struct rio_dev *rdev) { - struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; u32 regval; int portnum; - pr_debug("TSI578 %s [%d:%d]\n", __func__, destid, hopcount); + pr_debug("TSI578 %s [%d:%d]\n", __func__, 
rdev->destid, rdev->hopcount); for (portnum = 0; portnum < RIO_GET_TOTAL_PORTS(rdev->swpinfo); portnum++) { /* Make sure that Port-Writes are enabled (for all ports) */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, TSI578_SP_MODE(portnum), ®val); - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, TSI578_SP_MODE(portnum), regval & ~TSI578_SP_MODE_PW_DIS); /* Clear all pending interrupts */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(portnum), ®val); - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(portnum), regval & 0x07120214); - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, TSI578_SP_INT_STATUS(portnum), ®val); - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, TSI578_SP_INT_STATUS(portnum), regval & 0x000700bd); /* Enable all interrupts to allow ports to send a port-write */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, TSI578_SP_CTL_INDEP(portnum), ®val); - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, TSI578_SP_CTL_INDEP(portnum), regval | 0x000b0000); /* Skip next (odd) port if the current port is in x4 mode */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_CTL_CSR(portnum), ®val); if ((regval & RIO_PORT_N_CTL_PWIDTH) == RIO_PORT_N_CTL_PWIDTH_4) @@ -207,7 +204,7 @@ tsi57x_em_init(struct rio_dev *rdev) } /* set TVAL = ~50us */ - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_LINKTO_CTL_CSR, 0x9a << 8); return 0; @@ -217,14 +214,12 @@ static int tsi57x_em_handler(struct rio_dev *rdev, u8 portnum) { struct rio_mport *mport = rdev->net->hport; - u16 destid = rdev->rswitch->destid; - u8 hopcount = rdev->rswitch->hopcount; u32 intstat, err_status; int sendcount, checkcount; u8 route_port; u32 regval; - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(portnum), &err_status); @@ -232,15 +227,15 @@ tsi57x_em_handler(struct rio_dev *rdev, u8 portnum) (err_status & (RIO_PORT_N_ERR_STS_PW_OUT_ES | RIO_PORT_N_ERR_STS_PW_INP_ES))) { /* Remove any queued packets by locking/unlocking port */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_CTL_CSR(portnum), ®val); if (!(regval & RIO_PORT_N_CTL_LOCKOUT)) { - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_CTL_CSR(portnum), regval | RIO_PORT_N_CTL_LOCKOUT); udelay(50); - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_CTL_CSR(portnum), regval); } @@ -248,7 +243,7 @@ tsi57x_em_handler(struct rio_dev *rdev, u8 portnum) /* Read from link maintenance response register to clear * valid bit */ - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_MNT_RSP_CSR(portnum), ®val); @@ -257,13 +252,12 @@ tsi57x_em_handler(struct rio_dev *rdev, u8 portnum) */ sendcount = 3; while (sendcount) { - rio_mport_write_config_32(mport, destid, hopcount, + rio_write_config_32(rdev, TSI578_SP_CS_TX(portnum), 0x40fc8000); checkcount = 3; while (checkcount--) { udelay(50); - rio_mport_read_config_32( - mport, destid, 
hopcount, + rio_read_config_32(rdev, rdev->phys_efptr + RIO_PORT_N_MNT_RSP_CSR(portnum), ®val); @@ -277,25 +271,23 @@ tsi57x_em_handler(struct rio_dev *rdev, u8 portnum) exit_es: /* Clear implementation specific error status bits */ - rio_mport_read_config_32(mport, destid, hopcount, - TSI578_SP_INT_STATUS(portnum), &intstat); + rio_read_config_32(rdev, TSI578_SP_INT_STATUS(portnum), &intstat); pr_debug("TSI578[%x:%x] SP%d_INT_STATUS=0x%08x\n", - destid, hopcount, portnum, intstat); + rdev->destid, rdev->hopcount, portnum, intstat); if (intstat & 0x10000) { - rio_mport_read_config_32(mport, destid, hopcount, + rio_read_config_32(rdev, TSI578_SP_LUT_PEINF(portnum), ®val); regval = (mport->sys_size) ? (regval >> 16) : (regval >> 24); route_port = rdev->rswitch->route_table[regval]; pr_debug("RIO: TSI578[%s] P%d LUT Parity Error (destID=%d)\n", rio_name(rdev), portnum, regval); - tsi57x_route_add_entry(mport, destid, hopcount, + tsi57x_route_add_entry(mport, rdev->destid, rdev->hopcount, RIO_GLOBAL_TABLE, regval, route_port); } - rio_mport_write_config_32(mport, destid, hopcount, - TSI578_SP_INT_STATUS(portnum), - intstat & 0x000700bd); + rio_write_config_32(rdev, TSI578_SP_INT_STATUS(portnum), + intstat & 0x000700bd); return 0; } diff --git a/include/linux/rio.h b/include/linux/rio.h index 0bed941f9b13..f6e25b3a6967 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -98,7 +98,8 @@ union rio_pw_msg; * @dev: Device model device * @riores: RIO resources this device owns * @pwcback: port-write callback function for this device - * @destid: Network destination ID + * @destid: Network destination ID (or associated destid for switch) + * @hopcount: Hopcount to this device * @prev: Previous RIO device connected to the current one */ struct rio_dev { @@ -126,6 +127,7 @@ struct rio_dev { struct resource riores[RIO_MAX_DEV_RESOURCES]; int (*pwcback) (struct rio_dev *rdev, union rio_pw_msg *msg, int step); u16 destid; + u8 hopcount; struct rio_dev *prev; }; @@ -229,8 +231,6 @@ struct rio_net { * @node: Node in global list of switches * @rdev: Associated RIO device structure * @switchid: Switch ID that is unique across a network - * @hopcount: Hopcount to this switch - * @destid: Associated destid in the path * @route_table: Copy of switch routing table * @port_ok: Status of each port (one bit per port) - OK=1 or UNINIT=0 * @add_entry: Callback for switch-specific route add function @@ -247,8 +247,6 @@ struct rio_switch { struct list_head node; struct rio_dev *rdev; u16 switchid; - u16 hopcount; - u16 destid; u8 *route_table; u32 port_ok; int (*add_entry) (struct rio_mport * mport, u16 destid, u8 hopcount, diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index edc55da717b3..e09e565c4bce 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -150,16 +150,8 @@ static inline int rio_local_write_config_8(struct rio_mport *port, u32 offset, static inline int rio_read_config_32(struct rio_dev *rdev, u32 offset, u32 * data) { - u8 hopcount = 0xff; - u16 destid = rdev->destid; - - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - return rio_mport_read_config_32(rdev->net->hport, destid, hopcount, - offset, data); + return rio_mport_read_config_32(rdev->net->hport, rdev->destid, + rdev->hopcount, offset, data); }; /** @@ -174,16 +166,8 @@ static inline int rio_read_config_32(struct rio_dev *rdev, u32 offset, static inline int rio_write_config_32(struct rio_dev *rdev, u32 offset, u32 data) { - u8 hopcount = 0xff; - u16 
destid = rdev->destid; - - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - return rio_mport_write_config_32(rdev->net->hport, destid, hopcount, - offset, data); + return rio_mport_write_config_32(rdev->net->hport, rdev->destid, + rdev->hopcount, offset, data); }; /** @@ -198,16 +182,8 @@ static inline int rio_write_config_32(struct rio_dev *rdev, u32 offset, static inline int rio_read_config_16(struct rio_dev *rdev, u32 offset, u16 * data) { - u8 hopcount = 0xff; - u16 destid = rdev->destid; - - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - return rio_mport_read_config_16(rdev->net->hport, destid, hopcount, - offset, data); + return rio_mport_read_config_16(rdev->net->hport, rdev->destid, + rdev->hopcount, offset, data); }; /** @@ -222,16 +198,8 @@ static inline int rio_read_config_16(struct rio_dev *rdev, u32 offset, static inline int rio_write_config_16(struct rio_dev *rdev, u32 offset, u16 data) { - u8 hopcount = 0xff; - u16 destid = rdev->destid; - - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - return rio_mport_write_config_16(rdev->net->hport, destid, hopcount, - offset, data); + return rio_mport_write_config_16(rdev->net->hport, rdev->destid, + rdev->hopcount, offset, data); }; /** @@ -245,16 +213,8 @@ static inline int rio_write_config_16(struct rio_dev *rdev, u32 offset, */ static inline int rio_read_config_8(struct rio_dev *rdev, u32 offset, u8 * data) { - u8 hopcount = 0xff; - u16 destid = rdev->destid; - - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - return rio_mport_read_config_8(rdev->net->hport, destid, hopcount, - offset, data); + return rio_mport_read_config_8(rdev->net->hport, rdev->destid, + rdev->hopcount, offset, data); }; /** @@ -268,16 +228,8 @@ static inline int rio_read_config_8(struct rio_dev *rdev, u32 offset, u8 * data) */ static inline int rio_write_config_8(struct rio_dev *rdev, u32 offset, u8 data) { - u8 hopcount = 0xff; - u16 destid = rdev->destid; - - if (rdev->rswitch) { - destid = rdev->rswitch->destid; - hopcount = rdev->rswitch->hopcount; - } - - return rio_mport_write_config_8(rdev->net->hport, destid, hopcount, - offset, data); + return rio_mport_write_config_8(rdev->net->hport, rdev->destid, + rdev->hopcount, offset, data); }; extern int rio_mport_send_doorbell(struct rio_mport *mport, u16 destid, -- cgit v1.2.3-71-gd317 From ded05782719d0f7e79af98be7cf88c7e23a90435 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 12 Jan 2011 17:00:39 -0800 Subject: rapidio: integrate rio_switch into rio_dev Convert RIO switches device structures (rio_dev + rio_switch) into a single allocation unit. This change is based on the fact that RIO switches are using common RIO device objects anyway. Allocating RIO switch objects as RIO devices with added space for switch information simplifies handling of RIO switch devices. 
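
[ Illustrative note, not part of the original patch: the single-allocation
  layout can be sketched in isolation. The switch data becomes a zero-length
  array at the tail of the device structure, the per-port nextdev[] pointers
  extend that same allocation, and sw_to_rio_dev() walks back from the
  embedded switch to its device. Below is a minimal userspace sketch of the
  same pattern with simplified stand-in structs, not the kernel definitions. ]

/*
 * Sketch only: mirrors the rio_dev/rio_switch single-allocation layout
 * in plain C so the size arithmetic is easy to follow.
 * Zero-length arrays are a GNU C extension, as used in the kernel.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct ex_dev;

struct ex_switch {
	unsigned short switchid;
	struct ex_dev *nextdev[0];	/* per-port links, sized at alloc time */
};

struct ex_dev {
	unsigned short destid;
	unsigned char hopcount;
	struct ex_switch rswitch[0];	/* present only when the device is a switch */
};

/* analogue of sw_to_rio_dev(): recover the device from its embedded switch */
#define sw_to_ex_dev(sw) \
	((struct ex_dev *)((char *)(sw) - offsetof(struct ex_dev, rswitch)))

int main(void)
{
	int ports = 4;	/* the kernel gets this from RIO_GET_TOTAL_PORTS(swpinfo) */
	size_t size = sizeof(struct ex_dev) + sizeof(struct ex_switch) +
		      ports * sizeof(struct ex_dev *);
	struct ex_dev *rdev = calloc(1, size);	/* one allocation covers both */

	if (!rdev)
		return 1;

	rdev->destid = 0x10;
	rdev->rswitch->switchid = 1;
	rdev->rswitch->nextdev[0] = rdev;	/* port 0 loops back, just to show access */

	printf("destid recovered via switch: 0x%x\n",
	       (unsigned int)sw_to_ex_dev(rdev->rswitch)->destid);

	free(rdev);	/* and a single free on teardown */
	return 0;
}
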
Signed-off-by: Alexandre Bounine Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: Thomas Moll Cc: Micha Nelissen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 59 +++++++++++++++++--------------- drivers/rapidio/rio-sysfs.c | 4 +-- include/linux/rio.h | 82 ++++++++++++++++++++++----------------------- 3 files changed, 75 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 51f0af241eb7..45d14cd6b356 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -378,12 +378,30 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, struct rio_dev *rdev; struct rio_switch *rswitch = NULL; int result, rdid; + size_t size; + u32 swpinfo = 0; - rdev = kzalloc(sizeof(struct rio_dev), GFP_KERNEL); + size = sizeof(struct rio_dev); + if (rio_mport_read_config_32(port, destid, hopcount, + RIO_PEF_CAR, &result)) + return NULL; + + if (result & (RIO_PEF_SWITCH | RIO_PEF_MULTIPORT)) { + rio_mport_read_config_32(port, destid, hopcount, + RIO_SWP_INFO_CAR, &swpinfo); + if (result & RIO_PEF_SWITCH) { + size += (RIO_GET_TOTAL_PORTS(swpinfo) * + sizeof(rswitch->nextdev[0])) + sizeof(*rswitch); + } + } + + rdev = kzalloc(size, GFP_KERNEL); if (!rdev) return NULL; rdev->net = net; + rdev->pef = result; + rdev->swpinfo = swpinfo; rio_mport_read_config_32(port, destid, hopcount, RIO_DEV_ID_CAR, &result); rdev->did = result >> 16; @@ -397,8 +415,6 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, rio_mport_read_config_32(port, destid, hopcount, RIO_ASM_INFO_CAR, &result); rdev->asm_rev = result >> 16; - rio_mport_read_config_32(port, destid, hopcount, RIO_PEF_CAR, - &rdev->pef); if (rdev->pef & RIO_PEF_EXT_FEATURES) { rdev->efptr = result & 0xffff; rdev->phys_efptr = rio_mport_get_physefb(port, 0, destid, @@ -408,11 +424,6 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, hopcount, RIO_EFB_ERR_MGMNT); } - if (rdev->pef & (RIO_PEF_SWITCH | RIO_PEF_MULTIPORT)) { - rio_mport_read_config_32(port, destid, hopcount, - RIO_SWP_INFO_CAR, &rdev->swpinfo); - } - rio_mport_read_config_32(port, destid, hopcount, RIO_SRC_OPS_CAR, &rdev->src_ops); rio_mport_read_config_32(port, destid, hopcount, RIO_DST_OPS_CAR, @@ -449,12 +460,7 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, /* If a PE has both switch and other functions, show it as a switch */ if (rio_is_switch(rdev)) { - rswitch = kzalloc(sizeof(*rswitch) + - RIO_GET_TOTAL_PORTS(rdev->swpinfo) * - sizeof(rswitch->nextdev[0]), - GFP_KERNEL); - if (!rswitch) - goto cleanup; + rswitch = rdev->rswitch; rswitch->switchid = next_switchid; rswitch->port_ok = 0; rswitch->route_table = kzalloc(sizeof(u8)* @@ -466,15 +472,13 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, for (rdid = 0; rdid < RIO_MAX_ROUTE_ENTRIES(port->sys_size); rdid++) rswitch->route_table[rdid] = RIO_INVALID_ROUTE; - rdev->rswitch = rswitch; - rswitch->rdev = rdev; dev_set_name(&rdev->dev, "%02x:s:%04x", rdev->net->id, - rdev->rswitch->switchid); + rswitch->switchid); rio_switch_init(rdev, do_enum); - if (do_enum && rdev->rswitch->clr_table) - rdev->rswitch->clr_table(port, destid, hopcount, - RIO_GLOBAL_TABLE); + if (do_enum && rswitch->clr_table) + rswitch->clr_table(port, destid, hopcount, + RIO_GLOBAL_TABLE); list_add_tail(&rswitch->node, &rio_switches); @@ -510,10 +514,9 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, return 
rdev; cleanup: - if (rswitch) { + if (rswitch->route_table) kfree(rswitch->route_table); - kfree(rswitch); - } + kfree(rdev); return NULL; } @@ -1072,7 +1075,7 @@ static struct rio_net __devinit *rio_alloc_net(struct rio_mport *port) */ static void rio_update_route_tables(struct rio_mport *port) { - struct rio_dev *rdev; + struct rio_dev *rdev, *swrdev; struct rio_switch *rswitch; u8 sport; u16 destid; @@ -1087,14 +1090,16 @@ static void rio_update_route_tables(struct rio_mport *port) continue; if (RIO_INVALID_ROUTE == rswitch->route_table[destid]) { + swrdev = sw_to_rio_dev(rswitch); + /* Skip if destid ends in empty switch*/ - if (rswitch->rdev->destid == destid) + if (swrdev->destid == destid) continue; - sport = RIO_GET_PORT_NUM(rswitch->rdev->swpinfo); + sport = RIO_GET_PORT_NUM(swrdev->swpinfo); if (rswitch->add_entry) { - rio_route_add_entry(rswitch->rdev, + rio_route_add_entry(swrdev, RIO_GLOBAL_TABLE, destid, sport, 0); rswitch->route_table[destid] = sport; diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index 137ed93ee33f..76b41853a877 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -217,7 +217,7 @@ int rio_create_sysfs_dev_files(struct rio_dev *rdev) err = device_create_bin_file(&rdev->dev, &rio_config_attr); - if (!err && rdev->rswitch) { + if (!err && (rdev->pef & RIO_PEF_SWITCH)) { err = device_create_file(&rdev->dev, &dev_attr_routes); if (!err && rdev->rswitch->sw_sysfs) err = rdev->rswitch->sw_sysfs(rdev, RIO_SW_SYSFS_CREATE); @@ -239,7 +239,7 @@ int rio_create_sysfs_dev_files(struct rio_dev *rdev) void rio_remove_sysfs_dev_files(struct rio_dev *rdev) { device_remove_bin_file(&rdev->dev, &rio_config_attr); - if (rdev->rswitch) { + if (rdev->pef & RIO_PEF_SWITCH) { device_remove_file(&rdev->dev, &dev_attr_routes); if (rdev->rswitch->sw_sysfs) rdev->rswitch->sw_sysfs(rdev, RIO_SW_SYSFS_REMOVE); diff --git a/include/linux/rio.h b/include/linux/rio.h index f6e25b3a6967..9b558856a8b6 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -71,8 +71,46 @@ extern struct device rio_bus; extern struct list_head rio_devices; /* list of all devices */ struct rio_mport; +struct rio_dev; union rio_pw_msg; +/** + * struct rio_switch - RIO switch info + * @node: Node in global list of switches + * @switchid: Switch ID that is unique across a network + * @route_table: Copy of switch routing table + * @port_ok: Status of each port (one bit per port) - OK=1 or UNINIT=0 + * @add_entry: Callback for switch-specific route add function + * @get_entry: Callback for switch-specific route get function + * @clr_table: Callback for switch-specific clear route table function + * @set_domain: Callback for switch-specific domain setting function + * @get_domain: Callback for switch-specific domain get function + * @em_init: Callback for switch-specific error management init function + * @em_handle: Callback for switch-specific error management handler function + * @sw_sysfs: Callback that initializes switch-specific sysfs attributes + * @nextdev: Array of per-port pointers to the next attached device + */ +struct rio_switch { + struct list_head node; + u16 switchid; + u8 *route_table; + u32 port_ok; + int (*add_entry) (struct rio_mport *mport, u16 destid, u8 hopcount, + u16 table, u16 route_destid, u8 route_port); + int (*get_entry) (struct rio_mport *mport, u16 destid, u8 hopcount, + u16 table, u16 route_destid, u8 *route_port); + int (*clr_table) (struct rio_mport *mport, u16 destid, u8 hopcount, + u16 table); + int (*set_domain) (struct 
rio_mport *mport, u16 destid, u8 hopcount, + u8 sw_domain); + int (*get_domain) (struct rio_mport *mport, u16 destid, u8 hopcount, + u8 *sw_domain); + int (*em_init) (struct rio_dev *dev); + int (*em_handle) (struct rio_dev *dev, u8 swport); + int (*sw_sysfs) (struct rio_dev *dev, int create); + struct rio_dev *nextdev[0]; +}; + /** * struct rio_dev - RIO device info * @global_list: Node in list of all RIO devices @@ -93,7 +131,6 @@ union rio_pw_msg; * @phys_efptr: RIO device extended features pointer * @em_efptr: RIO Error Management features pointer * @dma_mask: Mask of bits of RIO address this device implements - * @rswitch: Pointer to &struct rio_switch if valid for this device * @driver: Driver claiming this device * @dev: Device model device * @riores: RIO resources this device owns @@ -101,6 +138,7 @@ union rio_pw_msg; * @destid: Network destination ID (or associated destid for switch) * @hopcount: Hopcount to this device * @prev: Previous RIO device connected to the current one + * @rswitch: struct rio_switch (if valid for this device) */ struct rio_dev { struct list_head global_list; /* node in list of all RIO devices */ @@ -121,7 +159,6 @@ struct rio_dev { u32 phys_efptr; u32 em_efptr; u64 dma_mask; - struct rio_switch *rswitch; /* RIO switch info */ struct rio_driver *driver; /* RIO driver claiming this device */ struct device dev; /* LDM device structure */ struct resource riores[RIO_MAX_DEV_RESOURCES]; @@ -129,11 +166,13 @@ struct rio_dev { u16 destid; u8 hopcount; struct rio_dev *prev; + struct rio_switch rswitch[0]; /* RIO switch info */ }; #define rio_dev_g(n) list_entry(n, struct rio_dev, global_list) #define rio_dev_f(n) list_entry(n, struct rio_dev, net_list) #define to_rio_dev(n) container_of(n, struct rio_dev, dev) +#define sw_to_rio_dev(n) container_of(n, struct rio_dev, rswitch[0]) /** * struct rio_msg - RIO message event @@ -226,45 +265,6 @@ struct rio_net { #define RIO_SW_SYSFS_CREATE 1 /* Create switch attributes */ #define RIO_SW_SYSFS_REMOVE 0 /* Remove switch attributes */ -/** - * struct rio_switch - RIO switch info - * @node: Node in global list of switches - * @rdev: Associated RIO device structure - * @switchid: Switch ID that is unique across a network - * @route_table: Copy of switch routing table - * @port_ok: Status of each port (one bit per port) - OK=1 or UNINIT=0 - * @add_entry: Callback for switch-specific route add function - * @get_entry: Callback for switch-specific route get function - * @clr_table: Callback for switch-specific clear route table function - * @set_domain: Callback for switch-specific domain setting function - * @get_domain: Callback for switch-specific domain get function - * @em_init: Callback for switch-specific error management initialization function - * @em_handle: Callback for switch-specific error management handler function - * @sw_sysfs: Callback that initializes switch-specific sysfs attributes - * @nextdev: Array of per-port pointers to the next attached device - */ -struct rio_switch { - struct list_head node; - struct rio_dev *rdev; - u16 switchid; - u8 *route_table; - u32 port_ok; - int (*add_entry) (struct rio_mport * mport, u16 destid, u8 hopcount, - u16 table, u16 route_destid, u8 route_port); - int (*get_entry) (struct rio_mport * mport, u16 destid, u8 hopcount, - u16 table, u16 route_destid, u8 * route_port); - int (*clr_table) (struct rio_mport *mport, u16 destid, u8 hopcount, - u16 table); - int (*set_domain) (struct rio_mport *mport, u16 destid, u8 hopcount, - u8 sw_domain); - int (*get_domain) (struct 
rio_mport *mport, u16 destid, u8 hopcount, - u8 *sw_domain); - int (*em_init) (struct rio_dev *dev); - int (*em_handle) (struct rio_dev *dev, u8 swport); - int (*sw_sysfs) (struct rio_dev *dev, int create); - struct rio_dev *nextdev[0]; -}; - /* Low-level architecture-dependent routines */ /** -- cgit v1.2.3-71-gd317 From e6536927e64a2511864f2141d3e5b198b3f25ba6 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 12 Jan 2011 17:00:40 -0800 Subject: rapidio: add definitions of Component Tag fields Add definition of the unique device identifier field in the component tag. RIO_CTAG_UDEVID does not take all 32 bits of the component tag value to allow future extensions to the component tag use. Selected size of the RIO_CTAG_UDEVID field (17 bits) is sufficient to accommodate maximum number of endpoints in large RIO network (16-bit id) plus switches. Signed-off-by: Alexandre Bounine Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: Thomas Moll Cc: Micha Nelissen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 2 +- include/linux/rio.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index c13289e10bb1..cc2a3b74d0f0 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -710,7 +710,7 @@ int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg) u32 err_status, em_perrdet, em_ltlerrdet; int rc, portnum; - rdev = rio_get_comptag(pw_msg->em.comptag, NULL); + rdev = rio_get_comptag((pw_msg->em.comptag & RIO_CTAG_UDEVID), NULL); if (rdev == NULL) { /* Device removed or enumeration error */ pr_debug("RIO: %s No matching device for CTag 0x%08x\n", diff --git a/include/linux/rio.h b/include/linux/rio.h index 9b558856a8b6..ff681ebba585 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -66,6 +66,16 @@ #define RIO_PW_MSG_SIZE 64 +/* + * A component tag value (stored in the component tag CSR) is used as device's + * unique identifier assigned during enumeration. Besides being used for + * identifying switches (which do not have device ID register), it also is used + * by error management notification and therefore has to be assigned + * to endpoints as well. + */ +#define RIO_CTAG_RESRVD 0xfffe0000 /* Reserved */ +#define RIO_CTAG_UDEVID 0x0001ffff /* Unique device identifier */ + extern struct bus_type rio_bus_type; extern struct device rio_bus; extern struct list_head rio_devices; /* list of all devices */ -- cgit v1.2.3-71-gd317 From 2e9d4d84847b5b05c8a049b681f15906a91e0f78 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 12 Jan 2011 17:00:43 -0800 Subject: rapidio: add new idt sRIO switches Add new sRIO switch device IDs and enable a basic support for them. 
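
[ Illustrative note, not part of the original patch: basic support for the
  new parts comes from reusing the existing gen2 init handler, so each new
  device needs only an ID definition plus one registration line. The fragment
  below is a hypothetical sketch -- RIO_DID_IDTEXAMPLE is an invented
  placeholder, not a real part; the IDs actually added are in the hunks
  that follow. ]

/* in include/linux/rio_ids.h (placeholder value, illustration only) */
#define RIO_DID_IDTEXAMPLE	0x03f0

/* at the bottom of drivers/rapidio/switches/idt_gen2.c, next to the
 * existing entries, binding the part to the shared gen2 init routine */
DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTEXAMPLE, idtg2_switch_init);
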
Signed-off-by: Alexandre Bounine Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: Thomas Moll Cc: Micha Nelissen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/switches/idt_gen2.c | 2 ++ include/linux/rio_ids.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/rapidio/switches/idt_gen2.c b/drivers/rapidio/switches/idt_gen2.c index dd4b2b7112c0..095016a9dec1 100644 --- a/drivers/rapidio/switches/idt_gen2.c +++ b/drivers/rapidio/switches/idt_gen2.c @@ -416,3 +416,5 @@ static int idtg2_switch_init(struct rio_dev *rdev, int do_enum) DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS1848, idtg2_switch_init); DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS1616, idtg2_switch_init); +DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTVPS1616, idtg2_switch_init); +DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTSPS1616, idtg2_switch_init); diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h index ee7b6ada188f..7410d3365e2a 100644 --- a/include/linux/rio_ids.h +++ b/include/linux/rio_ids.h @@ -36,5 +36,7 @@ #define RIO_DID_IDTCPS10Q 0x035e #define RIO_DID_IDTCPS1848 0x0374 #define RIO_DID_IDTCPS1616 0x0379 +#define RIO_DID_IDTVPS1616 0x0377 +#define RIO_DID_IDTSPS1616 0x0378 #endif /* LINUX_RIO_IDS_H */ -- cgit v1.2.3-71-gd317 From 6164281ab7a4d3bd42588d6b25984e960a2e032f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 12 Jan 2011 17:00:46 -0800 Subject: user_ns: improve the user_ns on-the-slab packaging Currently on 64-bit arch the user_namespace is 2096 and when being kmalloc-ed it resides on a 4k slab wasting 2003 bytes. If we allocate a separate cache for it and reduce the hash size from 128 to 64 chains the packaging becomes *much* better - the struct is 1072 bytes and the hole between is 98 bytes. [akpm@linux-foundation.org: s/__initcall/module_init/] Signed-off-by: Pavel Emelyanov Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/user_namespace.h | 2 +- kernel/user_namespace.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8178156711f9..faf467944baf 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -6,7 +6,7 @@ #include #include -#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8) +#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) struct user_namespace { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291a..9da289c34f22 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -12,6 +12,8 @@ #include #include +static struct kmem_cache *user_ns_cachep __read_mostly; + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) struct user_struct *root_user; int n; - ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); + ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); if (!ns) return -ENOMEM; @@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) /* Alloc new root user. 
*/ root_user = alloc_uid(ns, 0); if (!root_user) { - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); return -ENOMEM; } @@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) struct user_namespace *ns = container_of(work, struct user_namespace, destroyer); free_uid(ns->creator); - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); } void free_user_ns(struct kref *kref) @@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t /* No useful relationship so no mapping */ return overflowgid; } + +static __init int user_namespaces_init(void) +{ + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + return 0; +} +module_init(user_namespaces_init); -- cgit v1.2.3-71-gd317 From 0329326e85aaa30fb8d427828c718d565c287385 Mon Sep 17 00:00:00 2001 From: "Matti J. Aaltonen" Date: Wed, 12 Jan 2011 17:00:47 -0800 Subject: NFC: Driver for NXP Semiconductors PN544 NFC chip. Creates a new "Near Field Communication" subsystem in drivers/nfc. http://en.wikipedia.org/wiki/Near_Field_Communication is useful ;) This is a driver for the pn544 NFC device. The driver transfers ETSI messages between the device and the user space. Signed-off-by: Matti J. Aaltonen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/nfc/nfc-pn544.txt | 114 +++++ drivers/Kconfig | 2 + drivers/Makefile | 2 +- drivers/nfc/Kconfig | 30 ++ drivers/nfc/Makefile | 5 + drivers/nfc/pn544.c | 891 ++++++++++++++++++++++++++++++++++++++++ include/linux/nfc/pn544.h | 97 +++++ 7 files changed, 1140 insertions(+), 1 deletion(-) create mode 100644 Documentation/nfc/nfc-pn544.txt create mode 100644 drivers/nfc/Kconfig create mode 100644 drivers/nfc/Makefile create mode 100644 drivers/nfc/pn544.c create mode 100644 include/linux/nfc/pn544.h (limited to 'include/linux') diff --git a/Documentation/nfc/nfc-pn544.txt b/Documentation/nfc/nfc-pn544.txt new file mode 100644 index 000000000000..2fcac9f5996e --- /dev/null +++ b/Documentation/nfc/nfc-pn544.txt @@ -0,0 +1,114 @@ +Kernel driver for the NXP Semiconductors PN544 Near Field +Communication chip + +Author: Jari Vanhala +Contact: Matti Aaltonen (matti.j.aaltonen at nokia.com) + +General +------- + +The PN544 is an integrated transmission module for contactless +communication. The driver goes under drives/nfc/ and is compiled as a +module named "pn544". It registers a misc device and creates a device +file named "/dev/pn544". + +Host Interfaces: I2C, SPI and HSU, this driver supports currently only I2C. + +The Interface +------------- + +The driver offers a sysfs interface for a hardware test and an IOCTL +interface for selecting between two operating modes. There are read, +write and poll functions for transferring messages. The two operating +modes are the normal (HCI) mode and the firmware update mode. + +PN544 is controlled by sending messages from the userspace to the +chip. The main function of the driver is just to pass those messages +without caring about the message content. + + +Protocols +--------- + +In the normal (HCI) mode and in the firmware update mode read and +write functions behave a bit differently because the message formats +or the protocols are different. + +In the normal (HCI) mode the protocol used is derived from the ETSI +HCI specification. The firmware is updated using a specific protocol, +which is different from HCI. + +HCI messages consist of an eight bit header and the message body. The +header contains the message length. Maximum size for an HCI message is +33. 
In HCI mode sent messages are tested for a correct +checksum. Firmware update messages have the length in the second (MSB) +and third (LSB) bytes of the message. The maximum FW message length is +1024 bytes. + +For the ETSI HCI specification see +http://www.etsi.org/WebSite/Technologies/ProtocolSpecification.aspx + +The Hardware Test +----------------- + +The idea of the test is that it can performed by reading from the +corresponding sysfs file. The test is implemented in the board file +and it should test that PN544 can be put into the firmware update +mode. If the test is not implemented the sysfs file does not get +created. + +Example: +> cat /sys/module/pn544/drivers/i2c\:pn544/3-002b/nfc_test +1 + +Normal Operation +---------------- + +PN544 is powered up when the device file is opened, otherwise it's +turned off. Only one instance can use the device at a time. + +Userspace applications control PN544 with HCI messages. The hardware +sends an interrupt when data is available for reading. Data is +physically read when the read function is called by a userspace +application. Poll() checks the read interrupt state. Configuration and +self testing are also done from the userspace using read and write. + +Example platform data: + +static int rx71_pn544_nfc_request_resources(struct i2c_client *client) +{ + /* Get and setup the HW resources for the device */ +} + +static void rx71_pn544_nfc_free_resources(void) +{ + /* Release the HW resources */ +} + +static void rx71_pn544_nfc_enable(int fw) +{ + /* Turn the device on */ +} + +static int rx71_pn544_nfc_test(void) +{ + /* + * Put the device into the FW update mode + * and then back to the normal mode. + * Check the behavior and return one on success, + * zero on failure. + */ +} + +static void rx71_pn544_nfc_disable(void) +{ + /* turn the power off */ +} + +static struct pn544_nfc_platform_data rx71_nfc_data = { + .request_resources = rx71_pn544_nfc_request_resources, + .free_resources = rx71_pn544_nfc_free_resources, + .enable = rx71_pn544_nfc_enable, + .test = rx71_pn544_nfc_test, + .disable = rx71_pn544_nfc_disable, +}; diff --git a/drivers/Kconfig b/drivers/Kconfig index 3d93b3a3d630..dd0a5b5e9bf3 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -88,6 +88,8 @@ source "drivers/memstick/Kconfig" source "drivers/leds/Kconfig" +source "drivers/nfc/Kconfig" + source "drivers/accessibility/Kconfig" source "drivers/infiniband/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index bf15ce7493d2..ef5132469f58 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -40,7 +40,7 @@ obj-$(CONFIG_FB_INTEL) += video/intelfb/ obj-y += serial/ obj-$(CONFIG_PARPORT) += parport/ -obj-y += base/ block/ misc/ mfd/ +obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ obj-$(CONFIG_IDE) += ide/ diff --git a/drivers/nfc/Kconfig b/drivers/nfc/Kconfig new file mode 100644 index 000000000000..ffedfd492754 --- /dev/null +++ b/drivers/nfc/Kconfig @@ -0,0 +1,30 @@ +# +# Near Field Communication (NFC) devices +# + +menuconfig NFC_DEVICES + bool "NFC devices" + default n + ---help--- + You'll have to say Y if your computer contains an NFC device that + you want to use under Linux. + + You can say N here if you don't have any Near Field Communication + devices connected to your computer. + +if NFC_DEVICES + +config PN544_NFC + tristate "PN544 NFC driver" + depends on I2C + select CRC_CCITT + default n + ---help--- + Say yes if you want PN544 Near Field Communication driver. + This is for i2c connected version. 
If unsure, say N here. + + To compile this driver as a module, choose m here. The module will + be called pn544. + + +endif # NFC_DEVICES diff --git a/drivers/nfc/Makefile b/drivers/nfc/Makefile new file mode 100644 index 000000000000..a4efb164ec49 --- /dev/null +++ b/drivers/nfc/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for nfc devices +# + +obj-$(CONFIG_PN544_NFC) += pn544.o diff --git a/drivers/nfc/pn544.c b/drivers/nfc/pn544.c new file mode 100644 index 000000000000..401c44b6eadb --- /dev/null +++ b/drivers/nfc/pn544.c @@ -0,0 +1,891 @@ +/* + * Driver for the PN544 NFC chip. + * + * Copyright (C) Nokia Corporation + * + * Author: Jari Vanhala + * Contact: Matti Aaltonen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for TCGETS */ +#include + +#define DRIVER_CARD "PN544 NFC" +#define DRIVER_DESC "NFC driver for PN544" + +static struct i2c_device_id pn544_id_table[] = { + { PN544_DRIVER_NAME, 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, pn544_id_table); + +#define HCI_MODE 0 +#define FW_MODE 1 + +enum pn544_state { + PN544_ST_COLD, + PN544_ST_FW_READY, + PN544_ST_READY, +}; + +enum pn544_irq { + PN544_NONE, + PN544_INT, +}; + +struct pn544_info { + struct miscdevice miscdev; + struct i2c_client *i2c_dev; + struct regulator_bulk_data regs[2]; + + enum pn544_state state; + wait_queue_head_t read_wait; + loff_t read_offset; + enum pn544_irq read_irq; + struct mutex read_mutex; /* Serialize read_irq access */ + struct mutex mutex; /* Serialize info struct access */ + u8 *buf; + unsigned int buflen; +}; + +static const char reg_vdd_io[] = "Vdd_IO"; +static const char reg_vbat[] = "VBat"; + +/* sysfs interface */ +static ssize_t pn544_test(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pn544_info *info = dev_get_drvdata(dev); + struct i2c_client *client = info->i2c_dev; + struct pn544_nfc_platform_data *pdata = client->dev.platform_data; + + return snprintf(buf, PAGE_SIZE, "%d\n", pdata->test()); +} + +static int pn544_enable(struct pn544_info *info, int mode) +{ + struct pn544_nfc_platform_data *pdata; + struct i2c_client *client = info->i2c_dev; + + int r; + + r = regulator_bulk_enable(ARRAY_SIZE(info->regs), info->regs); + if (r < 0) + return r; + + pdata = client->dev.platform_data; + info->read_irq = PN544_NONE; + if (pdata->enable) + pdata->enable(mode); + + if (mode) { + info->state = PN544_ST_FW_READY; + dev_dbg(&client->dev, "now in FW-mode\n"); + } else { + info->state = PN544_ST_READY; + dev_dbg(&client->dev, "now in HCI-mode\n"); + } + + usleep_range(10000, 15000); + + return 0; +} + +static void pn544_disable(struct pn544_info *info) +{ + struct pn544_nfc_platform_data *pdata; + struct i2c_client *client = info->i2c_dev; + + pdata = client->dev.platform_data; + if (pdata->disable) + pdata->disable(); + + info->state = 
PN544_ST_COLD; + + dev_dbg(&client->dev, "Now in OFF-mode\n"); + + msleep(PN544_RESETVEN_TIME); + + info->read_irq = PN544_NONE; + regulator_bulk_disable(ARRAY_SIZE(info->regs), info->regs); +} + +static int check_crc(u8 *buf, int buflen) +{ + u8 len; + u16 crc; + + len = buf[0] + 1; + if (len < 4 || len != buflen || len > PN544_MSG_MAX_SIZE) { + pr_err(PN544_DRIVER_NAME + ": CRC; corrupt packet len %u (%d)\n", len, buflen); + print_hex_dump(KERN_DEBUG, "crc: ", DUMP_PREFIX_NONE, + 16, 2, buf, buflen, false); + return -EPERM; + } + crc = crc_ccitt(0xffff, buf, len - 2); + crc = ~crc; + + if (buf[len-2] != (crc & 0xff) || buf[len-1] != (crc >> 8)) { + pr_err(PN544_DRIVER_NAME ": CRC error 0x%x != 0x%x 0x%x\n", + crc, buf[len-1], buf[len-2]); + + print_hex_dump(KERN_DEBUG, "crc: ", DUMP_PREFIX_NONE, + 16, 2, buf, buflen, false); + return -EPERM; + } + return 0; +} + +static int pn544_i2c_write(struct i2c_client *client, u8 *buf, int len) +{ + int r; + + if (len < 4 || len != (buf[0] + 1)) { + dev_err(&client->dev, "%s: Illegal message length: %d\n", + __func__, len); + return -EINVAL; + } + + if (check_crc(buf, len)) + return -EINVAL; + + usleep_range(3000, 6000); + + r = i2c_master_send(client, buf, len); + dev_dbg(&client->dev, "send: %d\n", r); + + if (r == -EREMOTEIO) { /* Retry, chip was in standby */ + usleep_range(6000, 10000); + r = i2c_master_send(client, buf, len); + dev_dbg(&client->dev, "send2: %d\n", r); + } + + if (r != len) + return -EREMOTEIO; + + return r; +} + +static int pn544_i2c_read(struct i2c_client *client, u8 *buf, int buflen) +{ + int r; + u8 len; + + /* + * You could read a packet in one go, but then you'd need to read + * max size and rest would be 0xff fill, so we do split reads. + */ + r = i2c_master_recv(client, &len, 1); + dev_dbg(&client->dev, "recv1: %d\n", r); + + if (r != 1) + return -EREMOTEIO; + + if (len < PN544_LLC_HCI_OVERHEAD) + len = PN544_LLC_HCI_OVERHEAD; + else if (len > (PN544_MSG_MAX_SIZE - 1)) + len = PN544_MSG_MAX_SIZE - 1; + + if (1 + len > buflen) /* len+(data+crc16) */ + return -EMSGSIZE; + + buf[0] = len; + + r = i2c_master_recv(client, buf + 1, len); + dev_dbg(&client->dev, "recv2: %d\n", r); + + if (r != len) + return -EREMOTEIO; + + usleep_range(3000, 6000); + + return r + 1; +} + +static int pn544_fw_write(struct i2c_client *client, u8 *buf, int len) +{ + int r; + + dev_dbg(&client->dev, "%s\n", __func__); + + if (len < PN544_FW_HEADER_SIZE || + (PN544_FW_HEADER_SIZE + (buf[1] << 8) + buf[2]) != len) + return -EINVAL; + + r = i2c_master_send(client, buf, len); + dev_dbg(&client->dev, "fw send: %d\n", r); + + if (r == -EREMOTEIO) { /* Retry, chip was in standby */ + usleep_range(6000, 10000); + r = i2c_master_send(client, buf, len); + dev_dbg(&client->dev, "fw send2: %d\n", r); + } + + if (r != len) + return -EREMOTEIO; + + return r; +} + +static int pn544_fw_read(struct i2c_client *client, u8 *buf, int buflen) +{ + int r, len; + + if (buflen < PN544_FW_HEADER_SIZE) + return -EINVAL; + + r = i2c_master_recv(client, buf, PN544_FW_HEADER_SIZE); + dev_dbg(&client->dev, "FW recv1: %d\n", r); + + if (r < 0) + return r; + + if (r < PN544_FW_HEADER_SIZE) + return -EINVAL; + + len = (buf[1] << 8) + buf[2]; + if (len == 0) /* just header, no additional data */ + return r; + + if (len > buflen - PN544_FW_HEADER_SIZE) + return -EMSGSIZE; + + r = i2c_master_recv(client, buf + PN544_FW_HEADER_SIZE, len); + dev_dbg(&client->dev, "fw recv2: %d\n", r); + + if (r != len) + return -EINVAL; + + return r + PN544_FW_HEADER_SIZE; +} + +static irqreturn_t 
pn544_irq_thread_fn(int irq, void *dev_id) +{ + struct pn544_info *info = dev_id; + struct i2c_client *client = info->i2c_dev; + + BUG_ON(!info); + BUG_ON(irq != info->i2c_dev->irq); + + dev_dbg(&client->dev, "IRQ\n"); + + mutex_lock(&info->read_mutex); + info->read_irq = PN544_INT; + mutex_unlock(&info->read_mutex); + + wake_up_interruptible(&info->read_wait); + + return IRQ_HANDLED; +} + +static enum pn544_irq pn544_irq_state(struct pn544_info *info) +{ + enum pn544_irq irq; + + mutex_lock(&info->read_mutex); + irq = info->read_irq; + mutex_unlock(&info->read_mutex); + /* + * XXX: should we check GPIO-line status directly? + * return pdata->irq_status() ? PN544_INT : PN544_NONE; + */ + + return irq; +} + +static ssize_t pn544_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + struct pn544_info *info = container_of(file->private_data, + struct pn544_info, miscdev); + struct i2c_client *client = info->i2c_dev; + enum pn544_irq irq; + size_t len; + int r = 0; + + dev_dbg(&client->dev, "%s: info: %p, count: %zu\n", __func__, + info, count); + + mutex_lock(&info->mutex); + + if (info->state == PN544_ST_COLD) { + r = -ENODEV; + goto out; + } + + irq = pn544_irq_state(info); + if (irq == PN544_NONE) { + if (file->f_flags & O_NONBLOCK) { + r = -EAGAIN; + goto out; + } + + if (wait_event_interruptible(info->read_wait, + (info->read_irq == PN544_INT))) { + r = -ERESTARTSYS; + goto out; + } + } + + if (info->state == PN544_ST_FW_READY) { + len = min(count, info->buflen); + + mutex_lock(&info->read_mutex); + r = pn544_fw_read(info->i2c_dev, info->buf, len); + info->read_irq = PN544_NONE; + mutex_unlock(&info->read_mutex); + + if (r < 0) { + dev_err(&info->i2c_dev->dev, "FW read failed: %d\n", r); + goto out; + } + + print_hex_dump(KERN_DEBUG, "FW read: ", DUMP_PREFIX_NONE, + 16, 2, info->buf, r, false); + + *offset += r; + if (copy_to_user(buf, info->buf, r)) { + r = -EFAULT; + goto out; + } + } else { + len = min(count, info->buflen); + + mutex_lock(&info->read_mutex); + r = pn544_i2c_read(info->i2c_dev, info->buf, len); + info->read_irq = PN544_NONE; + mutex_unlock(&info->read_mutex); + + if (r < 0) { + dev_err(&info->i2c_dev->dev, "read failed (%d)\n", r); + goto out; + } + print_hex_dump(KERN_DEBUG, "read: ", DUMP_PREFIX_NONE, + 16, 2, info->buf, r, false); + + *offset += r; + if (copy_to_user(buf, info->buf, r)) { + r = -EFAULT; + goto out; + } + } + +out: + mutex_unlock(&info->mutex); + + return r; +} + +static unsigned int pn544_poll(struct file *file, poll_table *wait) +{ + struct pn544_info *info = container_of(file->private_data, + struct pn544_info, miscdev); + struct i2c_client *client = info->i2c_dev; + int r = 0; + + dev_dbg(&client->dev, "%s: info: %p\n", __func__, info); + + mutex_lock(&info->mutex); + + if (info->state == PN544_ST_COLD) { + r = -ENODEV; + goto out; + } + + poll_wait(file, &info->read_wait, wait); + + if (pn544_irq_state(info) == PN544_INT) { + r = POLLIN | POLLRDNORM; + goto out; + } +out: + mutex_unlock(&info->mutex); + + return r; +} + +static ssize_t pn544_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct pn544_info *info = container_of(file->private_data, + struct pn544_info, miscdev); + struct i2c_client *client = info->i2c_dev; + ssize_t len; + int r; + + dev_dbg(&client->dev, "%s: info: %p, count %zu\n", __func__, + info, count); + + mutex_lock(&info->mutex); + + if (info->state == PN544_ST_COLD) { + r = -ENODEV; + goto out; + } + + /* + * XXX: should we detect rset-writes and clean possible 
+ * read_irq state + */ + if (info->state == PN544_ST_FW_READY) { + size_t fw_len; + + if (count < PN544_FW_HEADER_SIZE) { + r = -EINVAL; + goto out; + } + + len = min(count, info->buflen); + if (copy_from_user(info->buf, buf, len)) { + r = -EFAULT; + goto out; + } + + print_hex_dump(KERN_DEBUG, "FW write: ", DUMP_PREFIX_NONE, + 16, 2, info->buf, len, false); + + fw_len = PN544_FW_HEADER_SIZE + (info->buf[1] << 8) + + info->buf[2]; + + if (len > fw_len) /* 1 msg at a time */ + len = fw_len; + + r = pn544_fw_write(info->i2c_dev, info->buf, len); + } else { + if (count < PN544_LLC_MIN_SIZE) { + r = -EINVAL; + goto out; + } + + len = min(count, info->buflen); + if (copy_from_user(info->buf, buf, len)) { + r = -EFAULT; + goto out; + } + + print_hex_dump(KERN_DEBUG, "write: ", DUMP_PREFIX_NONE, + 16, 2, info->buf, len, false); + + if (len > (info->buf[0] + 1)) /* 1 msg at a time */ + len = info->buf[0] + 1; + + r = pn544_i2c_write(info->i2c_dev, info->buf, len); + } +out: + mutex_unlock(&info->mutex); + + return r; + +} + +static long pn544_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pn544_info *info = container_of(file->private_data, + struct pn544_info, miscdev); + struct i2c_client *client = info->i2c_dev; + struct pn544_nfc_platform_data *pdata; + unsigned int val; + int r = 0; + + dev_dbg(&client->dev, "%s: info: %p, cmd: 0x%x\n", __func__, info, cmd); + + mutex_lock(&info->mutex); + + if (info->state == PN544_ST_COLD) { + r = -ENODEV; + goto out; + } + + pdata = info->i2c_dev->dev.platform_data; + switch (cmd) { + case PN544_GET_FW_MODE: + dev_dbg(&client->dev, "%s: PN544_GET_FW_MODE\n", __func__); + + val = (info->state == PN544_ST_FW_READY); + if (copy_to_user((void __user *)arg, &val, sizeof(val))) { + r = -EFAULT; + goto out; + } + + break; + + case PN544_SET_FW_MODE: + dev_dbg(&client->dev, "%s: PN544_SET_FW_MODE\n", __func__); + + if (copy_from_user(&val, (void __user *)arg, sizeof(val))) { + r = -EFAULT; + goto out; + } + + if (val) { + if (info->state == PN544_ST_FW_READY) + break; + + pn544_disable(info); + r = pn544_enable(info, FW_MODE); + if (r < 0) + goto out; + } else { + if (info->state == PN544_ST_READY) + break; + pn544_disable(info); + r = pn544_enable(info, HCI_MODE); + if (r < 0) + goto out; + } + file->f_pos = info->read_offset; + break; + + case TCGETS: + dev_dbg(&client->dev, "%s: TCGETS\n", __func__); + + r = -ENOIOCTLCMD; + break; + + default: + dev_err(&client->dev, "Unknown ioctl 0x%x\n", cmd); + r = -ENOIOCTLCMD; + break; + } + +out: + mutex_unlock(&info->mutex); + + return r; +} + +static int pn544_open(struct inode *inode, struct file *file) +{ + struct pn544_info *info = container_of(file->private_data, + struct pn544_info, miscdev); + struct i2c_client *client = info->i2c_dev; + int r = 0; + + dev_dbg(&client->dev, "%s: info: %p, client %p\n", __func__, + info, info->i2c_dev); + + mutex_lock(&info->mutex); + + /* + * Only 1 at a time. 
+ * XXX: maybe user (counter) would work better + */ + if (info->state != PN544_ST_COLD) { + r = -EBUSY; + goto out; + } + + file->f_pos = info->read_offset; + r = pn544_enable(info, HCI_MODE); + +out: + mutex_unlock(&info->mutex); + return r; +} + +static int pn544_close(struct inode *inode, struct file *file) +{ + struct pn544_info *info = container_of(file->private_data, + struct pn544_info, miscdev); + struct i2c_client *client = info->i2c_dev; + + dev_dbg(&client->dev, "%s: info: %p, client %p\n", + __func__, info, info->i2c_dev); + + mutex_lock(&info->mutex); + pn544_disable(info); + mutex_unlock(&info->mutex); + + return 0; +} + +static const struct file_operations pn544_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = pn544_read, + .write = pn544_write, + .poll = pn544_poll, + .open = pn544_open, + .release = pn544_close, + .unlocked_ioctl = pn544_ioctl, +}; + +#ifdef CONFIG_PM +static int pn544_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct pn544_info *info; + int r = 0; + + dev_info(&client->dev, "***\n%s: client %p\n***\n", __func__, client); + + info = i2c_get_clientdata(client); + dev_info(&client->dev, "%s: info: %p, client %p\n", __func__, + info, client); + + mutex_lock(&info->mutex); + + switch (info->state) { + case PN544_ST_FW_READY: + /* Do not suspend while upgrading FW, please! */ + r = -EPERM; + break; + + case PN544_ST_READY: + /* + * CHECK: Device should be in standby-mode. No way to check? + * Allowing low power mode for the regulator is potentially + * dangerous if pn544 does not go to suspension. + */ + break; + + case PN544_ST_COLD: + break; + }; + + mutex_unlock(&info->mutex); + return r; +} + +static int pn544_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct pn544_info *info = i2c_get_clientdata(client); + int r = 0; + + dev_dbg(&client->dev, "%s: info: %p, client %p\n", __func__, + info, client); + + mutex_lock(&info->mutex); + + switch (info->state) { + case PN544_ST_READY: + /* + * CHECK: If regulator low power mode is allowed in + * pn544_suspend, we should go back to normal mode + * here. 
+ */ + break; + + case PN544_ST_COLD: + break; + + case PN544_ST_FW_READY: + break; + }; + + mutex_unlock(&info->mutex); + + return r; +} + +static SIMPLE_DEV_PM_OPS(pn544_pm_ops, pn544_suspend, pn544_resume); +#endif + +static struct device_attribute pn544_attr = + __ATTR(nfc_test, S_IRUGO, pn544_test, NULL); + +static int __devinit pn544_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct pn544_info *info; + struct pn544_nfc_platform_data *pdata; + int r = 0; + + dev_dbg(&client->dev, "%s\n", __func__); + dev_dbg(&client->dev, "IRQ: %d\n", client->irq); + + /* private data allocation */ + info = kzalloc(sizeof(struct pn544_info), GFP_KERNEL); + if (!info) { + dev_err(&client->dev, + "Cannot allocate memory for pn544_info.\n"); + r = -ENOMEM; + goto err_info_alloc; + } + + info->buflen = max(PN544_MSG_MAX_SIZE, PN544_MAX_I2C_TRANSFER); + info->buf = kzalloc(info->buflen, GFP_KERNEL); + if (!info->buf) { + dev_err(&client->dev, + "Cannot allocate memory for pn544_info->buf.\n"); + r = -ENOMEM; + goto err_buf_alloc; + } + + info->regs[0].supply = reg_vdd_io; + info->regs[1].supply = reg_vbat; + r = regulator_bulk_get(&client->dev, ARRAY_SIZE(info->regs), + info->regs); + if (r < 0) + goto err_kmalloc; + + info->i2c_dev = client; + info->state = PN544_ST_COLD; + info->read_irq = PN544_NONE; + mutex_init(&info->read_mutex); + mutex_init(&info->mutex); + init_waitqueue_head(&info->read_wait); + i2c_set_clientdata(client, info); + pdata = client->dev.platform_data; + if (!pdata) { + dev_err(&client->dev, "No platform data\n"); + r = -EINVAL; + goto err_reg; + } + + if (!pdata->request_resources) { + dev_err(&client->dev, "request_resources() missing\n"); + r = -EINVAL; + goto err_reg; + } + + r = pdata->request_resources(client); + if (r) { + dev_err(&client->dev, "Cannot get platform resources\n"); + goto err_reg; + } + + r = request_threaded_irq(client->irq, NULL, pn544_irq_thread_fn, + IRQF_TRIGGER_RISING, PN544_DRIVER_NAME, + info); + if (r < 0) { + dev_err(&client->dev, "Unable to register IRQ handler\n"); + goto err_res; + } + + /* If we don't have the test we don't need the sysfs file */ + if (pdata->test) { + r = device_create_file(&client->dev, &pn544_attr); + if (r) { + dev_err(&client->dev, + "sysfs registration failed, error %d\n", r); + goto err_irq; + } + } + + info->miscdev.minor = MISC_DYNAMIC_MINOR; + info->miscdev.name = PN544_DRIVER_NAME; + info->miscdev.fops = &pn544_fops; + info->miscdev.parent = &client->dev; + r = misc_register(&info->miscdev); + if (r < 0) { + dev_err(&client->dev, "Device registration failed\n"); + goto err_sysfs; + } + + dev_dbg(&client->dev, "%s: info: %p, pdata %p, client %p\n", + __func__, info, pdata, client); + + return 0; + +err_sysfs: + if (pdata->test) + device_remove_file(&client->dev, &pn544_attr); +err_irq: + free_irq(client->irq, info); +err_res: + if (pdata->free_resources) + pdata->free_resources(); +err_reg: + regulator_bulk_free(ARRAY_SIZE(info->regs), info->regs); +err_kmalloc: + kfree(info->buf); +err_buf_alloc: + kfree(info); +err_info_alloc: + return r; +} + +static __devexit int pn544_remove(struct i2c_client *client) +{ + struct pn544_info *info = i2c_get_clientdata(client); + struct pn544_nfc_platform_data *pdata = client->dev.platform_data; + + dev_dbg(&client->dev, "%s\n", __func__); + + misc_deregister(&info->miscdev); + if (pdata->test) + device_remove_file(&client->dev, &pn544_attr); + + if (info->state != PN544_ST_COLD) { + if (pdata->disable) + pdata->disable(); + + info->read_irq = PN544_NONE; + 
} + + free_irq(client->irq, info); + if (pdata->free_resources) + pdata->free_resources(); + + regulator_bulk_free(ARRAY_SIZE(info->regs), info->regs); + kfree(info->buf); + kfree(info); + + return 0; +} + +static struct i2c_driver pn544_driver = { + .driver = { + .name = PN544_DRIVER_NAME, +#ifdef CONFIG_PM + .pm = &pn544_pm_ops, +#endif + }, + .probe = pn544_probe, + .id_table = pn544_id_table, + .remove = __devexit_p(pn544_remove), +}; + +static int __init pn544_init(void) +{ + int r; + + pr_debug(DRIVER_DESC ": %s\n", __func__); + + r = i2c_add_driver(&pn544_driver); + if (r) { + pr_err(PN544_DRIVER_NAME ": driver registration failed\n"); + return r; + } + + return 0; +} + +static void __exit pn544_exit(void) +{ + i2c_del_driver(&pn544_driver); + pr_info(DRIVER_DESC ", Exiting.\n"); +} + +module_init(pn544_init); +module_exit(pn544_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/include/linux/nfc/pn544.h b/include/linux/nfc/pn544.h new file mode 100644 index 000000000000..7ab8521f2347 --- /dev/null +++ b/include/linux/nfc/pn544.h @@ -0,0 +1,97 @@ +/* + * Driver include for the PN544 NFC chip. + * + * Copyright (C) Nokia Corporation + * + * Author: Jari Vanhala + * Contact: Matti Aaltoenn + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _PN544_H_ +#define _PN544_H_ + +#include + +#define PN544_DRIVER_NAME "pn544" +#define PN544_MAXWINDOW_SIZE 7 +#define PN544_WINDOW_SIZE 4 +#define PN544_RETRIES 10 +#define PN544_MAX_I2C_TRANSFER 0x0400 +#define PN544_MSG_MAX_SIZE 0x21 /* at normal HCI mode */ + +/* ioctl */ +#define PN544_CHAR_BASE 'P' +#define PN544_IOR(num, dtype) _IOR(PN544_CHAR_BASE, num, dtype) +#define PN544_IOW(num, dtype) _IOW(PN544_CHAR_BASE, num, dtype) +#define PN544_GET_FW_MODE PN544_IOW(1, unsigned int) +#define PN544_SET_FW_MODE PN544_IOW(2, unsigned int) +#define PN544_GET_DEBUG PN544_IOW(3, unsigned int) +#define PN544_SET_DEBUG PN544_IOW(4, unsigned int) + +/* Timing restrictions (ms) */ +#define PN544_RESETVEN_TIME 30 /* 7 */ +#define PN544_PVDDVEN_TIME 0 +#define PN544_VBATVEN_TIME 0 +#define PN544_GPIO4VEN_TIME 0 +#define PN544_WAKEUP_ACK 5 +#define PN544_WAKEUP_GUARD (PN544_WAKEUP_ACK + 1) +#define PN544_INACTIVITY_TIME 1000 +#define PN544_INTERFRAME_DELAY 200 /* us */ +#define PN544_BAUDRATE_CHANGE 150 /* us */ + +/* Debug bits */ +#define PN544_DEBUG_BUF 0x01 +#define PN544_DEBUG_READ 0x02 +#define PN544_DEBUG_WRITE 0x04 +#define PN544_DEBUG_IRQ 0x08 +#define PN544_DEBUG_CALLS 0x10 +#define PN544_DEBUG_MODE 0x20 + +/* Normal (HCI) mode */ +#define PN544_LLC_HCI_OVERHEAD 3 /* header + crc (to length) */ +#define PN544_LLC_MIN_SIZE (1 + PN544_LLC_HCI_OVERHEAD) /* length + */ +#define PN544_LLC_MAX_DATA (PN544_MSG_MAX_SIZE - 2) +#define PN544_LLC_MAX_HCI_SIZE (PN544_LLC_MAX_DATA - 2) + +struct pn544_llc_packet { + unsigned char length; /* of rest of packet */ + unsigned char header; + unsigned char 
data[PN544_LLC_MAX_DATA]; /* includes crc-ccitt */ +}; + +/* Firmware upgrade mode */ +#define PN544_FW_HEADER_SIZE 3 +/* max fw transfer is 1024bytes, but I2C limits it to 0xC0 */ +#define PN544_MAX_FW_DATA (PN544_MAX_I2C_TRANSFER - PN544_FW_HEADER_SIZE) + +struct pn544_fw_packet { + unsigned char command; /* status in answer */ + unsigned char length[2]; /* big-endian order (msf) */ + unsigned char data[PN544_MAX_FW_DATA]; +}; + +#ifdef __KERNEL__ +/* board config */ +struct pn544_nfc_platform_data { + int (*request_resources) (struct i2c_client *client); + void (*free_resources) (void); + void (*enable) (int fw); + int (*test) (void); + void (*disable) (void); +}; +#endif /* __KERNEL__ */ + +#endif /* _PN544_H_ */ -- cgit v1.2.3-71-gd317 From 7a21a3cc0be92e70474cc2ab06cb074f6a7c3f09 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:49 -0800 Subject: pps: trivial fixes Here are some very trivial fixes combined: - add macro definitions to protect header file from including several times - remove declaration for an unexistent array - fix typos Signed-off-by: Alexander Gordeev Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/kapi.c | 2 +- include/linux/pps_kernel.h | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c index 1aa02db3ff4e..55f39618261b 100644 --- a/drivers/pps/kapi.c +++ b/drivers/pps/kapi.c @@ -324,7 +324,7 @@ void pps_event(int source, struct pps_ktime *ts, int event, void *data) captured = ~0; } - /* Wake up iif captured somthing */ + /* Wake up if captured something */ if (captured) { pps->go = ~0; wake_up_interruptible(&pps->queue); diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index e0a193f830ef..23f63c3263bb 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -18,6 +18,9 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#ifndef LINUX_PPS_KERNEL_H +#define LINUX_PPS_KERNEL_H + #include #include @@ -71,7 +74,6 @@ struct pps_device { extern spinlock_t pps_idr_lock; extern struct idr pps_idr; -extern struct timespec pps_irq_ts[]; extern struct device_attribute pps_attrs[]; @@ -87,3 +89,5 @@ extern void pps_unregister_source(int source); extern int pps_register_cdev(struct pps_device *pps); extern void pps_unregister_cdev(struct pps_device *pps); extern void pps_event(int source, struct pps_ktime *ts, int event, void *data); + +#endif /* LINUX_PPS_KERNEL_H */ -- cgit v1.2.3-71-gd317 From 3003d55b59aa98aeaff2773df69732b27c0cbf6a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:50 -0800 Subject: pps: fix race in PPS_FETCH handler There was a race in PPS_FETCH ioctl handler when several processes want to obtain PPS data simultaneously using sleeping PPS_FETCH. They all sleep most of the time in the system call. With the old approach when the first process waiting on the pps queue is waken up it makes new system call right away and zeroes pps->go. So other processes continue to sleep. This is a clear race condition because of the global 'go' variable. With the new approach pps->last_ev holds some value increasing at each PPS event. PPS_FETCH ioctl handler saves current value to the local variable at the very beginning so it can safely check that there is a new event by just comparing both variables. 
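In other words, the waiters now compare a shared, monotonically increasing event counter against a private snapshot instead of racing over one global flag. A minimal sketch of the pattern (simplified, illustrative names only, not the driver code itself):

	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(ev_queue);
	static unsigned int last_ev;		/* bumped on every PPS event */

	/* event path: advance the counter and wake all sleepers */
	static void report_event(void)
	{
		last_ev++;
		wake_up_interruptible_all(&ev_queue);
	}

	/* PPS_FETCH-style path: each caller takes its own snapshot first */
	static int wait_for_event(void)
	{
		unsigned int ev = last_ev;

		/* 0 when a new event arrived, -ERESTARTSYS if interrupted */
		return wait_event_interruptible(ev_queue, ev != last_ev);
	}

Because every sleeper keeps its own snapshot, one process consuming the wakeup can no longer cause the others to miss the event.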
Signed-off-by: Alexander Gordeev Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/kapi.c | 4 ++-- drivers/pps/pps.c | 10 +++++++--- include/linux/pps_kernel.h | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c index 55f39618261b..3f89f5eba81c 100644 --- a/drivers/pps/kapi.c +++ b/drivers/pps/kapi.c @@ -326,8 +326,8 @@ void pps_event(int source, struct pps_ktime *ts, int event, void *data) /* Wake up if captured something */ if (captured) { - pps->go = ~0; - wake_up_interruptible(&pps->queue); + pps->last_ev++; + wake_up_interruptible_all(&pps->queue); kill_fasync(&pps->async_queue, SIGIO, POLL_IN); } diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c index c76afb980a98..dc7e66cb2762 100644 --- a/drivers/pps/pps.c +++ b/drivers/pps/pps.c @@ -136,6 +136,7 @@ static long pps_cdev_ioctl(struct file *file, case PPS_FETCH: { struct pps_fdata fdata; + unsigned int ev; pr_debug("PPS_FETCH: source %d\n", pps->id); @@ -143,11 +144,12 @@ static long pps_cdev_ioctl(struct file *file, if (err) return -EFAULT; - pps->go = 0; + ev = pps->last_ev; /* Manage the timeout */ if (fdata.timeout.flags & PPS_TIME_INVALID) - err = wait_event_interruptible(pps->queue, pps->go); + err = wait_event_interruptible(pps->queue, + ev != pps->last_ev); else { unsigned long ticks; @@ -159,7 +161,9 @@ static long pps_cdev_ioctl(struct file *file, if (ticks != 0) { err = wait_event_interruptible_timeout( - pps->queue, pps->go, ticks); + pps->queue, + ev != pps->last_ev, + ticks); if (err == 0) return -ETIMEDOUT; } diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 23f63c3263bb..65194fe498bb 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -55,7 +55,7 @@ struct pps_device { struct pps_ktime clear_tu; int current_mode; /* PPS mode at event time */ - int go; /* PPS event is arrived? */ + unsigned int last_ev; /* last PPS event id */ wait_queue_head_t queue; /* PPS event queue */ unsigned int id; /* PPS source unique ID */ -- cgit v1.2.3-71-gd317 From 6f4229b51106cbc859e9d8209b22c8a2ec749e64 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:50 -0800 Subject: pps: unify timestamp gathering Add a helper function to gather timestamps. This way clients don't have to duplicate it. Signed-off-by: Alexander Gordeev Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/clients/pps-ktimer.c | 9 ++------- drivers/pps/clients/pps-ldisc.c | 18 ++++++------------ drivers/pps/kapi.c | 19 ++++++++++++------- include/linux/pps_kernel.h | 20 +++++++++++++++++++- include/linux/serial_core.h | 5 +++-- include/linux/tty_ldisc.h | 5 +++-- 6 files changed, 45 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/pps/clients/pps-ktimer.c b/drivers/pps/clients/pps-ktimer.c index e7ef5b8186d0..e1bdd8bc8c9c 100644 --- a/drivers/pps/clients/pps-ktimer.c +++ b/drivers/pps/clients/pps-ktimer.c @@ -40,18 +40,13 @@ static struct timer_list ktimer; static void pps_ktimer_event(unsigned long ptr) { - struct timespec __ts; - struct pps_ktime ts; + struct pps_event_time ts; /* First of all we get the time stamp... */ - getnstimeofday(&__ts); + pps_get_ts(&ts); pr_info("PPS event at %lu\n", jiffies); - /* ... 
and translate it to PPS time data struct */ - ts.sec = __ts.tv_sec; - ts.nsec = __ts.tv_nsec; - pps_event(source, &ts, PPS_CAPTUREASSERT, NULL); mod_timer(&ktimer, jiffies + HZ); diff --git a/drivers/pps/clients/pps-ldisc.c b/drivers/pps/clients/pps-ldisc.c index 8e1932d29fd4..20fc9f7645c8 100644 --- a/drivers/pps/clients/pps-ldisc.c +++ b/drivers/pps/clients/pps-ldisc.c @@ -27,26 +27,20 @@ #define PPS_TTY_MAGIC 0x0001 static void pps_tty_dcd_change(struct tty_struct *tty, unsigned int status, - struct timespec *ts) + struct pps_event_time *ts) { int id = (long)tty->disc_data; - struct timespec __ts; - struct pps_ktime pps_ts; + struct pps_event_time __ts; /* First of all we get the time stamp... */ - getnstimeofday(&__ts); + pps_get_ts(&__ts); /* Does caller give us a timestamp? */ - if (ts) { /* Yes. Let's use it! */ - pps_ts.sec = ts->tv_sec; - pps_ts.nsec = ts->tv_nsec; - } else { /* No. Do it ourself! */ - pps_ts.sec = __ts.tv_sec; - pps_ts.nsec = __ts.tv_nsec; - } + if (!ts) /* No. Do it ourself! */ + ts = &__ts; /* Now do the PPS event report */ - pps_event(id, &pps_ts, status ? PPS_CAPTUREASSERT : PPS_CAPTURECLEAR, + pps_event(id, ts, status ? PPS_CAPTUREASSERT : PPS_CAPTURECLEAR, NULL); pr_debug("PPS %s at %lu on source #%d\n", diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c index 3f89f5eba81c..b431d83b824a 100644 --- a/drivers/pps/kapi.c +++ b/drivers/pps/kapi.c @@ -268,11 +268,12 @@ EXPORT_SYMBOL(pps_unregister_source); * pps->info.echo(source, event, data); */ -void pps_event(int source, struct pps_ktime *ts, int event, void *data) +void pps_event(int source, struct pps_event_time *ts, int event, void *data) { struct pps_device *pps; unsigned long flags; int captured = 0; + struct pps_ktime ts_real; if ((event & (PPS_CAPTUREASSERT | PPS_CAPTURECLEAR)) == 0) { printk(KERN_ERR "pps: unknown event (%x) for source %d\n", @@ -284,8 +285,10 @@ void pps_event(int source, struct pps_ktime *ts, int event, void *data) if (!pps) return; - pr_debug("PPS event on source %d at %llu.%06u\n", - pps->id, (unsigned long long) ts->sec, ts->nsec); + pr_debug("PPS event on source %d at %ld.%09ld\n", + pps->id, ts->ts_real.tv_sec, ts->ts_real.tv_nsec); + + timespec_to_pps_ktime(&ts_real, ts->ts_real); spin_lock_irqsave(&pps->lock, flags); @@ -299,10 +302,11 @@ void pps_event(int source, struct pps_ktime *ts, int event, void *data) (pps->params.mode & PPS_CAPTUREASSERT)) { /* We have to add an offset? */ if (pps->params.mode & PPS_OFFSETASSERT) - pps_add_offset(ts, &pps->params.assert_off_tu); + pps_add_offset(&ts_real, + &pps->params.assert_off_tu); /* Save the time stamp */ - pps->assert_tu = *ts; + pps->assert_tu = ts_real; pps->assert_sequence++; pr_debug("capture assert seq #%u for source %d\n", pps->assert_sequence, source); @@ -313,10 +317,11 @@ void pps_event(int source, struct pps_ktime *ts, int event, void *data) (pps->params.mode & PPS_CAPTURECLEAR)) { /* We have to add an offset? 
*/ if (pps->params.mode & PPS_OFFSETCLEAR) - pps_add_offset(ts, &pps->params.clear_off_tu); + pps_add_offset(&ts_real, + &pps->params.clear_off_tu); /* Save the time stamp */ - pps->clear_tu = *ts; + pps->clear_tu = ts_real; pps->clear_sequence++; pr_debug("capture clear seq #%u for source %d\n", pps->clear_sequence, source); diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 65194fe498bb..32aa6763ca1b 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -43,6 +43,10 @@ struct pps_source_info { struct device *dev; }; +struct pps_event_time { + struct timespec ts_real; +}; + /* The main struct */ struct pps_device { struct pps_source_info info; /* PSS source info */ @@ -88,6 +92,20 @@ extern int pps_register_source(struct pps_source_info *info, extern void pps_unregister_source(int source); extern int pps_register_cdev(struct pps_device *pps); extern void pps_unregister_cdev(struct pps_device *pps); -extern void pps_event(int source, struct pps_ktime *ts, int event, void *data); +extern void pps_event(int source, struct pps_event_time *ts, int event, + void *data); + +static inline void timespec_to_pps_ktime(struct pps_ktime *kt, + struct timespec ts) +{ + kt->sec = ts.tv_sec; + kt->nsec = ts.tv_nsec; +} + +static inline void pps_get_ts(struct pps_event_time *ts) +{ + getnstimeofday(&ts->ts_real); +} #endif /* LINUX_PPS_KERNEL_H */ + diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index a23fa29d4eb0..758c5b0c6fd3 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -212,6 +212,7 @@ #include #include #include +#include struct uart_port; struct serial_struct; @@ -528,10 +529,10 @@ uart_handle_dcd_change(struct uart_port *uport, unsigned int status) struct uart_state *state = uport->state; struct tty_port *port = &state->port; struct tty_ldisc *ld = tty_ldisc_ref(port->tty); - struct timespec ts; + struct pps_event_time ts; if (ld && ld->ops->dcd_change) - getnstimeofday(&ts); + pps_get_ts(&ts); uport->icount.dcd++; #ifdef CONFIG_HARD_PPS diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 526d66f066a3..763e061e19f1 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -101,7 +101,7 @@ * any pending driver I/O is completed. * * void (*dcd_change)(struct tty_struct *tty, unsigned int status, - * struct timespec *ts) + * struct pps_event_time *ts) * * Tells the discipline that the DCD pin has changed its status and * the relative timestamp. Pointer ts can be NULL. @@ -109,6 +109,7 @@ #include #include +#include struct tty_ldisc_ops { int magic; @@ -143,7 +144,7 @@ struct tty_ldisc_ops { char *fp, int count); void (*write_wakeup)(struct tty_struct *); void (*dcd_change)(struct tty_struct *, unsigned int, - struct timespec *); + struct pps_event_time *); struct module *owner; -- cgit v1.2.3-71-gd317 From 5e196d34a776420278e4117b4742cd9d3f2350ed Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:51 -0800 Subject: pps: access pps device by direct pointer Using device index as a pointer needs some unnecessary work to be done every time the pointer is needed (in irq handler for example). Using a direct pointer is much more easy (and safe as well). 
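The cost being removed is roughly the following per-event lookup; a sketch with simplified, illustrative names (the real code lives in drivers/pps/kapi.c):

	#include <linux/idr.h>
	#include <linux/spinlock.h>

	struct pps_device;			/* opaque here */

	/* before: translate the integer ID under a lock on every event */
	static struct pps_device *lookup_source(struct idr *idr,
						spinlock_t *lock, int source)
	{
		struct pps_device *pps;
		unsigned long flags;

		spin_lock_irqsave(lock, flags);
		pps = idr_find(idr, source);
		spin_unlock_irqrestore(lock, flags);

		return pps;			/* may be NULL */
	}

	/* after: the client simply keeps the pointer handed back by
	 * pps_register_source() and passes it straight to pps_event() */

Clients therefore hold a pps_device pointer for the lifetime of the source, and the ID lookup plus refcount juggling disappears from the hot path.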
Signed-off-by: Alexander Gordeev Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/clients/pps-ktimer.c | 30 ++++------ drivers/pps/clients/pps-ldisc.c | 41 ++++++++----- drivers/pps/kapi.c | 124 +++++++++------------------------------ drivers/pps/pps.c | 22 ++----- include/linux/pps_kernel.h | 23 +++----- 5 files changed, 82 insertions(+), 158 deletions(-) (limited to 'include/linux') diff --git a/drivers/pps/clients/pps-ktimer.c b/drivers/pps/clients/pps-ktimer.c index e1bdd8bc8c9c..966ead188ee2 100644 --- a/drivers/pps/clients/pps-ktimer.c +++ b/drivers/pps/clients/pps-ktimer.c @@ -31,7 +31,7 @@ * Global variables */ -static int source; +static struct pps_device *pps; static struct timer_list ktimer; /* @@ -47,7 +47,7 @@ static void pps_ktimer_event(unsigned long ptr) pr_info("PPS event at %lu\n", jiffies); - pps_event(source, &ts, PPS_CAPTUREASSERT, NULL); + pps_event(pps, &ts, PPS_CAPTUREASSERT, NULL); mod_timer(&ktimer, jiffies + HZ); } @@ -56,12 +56,11 @@ static void pps_ktimer_event(unsigned long ptr) * The echo function */ -static void pps_ktimer_echo(int source, int event, void *data) +static void pps_ktimer_echo(struct pps_device *pps, int event, void *data) { - pr_info("echo %s %s for source %d\n", + dev_info(pps->dev, "echo %s %s\n", event & PPS_CAPTUREASSERT ? "assert" : "", - event & PPS_CAPTURECLEAR ? "clear" : "", - source); + event & PPS_CAPTURECLEAR ? "clear" : ""); } /* @@ -84,30 +83,27 @@ static struct pps_source_info pps_ktimer_info = { static void __exit pps_ktimer_exit(void) { - del_timer_sync(&ktimer); - pps_unregister_source(source); + dev_info(pps->dev, "ktimer PPS source unregistered\n"); - pr_info("ktimer PPS source unregistered\n"); + del_timer_sync(&ktimer); + pps_unregister_source(pps); } static int __init pps_ktimer_init(void) { - int ret; - - ret = pps_register_source(&pps_ktimer_info, + pps = pps_register_source(&pps_ktimer_info, PPS_CAPTUREASSERT | PPS_OFFSETASSERT); - if (ret < 0) { + if (pps == NULL) { printk(KERN_ERR "cannot register ktimer source\n"); - return ret; + return -ENOMEM; } - source = ret; setup_timer(&ktimer, pps_ktimer_event, 0); mod_timer(&ktimer, jiffies + HZ); - pr_info("ktimer PPS source registered at %d\n", source); + dev_info(pps->dev, "ktimer PPS source registered\n"); - return 0; + return 0; } module_init(pps_ktimer_init); diff --git a/drivers/pps/clients/pps-ldisc.c b/drivers/pps/clients/pps-ldisc.c index 20fc9f7645c8..1517f5498ec0 100644 --- a/drivers/pps/clients/pps-ldisc.c +++ b/drivers/pps/clients/pps-ldisc.c @@ -29,7 +29,7 @@ static void pps_tty_dcd_change(struct tty_struct *tty, unsigned int status, struct pps_event_time *ts) { - int id = (long)tty->disc_data; + struct pps_device *pps = (struct pps_device *)tty->disc_data; struct pps_event_time __ts; /* First of all we get the time stamp... */ @@ -39,12 +39,14 @@ static void pps_tty_dcd_change(struct tty_struct *tty, unsigned int status, if (!ts) /* No. Do it ourself! */ ts = &__ts; + BUG_ON(pps == NULL); + /* Now do the PPS event report */ - pps_event(id, ts, status ? PPS_CAPTUREASSERT : PPS_CAPTURECLEAR, - NULL); + pps_event(pps, ts, status ? PPS_CAPTUREASSERT : + PPS_CAPTURECLEAR, NULL); - pr_debug("PPS %s at %lu on source #%d\n", - status ? "assert" : "clear", jiffies, id); + dev_dbg(pps->dev, "PPS %s at %lu\n", + status ? 
"assert" : "clear", jiffies); } static int (*alias_n_tty_open)(struct tty_struct *tty); @@ -54,6 +56,7 @@ static int pps_tty_open(struct tty_struct *tty) struct pps_source_info info; struct tty_driver *drv = tty->driver; int index = tty->index + drv->name_base; + struct pps_device *pps; int ret; info.owner = THIS_MODULE; @@ -64,34 +67,42 @@ static int pps_tty_open(struct tty_struct *tty) PPS_OFFSETASSERT | PPS_OFFSETCLEAR | \ PPS_CANWAIT | PPS_TSFMT_TSPEC; - ret = pps_register_source(&info, PPS_CAPTUREBOTH | \ + pps = pps_register_source(&info, PPS_CAPTUREBOTH | \ PPS_OFFSETASSERT | PPS_OFFSETCLEAR); - if (ret < 0) { + if (pps == NULL) { pr_err("cannot register PPS source \"%s\"\n", info.path); - return ret; + return -ENOMEM; } - tty->disc_data = (void *)(long)ret; + tty->disc_data = pps; /* Should open N_TTY ldisc too */ ret = alias_n_tty_open(tty); - if (ret < 0) - pps_unregister_source((long)tty->disc_data); + if (ret < 0) { + pr_err("cannot open tty ldisc \"%s\"\n", info.path); + goto err_unregister; + } - pr_info("PPS source #%d \"%s\" added\n", ret, info.path); + dev_info(pps->dev, "source \"%s\" added\n", info.path); return 0; + +err_unregister: + tty->disc_data = NULL; + pps_unregister_source(pps); + return ret; } static void (*alias_n_tty_close)(struct tty_struct *tty); static void pps_tty_close(struct tty_struct *tty) { - int id = (long)tty->disc_data; + struct pps_device *pps = (struct pps_device *)tty->disc_data; - pps_unregister_source(id); alias_n_tty_close(tty); - pr_info("PPS source #%d removed\n", id); + tty->disc_data = NULL; + dev_info(pps->dev, "removed\n"); + pps_unregister_source(pps); } static struct tty_ldisc_ops pps_ldisc_ops; diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c index b431d83b824a..98d4012ca595 100644 --- a/drivers/pps/kapi.c +++ b/drivers/pps/kapi.c @@ -32,11 +32,11 @@ #include /* - * Global variables + * Local variables */ -DEFINE_SPINLOCK(pps_idr_lock); -DEFINE_IDR(pps_idr); +static DEFINE_SPINLOCK(pps_idr_lock); +static DEFINE_IDR(pps_idr); /* * Local functions @@ -60,60 +60,6 @@ static void pps_add_offset(struct pps_ktime *ts, struct pps_ktime *offset) * Exported functions */ -/* pps_get_source - find a PPS source - * @source: the PPS source ID. - * - * This function is used to find an already registered PPS source into the - * system. - * - * The function returns NULL if found nothing, otherwise it returns a pointer - * to the PPS source data struct (the refcounter is incremented by 1). - */ - -struct pps_device *pps_get_source(int source) -{ - struct pps_device *pps; - unsigned long flags; - - spin_lock_irqsave(&pps_idr_lock, flags); - - pps = idr_find(&pps_idr, source); - if (pps != NULL) - atomic_inc(&pps->usage); - - spin_unlock_irqrestore(&pps_idr_lock, flags); - - return pps; -} - -/* pps_put_source - free the PPS source data - * @pps: a pointer to the PPS source. - * - * This function is used to free a PPS data struct if its refcount is 0. - */ - -void pps_put_source(struct pps_device *pps) -{ - unsigned long flags; - - spin_lock_irqsave(&pps_idr_lock, flags); - BUG_ON(atomic_read(&pps->usage) == 0); - - if (!atomic_dec_and_test(&pps->usage)) { - pps = NULL; - goto exit; - } - - /* No more reference to the PPS source. We can safely remove the - * PPS data struct. 
- */ - idr_remove(&pps_idr, pps->id); - -exit: - spin_unlock_irqrestore(&pps_idr_lock, flags); - kfree(pps); -} - /* pps_register_source - add a PPS source in the system * @info: the PPS info struct * @default_params: the default PPS parameters of the new source @@ -122,10 +68,11 @@ exit: * source is described by info's fields and it will have, as default PPS * parameters, the ones specified into default_params. * - * The function returns, in case of success, the PPS source ID. + * The function returns, in case of success, the PPS device. Otherwise NULL. */ -int pps_register_source(struct pps_source_info *info, int default_params) +struct pps_device *pps_register_source(struct pps_source_info *info, + int default_params) { struct pps_device *pps; int id; @@ -168,7 +115,6 @@ int pps_register_source(struct pps_source_info *info, int default_params) init_waitqueue_head(&pps->queue); spin_lock_init(&pps->lock); - atomic_set(&pps->usage, 1); /* Get new ID for the new PPS source */ if (idr_pre_get(&pps_idr, GFP_KERNEL) == 0) { @@ -211,7 +157,7 @@ int pps_register_source(struct pps_source_info *info, int default_params) pr_info("new PPS source %s at ID %d\n", info->name, id); - return id; + return pps; free_idr: spin_lock_irq(&pps_idr_lock); @@ -224,38 +170,33 @@ kfree_pps: pps_register_source_exit: printk(KERN_ERR "pps: %s: unable to register source\n", info->name); - return err; + return NULL; } EXPORT_SYMBOL(pps_register_source); /* pps_unregister_source - remove a PPS source from the system - * @source: the PPS source ID + * @pps: the PPS source * * This function is used to remove a previously registered PPS source from * the system. */ -void pps_unregister_source(int source) +void pps_unregister_source(struct pps_device *pps) { - struct pps_device *pps; + unsigned int id = pps->id; - spin_lock_irq(&pps_idr_lock); - pps = idr_find(&pps_idr, source); + pps_unregister_cdev(pps); - if (!pps) { - BUG(); - spin_unlock_irq(&pps_idr_lock); - return; - } + spin_lock_irq(&pps_idr_lock); + idr_remove(&pps_idr, pps->id); spin_unlock_irq(&pps_idr_lock); - pps_unregister_cdev(pps); - pps_put_source(pps); + kfree(pps); } EXPORT_SYMBOL(pps_unregister_source); /* pps_event - register a PPS event into the system - * @source: the PPS source ID + * @pps: the PPS device * @ts: the event timestamp * @event: the event type * @data: userdef pointer @@ -263,30 +204,24 @@ EXPORT_SYMBOL(pps_unregister_source); * This function is used by each PPS client in order to register a new * PPS event into the system (it's usually called inside an IRQ handler). 
* - * If an echo function is associated with the PPS source it will be called + * If an echo function is associated with the PPS device it will be called * as: - * pps->info.echo(source, event, data); + * pps->info.echo(pps, event, data); */ - -void pps_event(int source, struct pps_event_time *ts, int event, void *data) +void pps_event(struct pps_device *pps, struct pps_event_time *ts, int event, + void *data) { - struct pps_device *pps; unsigned long flags; int captured = 0; struct pps_ktime ts_real; if ((event & (PPS_CAPTUREASSERT | PPS_CAPTURECLEAR)) == 0) { - printk(KERN_ERR "pps: unknown event (%x) for source %d\n", - event, source); + dev_err(pps->dev, "unknown event (%x)\n", event); return; } - pps = pps_get_source(source); - if (!pps) - return; - - pr_debug("PPS event on source %d at %ld.%09ld\n", - pps->id, ts->ts_real.tv_sec, ts->ts_real.tv_nsec); + dev_dbg(pps->dev, "PPS event at %ld.%09ld\n", + ts->ts_real.tv_sec, ts->ts_real.tv_nsec); timespec_to_pps_ktime(&ts_real, ts->ts_real); @@ -294,7 +229,7 @@ void pps_event(int source, struct pps_event_time *ts, int event, void *data) /* Must call the echo function? */ if ((pps->params.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR))) - pps->info.echo(source, event, data); + pps->info.echo(pps, event, data); /* Check the event */ pps->current_mode = pps->params.mode; @@ -308,8 +243,8 @@ void pps_event(int source, struct pps_event_time *ts, int event, void *data) /* Save the time stamp */ pps->assert_tu = ts_real; pps->assert_sequence++; - pr_debug("capture assert seq #%u for source %d\n", - pps->assert_sequence, source); + dev_dbg(pps->dev, "capture assert seq #%u\n", + pps->assert_sequence); captured = ~0; } @@ -323,8 +258,8 @@ void pps_event(int source, struct pps_event_time *ts, int event, void *data) /* Save the time stamp */ pps->clear_tu = ts_real; pps->clear_sequence++; - pr_debug("capture clear seq #%u for source %d\n", - pps->clear_sequence, source); + dev_dbg(pps->dev, "capture clear seq #%u\n", + pps->clear_sequence); captured = ~0; } @@ -338,8 +273,5 @@ void pps_event(int source, struct pps_event_time *ts, int event, void *data) } spin_unlock_irqrestore(&pps->lock, flags); - - /* Now we can release the PPS source for (possible) deregistration */ - pps_put_source(pps); } EXPORT_SYMBOL(pps_event); diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c index dc7e66cb2762..1922f27a52ac 100644 --- a/drivers/pps/pps.c +++ b/drivers/pps/pps.c @@ -204,12 +204,6 @@ static int pps_cdev_open(struct inode *inode, struct file *file) { struct pps_device *pps = container_of(inode->i_cdev, struct pps_device, cdev); - int found; - - found = pps_get_source(pps->id) != 0; - if (!found) - return -ENODEV; - file->private_data = pps; return 0; @@ -217,11 +211,6 @@ static int pps_cdev_open(struct inode *inode, struct file *file) static int pps_cdev_release(struct inode *inode, struct file *file) { - struct pps_device *pps = file->private_data; - - /* Free the PPS source and wake up (possible) deregistration */ - pps_put_source(pps); - return 0; } @@ -242,22 +231,23 @@ static const struct file_operations pps_cdev_fops = { int pps_register_cdev(struct pps_device *pps) { int err; + dev_t devt; + + devt = MKDEV(MAJOR(pps_devt), pps->id); - pps->devno = MKDEV(MAJOR(pps_devt), pps->id); cdev_init(&pps->cdev, &pps_cdev_fops); pps->cdev.owner = pps->info.owner; - err = cdev_add(&pps->cdev, pps->devno, 1); + err = cdev_add(&pps->cdev, devt, 1); if (err) { printk(KERN_ERR "pps: %s: failed to add char device %d:%d\n", pps->info.name, MAJOR(pps_devt), pps->id); return 
err; } - pps->dev = device_create(pps_class, pps->info.dev, pps->devno, NULL, + pps->dev = device_create(pps_class, pps->info.dev, devt, pps, "pps%d", pps->id); if (IS_ERR(pps->dev)) goto del_cdev; - dev_set_drvdata(pps->dev, pps); pr_debug("source %s got cdev (%d:%d)\n", pps->info.name, MAJOR(pps_devt), pps->id); @@ -272,7 +262,7 @@ del_cdev: void pps_unregister_cdev(struct pps_device *pps) { - device_destroy(pps_class, pps->devno); + device_destroy(pps_class, pps->dev->devt); cdev_del(&pps->cdev); } diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 32aa6763ca1b..1aedf50088cf 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -31,13 +31,16 @@ * Global defines */ +struct pps_device; + /* The specific PPS source info */ struct pps_source_info { char name[PPS_MAX_NAME_LEN]; /* simbolic name */ char path[PPS_MAX_NAME_LEN]; /* path of connected device */ int mode; /* PPS's allowed mode */ - void (*echo)(int source, int event, void *data); /* PPS echo function */ + void (*echo)(struct pps_device *pps, + int event, void *data); /* PPS echo function */ struct module *owner; struct device *dev; @@ -65,35 +68,27 @@ struct pps_device { unsigned int id; /* PPS source unique ID */ struct cdev cdev; struct device *dev; - int devno; struct fasync_struct *async_queue; /* fasync method */ spinlock_t lock; - - atomic_t usage; /* usage count */ }; /* * Global variables */ -extern spinlock_t pps_idr_lock; -extern struct idr pps_idr; - extern struct device_attribute pps_attrs[]; /* * Exported functions */ -struct pps_device *pps_get_source(int source); -extern void pps_put_source(struct pps_device *pps); -extern int pps_register_source(struct pps_source_info *info, - int default_params); -extern void pps_unregister_source(int source); +extern struct pps_device *pps_register_source( + struct pps_source_info *info, int default_params); +extern void pps_unregister_source(struct pps_device *pps); extern int pps_register_cdev(struct pps_device *pps); extern void pps_unregister_cdev(struct pps_device *pps); -extern void pps_event(int source, struct pps_event_time *ts, int event, - void *data); +extern void pps_event(struct pps_device *pps, + struct pps_event_time *ts, int event, void *data); static inline void timespec_to_pps_ktime(struct pps_ktime *kt, struct timespec ts) -- cgit v1.2.3-71-gd317 From 12f9b1f9c11700893a7b453705d95b260d78f268 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:55 -0800 Subject: pps: timestamp is always passed to dcd_change() Remove the code that gatheres timestamp in pps_tty_dcd_change() in case passed ts parameter is NULL because it never happens in the current code. Fix comments as well. Signed-off-by: Alexander Gordeev Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/serial/tty.txt | 2 +- drivers/pps/clients/pps-ldisc.c | 8 -------- include/linux/tty_ldisc.h | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt index 7c900507279f..540db41dfd5d 100644 --- a/Documentation/serial/tty.txt +++ b/Documentation/serial/tty.txt @@ -107,7 +107,7 @@ write_wakeup() - May be called at any point between open and close. dcd_change() - Report to the tty line the current DCD pin status changes and the relative timestamp. The timestamp - can be NULL. + cannot be NULL. 
Driver Access diff --git a/drivers/pps/clients/pps-ldisc.c b/drivers/pps/clients/pps-ldisc.c index 1b3c6eda12ba..79451f2dea6a 100644 --- a/drivers/pps/clients/pps-ldisc.c +++ b/drivers/pps/clients/pps-ldisc.c @@ -32,14 +32,6 @@ static void pps_tty_dcd_change(struct tty_struct *tty, unsigned int status, struct pps_event_time *ts) { struct pps_device *pps = (struct pps_device *)tty->disc_data; - struct pps_event_time __ts; - - /* First of all we get the time stamp... */ - pps_get_ts(&__ts); - - /* Does caller give us a timestamp? */ - if (!ts) /* No. Do it ourself! */ - ts = &__ts; BUG_ON(pps == NULL); diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 763e061e19f1..ff7dc08696a8 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -104,7 +104,7 @@ * struct pps_event_time *ts) * * Tells the discipline that the DCD pin has changed its status and - * the relative timestamp. Pointer ts can be NULL. + * the relative timestamp. Pointer ts cannot be NULL. */ #include -- cgit v1.2.3-71-gd317 From 025b40abe715d638e60516a657d354e8560c1a85 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:56 -0800 Subject: ntp: add hardpps implementation This commit adds hardpps() implementation based upon the original one from the NTPv4 reference kernel code from David Mills. However, it is highly optimized towards very fast syncronization and maximum stickness to PPS signal. The typical error is less then a microsecond. To make it sync faster I had to throw away exponential phase filter so that the full phase offset is corrected immediately. Then I also had to throw away median phase filter because it gives a bigger error itself if used without exponential filter. Maybe we will find an appropriate filtering scheme in the future but it's not necessary if the signal quality is ok. Signed-off-by: Alexander Gordeev Acked-by: John Stultz Cc: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/Kconfig | 9 ++ include/linux/timex.h | 1 + kernel/time/ntp.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 420 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/pps/Kconfig b/drivers/pps/Kconfig index 1afe4e03440f..0ad5ff38bfec 100644 --- a/drivers/pps/Kconfig +++ b/drivers/pps/Kconfig @@ -30,6 +30,15 @@ config PPS_DEBUG messages to the system log. Select this if you are having a problem with PPS support and want to see more of what is going on. +config NTP_PPS + bool "PPS kernel consumer support" + depends on PPS && !NO_HZ + help + This option adds support for direct in-kernel time + syncronization using an external PPS signal. + + It doesn't work on tickless systems at the moment. 
+ source drivers/pps/clients/Kconfig endmenu diff --git a/include/linux/timex.h b/include/linux/timex.h index 32d852f8cbe4..d23999f9499d 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -268,6 +268,7 @@ extern u64 tick_length; extern void second_overflow(void); extern void update_ntp_one_tick(void); extern int do_adjtimex(struct timex *); +extern void hardpps(const struct timespec *, const struct timespec *); int read_current_timer(unsigned long *timer_val); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538f..5c00242fa921 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * NTP timekeeping variables: @@ -74,6 +75,162 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. 
+ * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) + && !(time_status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((time_status & STA_PPSFREQ) + && (time_status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + /* * NTP methods: */ @@ -185,6 +342,9 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + + /* Clear PPS state variables */ + pps_clear(); } /* @@ -250,16 +410,16 @@ void second_overflow(void) time_status |= STA_UNSYNC; } - /* - * Compute the phase adjustment for the next second. The offset is - * reduced by a fixed factor times the time constant. 
- */ + /* Compute the phase adjustment for the next second */ tick_length = tick_length_base; - delta = shift_right(time_offset, SHIFT_PLL + time_constant); + delta = ntp_offset_chunk(time_offset); time_offset -= delta; tick_length += delta; + /* Check PPS signal */ + pps_dec_valid(); + if (!time_adjust) return; @@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); } /* @@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); } if (txc->modes & ADJ_MAXERROR) @@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc) } result = time_state; /* mostly `TIME_OK' */ - if (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* check for errors */ + if (is_error_status(time_status)) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * @@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc) txc->tick = tick_usec; txc->tai = time_tai; - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; + /* fill PPS status fields */ + pps_fill_timex(txc); write_sequnlock_irq(&xtime_lock); @@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc) return result; } +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + __kernel_time_t sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. 
+ */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + pr_err("hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. 
It was mostly rewritten but keeps the same idea. + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + unsigned long flags; + + pts_norm = pps_normalize_ts(*phase_ts); + + write_seqlock_irqsave(&xtime_lock, flags); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + pr_err("hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} +EXPORT_SYMBOL(hardpps); + +#endif /* CONFIG_NTP_PPS */ + static int __init ntp_tick_adj_setup(char *str) { ntp_tick_adj = simple_strtol(str, NULL, 0); -- cgit v1.2.3-71-gd317 From e2c18e49a0d4f822ffc29fb4958943beb1ff08b7 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:57 -0800 Subject: pps: capture MONOTONIC_RAW timestamps as well MONOTONIC_RAW clock timestamps are ideally suited for frequency calculation and also fit well into the original NTP hardpps design. Now phase and frequency can be adjusted separately: the former based on REALTIME clock and the latter based on MONOTONIC_RAW clock. A new function getnstime_raw_and_real is added to timekeeping subsystem to capture both timestamps at the same time and atomically. 
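[Editor's note: a minimal, hedged sketch of how a PPS client might use the timestamp pair described above. It is not part of the patch; the interrupt wiring, the handler name, and the trailing pps_event() data argument are assumptions based on the surrounding PPS API, not shown in these hunks.]

    /* Hypothetical PPS client interrupt handler: capture the timestamp
     * pair as early as possible, then hand it to the PPS core. */
    #include <linux/interrupt.h>
    #include <linux/pps_kernel.h>

    static irqreturn_t example_pps_irq(int irq, void *dev_id)
    {
            struct pps_device *pps = dev_id;   /* registered elsewhere (assumed) */
            struct pps_event_time ts;

            /* With CONFIG_NTP_PPS this fills both ts_raw and ts_real
             * atomically via getnstime_raw_and_real(); otherwise only
             * ts_real is set. */
            pps_get_ts(&ts);

            /* ts_real feeds phase correction and ts_raw feeds frequency
             * calibration once the event reaches hardpps(). */
            pps_event(pps, &ts, PPS_CAPTUREASSERT, NULL);

            return IRQ_HANDLED;
    }
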
Signed-off-by: Alexander Gordeev Acked-by: John Stultz Cc: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pps_kernel.h | 14 ++++++++++++++ include/linux/time.h | 2 ++ kernel/time/timekeeping.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 1aedf50088cf..94048547f29a 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -47,6 +47,9 @@ struct pps_source_info { }; struct pps_event_time { +#ifdef CONFIG_NTP_PPS + struct timespec ts_raw; +#endif /* CONFIG_NTP_PPS */ struct timespec ts_real; }; @@ -97,10 +100,21 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } +#ifdef CONFIG_NTP_PPS + +static inline void pps_get_ts(struct pps_event_time *ts) +{ + getnstime_raw_and_real(&ts->ts_raw, &ts->ts_real); +} + +#else /* CONFIG_NTP_PPS */ + static inline void pps_get_ts(struct pps_event_time *ts) { getnstimeofday(&ts->ts_real); } +#endif /* CONFIG_NTP_PPS */ + #endif /* LINUX_PPS_KERNEL_H */ diff --git a/include/linux/time.h b/include/linux/time.h index 9f15ac7ab92a..1e6d3b59238d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -158,6 +158,8 @@ extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); extern void getrawmonotonic(struct timespec *ts); +extern void getnstime_raw_and_real(struct timespec *ts_raw, + struct timespec *ts_real); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da82003..5536aaf3ba36 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * @ts_raw: pointer to the timespec to be set to raw monotonic time + * @ts_real: pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. + */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ + unsigned long seq; + s64 nsecs_raw, nsecs_real; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + u32 arch_offset; + + seq = read_seqbegin(&xtime_lock); + + *ts_raw = raw_time; + *ts_real = xtime; + + nsecs_raw = timekeeping_get_ns_raw(); + nsecs_real = timekeeping_get_ns(); + + /* If arch requires, add in gettimeoffset() */ + arch_offset = arch_gettimeoffset(); + nsecs_raw += arch_offset; + nsecs_real += arch_offset; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts_raw, nsecs_raw); + timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v1.2.3-71-gd317 From 717c033669ed3ceaee8df57d4562fafcc1a6267a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:58 -0800 Subject: pps: add kernel consumer support Add an optional feature of PPSAPI, kernel consumer support, which uses the added hardpps() function. 
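[Editor's note: a hedged userspace sketch, not part of the patch, showing how the PPS_KC_BIND ioctl added below could be used to bind a source to the hardpps() kernel consumer. The device path /dev/pps0 is an example only, and the caller needs CAP_SYS_TIME.]

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/pps.h>

    int main(void)
    {
            struct pps_bind_args bind_args = {
                    .tsformat = PPS_TSFMT_TSPEC,   /* the only accepted format */
                    .edge     = PPS_CAPTUREASSERT, /* must be within the source's mode */
                    .consumer = PPS_KC_HARDPPS,    /* the only kernel consumer */
            };
            int fd = open("/dev/pps0", O_RDWR);    /* example device node */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, PPS_KC_BIND, &bind_args) < 0) {
                    perror("PPS_KC_BIND");
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }

Per the pps_kc_bind() logic in the new kc.c, issuing the same ioctl with edge set to 0 unbinds the consumer again.
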
Signed-off-by: Alexander Gordeev Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ioctl/ioctl-number.txt | 2 +- drivers/pps/Makefile | 1 + drivers/pps/kapi.c | 6 ++ drivers/pps/kc.c | 122 +++++++++++++++++++++++++++++++++++ drivers/pps/kc.h | 46 +++++++++++++ drivers/pps/pps.c | 38 ++++++++++- include/linux/pps.h | 7 ++ 7 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 drivers/pps/kc.c create mode 100644 drivers/pps/kc.h (limited to 'include/linux') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index d6a63c7b4478..ac293e955308 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -247,7 +247,7 @@ Code Seq#(hex) Include File Comments 'p' 40-7F linux/nvram.h 'p' 80-9F linux/ppdev.h user-space parport -'p' A1-A4 linux/pps.h LinuxPPS +'p' A1-A5 linux/pps.h LinuxPPS 'q' 00-1F linux/serio.h 'q' 80-FF linux/telephony.h Internet PhoneJACK, Internet LineJACK diff --git a/drivers/pps/Makefile b/drivers/pps/Makefile index 98960ddd3188..77c23457b739 100644 --- a/drivers/pps/Makefile +++ b/drivers/pps/Makefile @@ -3,6 +3,7 @@ # pps_core-y := pps.o kapi.o sysfs.o +pps_core-$(CONFIG_NTP_PPS) += kc.o obj-$(CONFIG_PPS) := pps_core.o obj-y += clients/ diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c index ebf33746c37c..cba1b43f7519 100644 --- a/drivers/pps/kapi.c +++ b/drivers/pps/kapi.c @@ -26,11 +26,14 @@ #include #include #include +#include #include #include #include #include +#include "kc.h" + /* * Local functions */ @@ -139,6 +142,7 @@ EXPORT_SYMBOL(pps_register_source); void pps_unregister_source(struct pps_device *pps) { + pps_kc_remove(pps); pps_unregister_cdev(pps); /* don't have to kfree(pps) here because it will be done on @@ -211,6 +215,8 @@ void pps_event(struct pps_device *pps, struct pps_event_time *ts, int event, captured = ~0; } + pps_kc_event(pps, ts, event); + /* Wake up if captured something */ if (captured) { pps->last_ev++; diff --git a/drivers/pps/kc.c b/drivers/pps/kc.c new file mode 100644 index 000000000000..079e930b1938 --- /dev/null +++ b/drivers/pps/kc.c @@ -0,0 +1,122 @@ +/* + * PPS kernel consumer API + * + * Copyright (C) 2009-2010 Alexander Gordeev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "kc.h" + +/* + * Global variables + */ + +/* state variables to bind kernel consumer */ +DEFINE_SPINLOCK(pps_kc_hardpps_lock); +/* PPS API (RFC 2783): current source and mode for kernel consumer */ +struct pps_device *pps_kc_hardpps_dev; /* unique pointer to device */ +int pps_kc_hardpps_mode; /* mode bits for kernel consumer */ + +/* pps_kc_bind - control PPS kernel consumer binding + * @pps: the PPS source + * @bind_args: kernel consumer bind parameters + * + * This function is used to bind or unbind PPS kernel consumer according to + * supplied parameters. Should not be called in interrupt context. + */ +int pps_kc_bind(struct pps_device *pps, struct pps_bind_args *bind_args) +{ + /* Check if another consumer is already bound */ + spin_lock_irq(&pps_kc_hardpps_lock); + + if (bind_args->edge == 0) + if (pps_kc_hardpps_dev == pps) { + pps_kc_hardpps_mode = 0; + pps_kc_hardpps_dev = NULL; + spin_unlock_irq(&pps_kc_hardpps_lock); + dev_info(pps->dev, "unbound kernel" + " consumer\n"); + } else { + spin_unlock_irq(&pps_kc_hardpps_lock); + dev_err(pps->dev, "selected kernel consumer" + " is not bound\n"); + return -EINVAL; + } + else + if (pps_kc_hardpps_dev == NULL || + pps_kc_hardpps_dev == pps) { + pps_kc_hardpps_mode = bind_args->edge; + pps_kc_hardpps_dev = pps; + spin_unlock_irq(&pps_kc_hardpps_lock); + dev_info(pps->dev, "bound kernel consumer: " + "edge=0x%x\n", bind_args->edge); + } else { + spin_unlock_irq(&pps_kc_hardpps_lock); + dev_err(pps->dev, "another kernel consumer" + " is already bound\n"); + return -EINVAL; + } + + return 0; +} + +/* pps_kc_remove - unbind kernel consumer on PPS source removal + * @pps: the PPS source + * + * This function is used to disable kernel consumer on PPS source removal + * if this source was bound to PPS kernel consumer. Can be called on any + * source safely. Should not be called in interrupt context. + */ +void pps_kc_remove(struct pps_device *pps) +{ + spin_lock_irq(&pps_kc_hardpps_lock); + if (pps == pps_kc_hardpps_dev) { + pps_kc_hardpps_mode = 0; + pps_kc_hardpps_dev = NULL; + spin_unlock_irq(&pps_kc_hardpps_lock); + dev_info(pps->dev, "unbound kernel consumer" + " on device removal\n"); + } else + spin_unlock_irq(&pps_kc_hardpps_lock); +} + +/* pps_kc_event - call hardpps() on PPS event + * @pps: the PPS source + * @ts: PPS event timestamp + * @event: PPS event edge + * + * This function calls hardpps() when an event from bound PPS source occurs. + */ +void pps_kc_event(struct pps_device *pps, struct pps_event_time *ts, + int event) +{ + unsigned long flags; + + /* Pass some events to kernel consumer if activated */ + spin_lock_irqsave(&pps_kc_hardpps_lock, flags); + if (pps == pps_kc_hardpps_dev && event & pps_kc_hardpps_mode) + hardpps(&ts->ts_real, &ts->ts_raw); + spin_unlock_irqrestore(&pps_kc_hardpps_lock, flags); +} diff --git a/drivers/pps/kc.h b/drivers/pps/kc.h new file mode 100644 index 000000000000..d296fcd0a175 --- /dev/null +++ b/drivers/pps/kc.h @@ -0,0 +1,46 @@ +/* + * PPS kernel consumer API header + * + * Copyright (C) 2009-2010 Alexander Gordeev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef LINUX_PPS_KC_H +#define LINUX_PPS_KC_H + +#include +#include + +#ifdef CONFIG_NTP_PPS + +extern int pps_kc_bind(struct pps_device *pps, + struct pps_bind_args *bind_args); +extern void pps_kc_remove(struct pps_device *pps); +extern void pps_kc_event(struct pps_device *pps, + struct pps_event_time *ts, int event); + + +#else /* CONFIG_NTP_PPS */ + +static inline int pps_kc_bind(struct pps_device *pps, + struct pps_bind_args *bind_args) { return -EOPNOTSUPP; } +static inline void pps_kc_remove(struct pps_device *pps) {} +static inline void pps_kc_event(struct pps_device *pps, + struct pps_event_time *ts, int event) {} + +#endif /* CONFIG_NTP_PPS */ + +#endif /* LINUX_PPS_KC_H */ diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c index 9e15cf1da946..2baadd21b7a6 100644 --- a/drivers/pps/pps.c +++ b/drivers/pps/pps.c @@ -33,6 +33,8 @@ #include #include +#include "kc.h" + /* * Local variables */ @@ -198,9 +200,43 @@ static long pps_cdev_ioctl(struct file *file, break; } + case PPS_KC_BIND: { + struct pps_bind_args bind_args; + + dev_dbg(pps->dev, "PPS_KC_BIND\n"); + + /* Check the capabilities */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (copy_from_user(&bind_args, uarg, + sizeof(struct pps_bind_args))) + return -EFAULT; + + /* Check for supported capabilities */ + if ((bind_args.edge & ~pps->info.mode) != 0) { + dev_err(pps->dev, "unsupported capabilities (%x)\n", + bind_args.edge); + return -EINVAL; + } + + /* Validate parameters roughly */ + if (bind_args.tsformat != PPS_TSFMT_TSPEC || + (bind_args.edge & ~PPS_CAPTUREBOTH) != 0 || + bind_args.consumer != PPS_KC_HARDPPS) { + dev_err(pps->dev, "invalid kernel consumer bind" + " parameters (%x)\n", bind_args.edge); + return -EINVAL; + } + + err = pps_kc_bind(pps, &bind_args); + if (err < 0) + return err; + + break; + } default: return -ENOTTY; - break; } return 0; diff --git a/include/linux/pps.h b/include/linux/pps.h index 0194ab06177b..a9bb1d93451a 100644 --- a/include/linux/pps.h +++ b/include/linux/pps.h @@ -114,11 +114,18 @@ struct pps_fdata { struct pps_ktime timeout; }; +struct pps_bind_args { + int tsformat; /* format of time stamps */ + int edge; /* selected event type */ + int consumer; /* selected kernel consumer */ +}; + #include #define PPS_GETPARAMS _IOR('p', 0xa1, struct pps_kparams *) #define PPS_SETPARAMS _IOW('p', 0xa2, struct pps_kparams *) #define PPS_GETCAP _IOR('p', 0xa3, int *) #define PPS_FETCH _IOWR('p', 0xa4, struct pps_fdata *) +#define PPS_KC_BIND _IOW('p', 0xa5, struct pps_bind_args *) #endif /* _PPS_H_ */ -- cgit v1.2.3-71-gd317 From 8930c8aa740b12ad69f44a35137bcc39bfa3dc41 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 12 Jan 2011 17:01:03 -0800 Subject: memstick: add support for JMicron JMB 385 and 390 controllers Signed-off-by: Aries Lee Signed-off-by: Takashi Iwai Cc: Alex Dubov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/memstick/host/jmb38x_ms.c | 17 +++++++++++++++-- include/linux/pci_ids.h | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git 
a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 4ce8773fa3b8..d89d925caecf 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -765,6 +765,8 @@ static int jmb38x_ms_set_param(struct memstick_host *msh, #define PMOS0_SW_LED_POLARITY_ENABLE 0x80 #define PMOS0_ACTIVE_BITS (PMOS0_ENABLE | PMOS0_EN_OVERCURRENT_DEBOUNCE | \ PMOS0_OVERCURRENT_LEVEL_2_4V) +#define PCI_PMOS1_CONTROL 0xbd +#define PMOS1_ACTIVE_BITS 0x4a #define PCI_CLOCK_CTL 0xb9 static int jmb38x_ms_pmos(struct pci_dev *pdev, int flag) @@ -779,6 +781,16 @@ static int jmb38x_ms_pmos(struct pci_dev *pdev, int flag) pci_write_config_byte(pdev, PCI_PMOS0_CONTROL, val); dev_dbg(&pdev->dev, "JMB38x: set PMOS0 val 0x%x\n", val); + if (pci_resource_flags(pdev, 1)) { + pci_read_config_byte(pdev, PCI_PMOS1_CONTROL, &val); + if (flag) + val |= PMOS1_ACTIVE_BITS; + else + val &= ~PMOS1_ACTIVE_BITS; + pci_write_config_byte(pdev, PCI_PMOS1_CONTROL, val); + dev_dbg(&pdev->dev, "JMB38x: set PMOS1 val 0x%x\n", val); + } + pci_read_config_byte(pdev, PCI_CLOCK_CTL, &val); pci_write_config_byte(pdev, PCI_CLOCK_CTL, val & ~0x0f); pci_write_config_byte(pdev, PCI_CLOCK_CTL, val | 0x01); @@ -1018,8 +1030,9 @@ static void jmb38x_ms_remove(struct pci_dev *dev) } static struct pci_device_id jmb38x_ms_id_tbl [] = { - { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB38X_MS, PCI_ANY_ID, - PCI_ANY_ID, 0, 0, 0 }, + { PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB38X_MS) }, + { PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB385_MS) }, + { PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB390_MS) }, { } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ab47732d81e0..ae0dc453e3e2 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2369,8 +2369,10 @@ #define PCI_DEVICE_ID_JMICRON_JMB38X_SD 0x2381 #define PCI_DEVICE_ID_JMICRON_JMB38X_MMC 0x2382 #define PCI_DEVICE_ID_JMICRON_JMB38X_MS 0x2383 +#define PCI_DEVICE_ID_JMICRON_JMB385_MS 0x2388 #define PCI_DEVICE_ID_JMICRON_JMB388_SD 0x2391 #define PCI_DEVICE_ID_JMICRON_JMB388_ESD 0x2392 +#define PCI_DEVICE_ID_JMICRON_JMB390_MS 0x2393 #define PCI_VENDOR_ID_KORENIX 0x1982 #define PCI_DEVICE_ID_KORENIX_JETCARDF0 0x1600 -- cgit v1.2.3-71-gd317 From beeae05138baf615a0284fc4a9022c33271ee37c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 12 Jan 2011 17:01:09 -0800 Subject: cramfs: hide function prototypes behind __KERNEL__ macro Currently, 3 kernel function prototypes are present in a header file exported to userland. This patch fixes it. Signed-off-by: Alexander Shishkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cramfs_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cramfs_fs.h b/include/linux/cramfs_fs.h index 6fc2bed368b8..0e7bf272ec2f 100644 --- a/include/linux/cramfs_fs.h +++ b/include/linux/cramfs_fs.h @@ -84,9 +84,11 @@ struct cramfs_super { | CRAMFS_FLAG_WRONG_SIGNATURE \ | CRAMFS_FLAG_SHIFTED_ROOT_OFFSET ) +#ifdef __KERNEL__ /* Uncompression interfaces to the underlying zlib */ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen); int cramfs_uncompress_init(void); void cramfs_uncompress_exit(void); +#endif /* __KERNEL__ */ #endif -- cgit v1.2.3-71-gd317 From 8d0a1decb4a07bcb8ef5c4d7f693eafdebc2c41b Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 12 Jan 2011 17:01:12 -0800 Subject: romfs: have romfs_fs.h pull in necessary headers This header uses things like __be32, so pull in linux/types.h. 
Further, it uses BLOCK_SIZE, so pull in linux/fs.h. Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/romfs_fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/romfs_fs.h b/include/linux/romfs_fs.h index c490fbc43fe2..5f57f93b284f 100644 --- a/include/linux/romfs_fs.h +++ b/include/linux/romfs_fs.h @@ -1,6 +1,9 @@ #ifndef __LINUX_ROMFS_FS_H #define __LINUX_ROMFS_FS_H +#include +#include + /* The basic structures of the romfs filesystem */ #define ROMBSIZE BLOCK_SIZE -- cgit v1.2.3-71-gd317 From 93685ad247ef65b7d6f90ffe97b44f5cfeaf40d3 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 12 Jan 2011 17:01:14 -0800 Subject: Decompressors: get rid of set_error_fn() macro set_error_fn() has become a useless complication after c1e7c3ae59 ("bzip2/lzma/gzip: pre-boot malloc doesn't return NULL on failure") fixed the use of error() in malloc(). Only decompress_unlzma.c had some use for it and that was easy to change too. This also gets rid of the static function pointer "error", which should have been marked as __initdata. Signed-off-by: Lasse Collin Cc: "H. Peter Anvin" Cc: Alain Knaff Cc: Albin Tonnerre Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/mm.h | 5 ----- lib/decompress_bunzip2.c | 7 +++---- lib/decompress_inflate.c | 3 +-- lib/decompress_unlzma.c | 11 ++++++----- lib/decompress_unlzo.c | 4 +--- 5 files changed, 11 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h index ad5ec1d0475e..a31e9820710a 100644 --- a/include/linux/decompress/mm.h +++ b/include/linux/decompress/mm.h @@ -61,8 +61,6 @@ static void free(void *where) #define large_malloc(a) malloc(a) #define large_free(a) free(a) -#define set_error_fn(x) - #define INIT #else /* STATIC */ @@ -84,9 +82,6 @@ static void free(void *where) #define large_malloc(a) vmalloc(a) #define large_free(a) vfree(a) -static void(*error)(char *m); -#define set_error_fn(x) error = x; - #define INIT __init #define STATIC diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 81c8bb1cc6aa..e805087f5561 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -682,13 +682,12 @@ STATIC int INIT bunzip2(unsigned char *buf, int len, int(*flush)(void*, unsigned int), unsigned char *outbuf, int *pos, - void(*error_fn)(char *x)) + void(*error)(char *x)) { struct bunzip_data *bd; int i = -1; unsigned char *inbuf; - set_error_fn(error_fn); if (flush) outbuf = malloc(BZIP2_IOBUF_SIZE); @@ -751,8 +750,8 @@ STATIC int INIT decompress(unsigned char *buf, int len, int(*flush)(void*, unsigned int), unsigned char *outbuf, int *pos, - void(*error_fn)(char *x)) + void(*error)(char *x)) { - return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error_fn); + return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error); } #endif diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index 1eea07e8f21b..9a7f5dfc0ed7 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -38,13 +38,12 @@ STATIC int INIT gunzip(unsigned char *buf, int len, int(*flush)(void*, unsigned int), unsigned char *out_buf, int *pos, - void(*error_fn)(char *x)) { + void(*error)(char *x)) { u8 *zbuf; struct z_stream_s *strm; int rc; size_t out_len; - set_error_fn(error_fn); rc = -1; if (flush) { out_len = 0x8000; /* 32 K */ diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index 
951f7277e98a..2787dc560e4b 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -74,6 +74,7 @@ struct rc { uint32_t code; uint32_t range; uint32_t bound; + void (*error)(char *); }; @@ -92,7 +93,7 @@ static void INIT rc_read(struct rc *rc) { rc->buffer_size = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE); if (rc->buffer_size <= 0) - error("unexpected EOF"); + rc->error("unexpected EOF"); rc->ptr = rc->buffer; rc->buffer_end = rc->buffer + rc->buffer_size; } @@ -536,7 +537,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, int in_len, int(*flush)(void*, unsigned int), unsigned char *output, int *posp, - void(*error_fn)(char *x) + void(*error)(char *x) ) { struct lzma_header header; @@ -552,7 +553,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, int in_len, unsigned char *inbuf; int ret = -1; - set_error_fn(error_fn); + rc.error = error; if (buf) inbuf = buf; @@ -659,9 +660,9 @@ STATIC int INIT decompress(unsigned char *buf, int in_len, int(*flush)(void*, unsigned int), unsigned char *output, int *posp, - void(*error_fn)(char *x) + void(*error)(char *x) ) { - return unlzma(buf, in_len - 4, fill, flush, output, posp, error_fn); + return unlzma(buf, in_len - 4, fill, flush, output, posp, error); } #endif diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index bcb3a4bd68ff..df3e98f945a6 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c @@ -91,7 +91,7 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, int (*fill) (void *, unsigned int), int (*flush) (void *, unsigned int), u8 *output, int *posp, - void (*error_fn) (char *x)) + void (*error) (char *x)) { u8 skip = 0, r = 0; u32 src_len, dst_len; @@ -99,8 +99,6 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, u8 *in_buf, *in_buf_save, *out_buf; int ret = -1; - set_error_fn(error_fn); - if (output) { out_buf = output; } else if (!flush) { -- cgit v1.2.3-71-gd317 From 2b6b5caa6d05579bd1501006e34feec1b2aef8c4 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 12 Jan 2011 17:01:15 -0800 Subject: Decompressors: include in Currently users of mm.h need to include to use the macros malloc() and free() provided by mm.h. This fixes it. Signed-off-by: Lasse Collin Cc: "H. 
Peter Anvin" Cc: Alain Knaff Cc: Albin Tonnerre Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/mm.h | 1 + lib/decompress_bunzip2.c | 1 - lib/decompress_inflate.c | 1 - lib/decompress_unlzma.c | 1 - lib/decompress_unlzo.c | 1 - 5 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h index a31e9820710a..4cb72b920c74 100644 --- a/include/linux/decompress/mm.h +++ b/include/linux/decompress/mm.h @@ -70,6 +70,7 @@ static void free(void *where) #include #include #include +#include #include /* Use defines rather than static inline in order to avoid spurious diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index e805087f5561..a7b80c1d6a0d 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -49,7 +49,6 @@ #define PREBOOT #else #include -#include #endif /* STATIC */ #include diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index 9a7f5dfc0ed7..b5fe1d1d5f05 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -19,7 +19,6 @@ #include "zlib_inflate/inflate.h" #include "zlib_inflate/infutil.h" -#include #endif /* STATIC */ diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index 2787dc560e4b..946b83be3bcc 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -33,7 +33,6 @@ #define PREBOOT #else #include -#include #endif /* STATIC */ #include diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index df3e98f945a6..6e56d547ba32 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c @@ -33,7 +33,6 @@ #ifdef STATIC #include "lzo/lzo1x_decompress.c" #else -#include #include #endif -- cgit v1.2.3-71-gd317 From 24fa0402a9b6a537e87e38341e78b7da86486846 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 12 Jan 2011 17:01:22 -0800 Subject: decompressors: add XZ decompressor module In userspace, the .lzma format has become mostly a legacy file format that got superseded by the .xz format. Similarly, LZMA Utils was superseded by XZ Utils. These patches add support for XZ decompression into the kernel. Most of the code is as is from XZ Embedded . It was written for the Linux kernel but is usable in other projects too. Advantages of XZ over the current LZMA code in the kernel: - Nice API that can be used by other kernel modules; it's not limited to kernel, initramfs, and initrd decompression. - Integrity check support (CRC32) - BCJ filters improve compression of executable code on certain architectures. These together with LZMA2 can produce a few percent smaller kernel or Squashfs images than plain LZMA without making the decompression slower. This patch: Add the main decompression code (xz_dec), testing module (xz_dec_test), wrapper script (xz_wrap.sh) for the xz command line tool, and documentation. The xz_dec module is enough to have a usable XZ decompressor e.g. for Squashfs. Signed-off-by: Lasse Collin Cc: "H. 
Peter Anvin" Cc: Alain Knaff Cc: Albin Tonnerre Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/xz.txt | 121 +++++ include/linux/xz.h | 264 +++++++++++ lib/Kconfig | 2 + lib/Makefile | 1 + lib/xz/Kconfig | 59 +++ lib/xz/Makefile | 5 + lib/xz/xz_crc32.c | 59 +++ lib/xz/xz_dec_bcj.c | 561 +++++++++++++++++++++++ lib/xz/xz_dec_lzma2.c | 1171 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/xz/xz_dec_stream.c | 821 +++++++++++++++++++++++++++++++++ lib/xz/xz_dec_syms.c | 26 ++ lib/xz/xz_dec_test.c | 220 +++++++++ lib/xz/xz_lzma2.h | 204 +++++++++ lib/xz/xz_private.h | 156 +++++++ lib/xz/xz_stream.h | 62 +++ scripts/Makefile.lib | 28 ++ scripts/xz_wrap.sh | 23 + 17 files changed, 3783 insertions(+) create mode 100644 Documentation/xz.txt create mode 100644 include/linux/xz.h create mode 100644 lib/xz/Kconfig create mode 100644 lib/xz/Makefile create mode 100644 lib/xz/xz_crc32.c create mode 100644 lib/xz/xz_dec_bcj.c create mode 100644 lib/xz/xz_dec_lzma2.c create mode 100644 lib/xz/xz_dec_stream.c create mode 100644 lib/xz/xz_dec_syms.c create mode 100644 lib/xz/xz_dec_test.c create mode 100644 lib/xz/xz_lzma2.h create mode 100644 lib/xz/xz_private.h create mode 100644 lib/xz/xz_stream.h create mode 100644 scripts/xz_wrap.sh (limited to 'include/linux') diff --git a/Documentation/xz.txt b/Documentation/xz.txt new file mode 100644 index 000000000000..2cf3e2608de3 --- /dev/null +++ b/Documentation/xz.txt @@ -0,0 +1,121 @@ + +XZ data compression in Linux +============================ + +Introduction + + XZ is a general purpose data compression format with high compression + ratio and relatively fast decompression. The primary compression + algorithm (filter) is LZMA2. Additional filters can be used to improve + compression ratio even further. E.g. Branch/Call/Jump (BCJ) filters + improve compression ratio of executable data. + + The XZ decompressor in Linux is called XZ Embedded. It supports + the LZMA2 filter and optionally also BCJ filters. CRC32 is supported + for integrity checking. The home page of XZ Embedded is at + , where you can find the + latest version and also information about using the code outside + the Linux kernel. + + For userspace, XZ Utils provide a zlib-like compression library + and a gzip-like command line tool. XZ Utils can be downloaded from + . + +XZ related components in the kernel + + The xz_dec module provides XZ decompressor with single-call (buffer + to buffer) and multi-call (stateful) APIs. The usage of the xz_dec + module is documented in include/linux/xz.h. + + The xz_dec_test module is for testing xz_dec. xz_dec_test is not + useful unless you are hacking the XZ decompressor. xz_dec_test + allocates a char device major dynamically to which one can write + .xz files from userspace. The decompressed output is thrown away. + Keep an eye on dmesg to see diagnostics printed by xz_dec_test. + See the xz_dec_test source code for the details. + + For decompressing the kernel image, initramfs, and initrd, there + is a wrapper function in lib/decompress_unxz.c. Its API is the + same as in other decompress_*.c files, which is defined in + include/linux/decompress/generic.h. + + scripts/xz_wrap.sh is a wrapper for the xz command line tool found + from XZ Utils. The wrapper sets compression options to values suitable + for compressing the kernel image. + + For kernel makefiles, two commands are provided for use with + $(call if_needed). 
The kernel image should be compressed with + $(call if_needed,xzkern) which will use a BCJ filter and a big LZMA2 + dictionary. It will also append a four-byte trailer containing the + uncompressed size of the file, which is needed by the boot code. + Other things should be compressed with $(call if_needed,xzmisc) + which will use no BCJ filter and 1 MiB LZMA2 dictionary. + +Notes on compression options + + Since the XZ Embedded supports only streams with no integrity check or + CRC32, make sure that you don't use some other integrity check type + when encoding files that are supposed to be decoded by the kernel. With + liblzma, you need to use either LZMA_CHECK_NONE or LZMA_CHECK_CRC32 + when encoding. With the xz command line tool, use --check=none or + --check=crc32. + + Using CRC32 is strongly recommended unless there is some other layer + which will verify the integrity of the uncompressed data anyway. + Double checking the integrity would probably be waste of CPU cycles. + Note that the headers will always have a CRC32 which will be validated + by the decoder; you can only change the integrity check type (or + disable it) for the actual uncompressed data. + + In userspace, LZMA2 is typically used with dictionary sizes of several + megabytes. The decoder needs to have the dictionary in RAM, thus big + dictionaries cannot be used for files that are intended to be decoded + by the kernel. 1 MiB is probably the maximum reasonable dictionary + size for in-kernel use (maybe more is OK for initramfs). The presets + in XZ Utils may not be optimal when creating files for the kernel, + so don't hesitate to use custom settings. Example: + + xz --check=crc32 --lzma2=dict=512KiB inputfile + + An exception to above dictionary size limitation is when the decoder + is used in single-call mode. Decompressing the kernel itself is an + example of this situation. In single-call mode, the memory usage + doesn't depend on the dictionary size, and it is perfectly fine to + use a big dictionary: for maximum compression, the dictionary should + be at least as big as the uncompressed data itself. + +Future plans + + Creating a limited XZ encoder may be considered if people think it is + useful. LZMA2 is slower to compress than e.g. Deflate or LZO even at + the fastest settings, so it isn't clear if LZMA2 encoder is wanted + into the kernel. + + Support for limited random-access reading is planned for the + decompression code. I don't know if it could have any use in the + kernel, but I know that it would be useful in some embedded projects + outside the Linux kernel. + +Conformance to the .xz file format specification + + There are a couple of corner cases where things have been simplified + at expense of detecting errors as early as possible. These should not + matter in practice all, since they don't cause security issues. But + it is good to know this if testing the code e.g. with the test files + from XZ Utils. + +Reporting bugs + + Before reporting a bug, please check that it's not fixed already + at upstream. See to get the + latest code. + + Report bugs to or visit #tukaani on + Freenode and talk to Larhzu. I don't actively read LKML or other + kernel-related mailing lists, so if there's something I should know, + you should email to me personally or use IRC. + + Don't bother Igor Pavlov with questions about the XZ implementation + in the kernel or about XZ Utils. 
While these two implementations + include essential code that is directly based on Igor Pavlov's code, + these implementations aren't maintained nor supported by him. diff --git a/include/linux/xz.h b/include/linux/xz.h new file mode 100644 index 000000000000..64cffa6ddfce --- /dev/null +++ b/include/linux/xz.h @@ -0,0 +1,264 @@ +/* + * XZ decompressor + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_H +#define XZ_H + +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif + +/* In Linux, this is used to make extern functions static when needed. */ +#ifndef XZ_EXTERN +# define XZ_EXTERN extern +#endif + +/** + * enum xz_mode - Operation mode + * + * @XZ_SINGLE: Single-call mode. This uses less RAM than + * than multi-call modes, because the LZMA2 + * dictionary doesn't need to be allocated as + * part of the decoder state. All required data + * structures are allocated at initialization, + * so xz_dec_run() cannot return XZ_MEM_ERROR. + * @XZ_PREALLOC: Multi-call mode with preallocated LZMA2 + * dictionary buffer. All data structures are + * allocated at initialization, so xz_dec_run() + * cannot return XZ_MEM_ERROR. + * @XZ_DYNALLOC: Multi-call mode. The LZMA2 dictionary is + * allocated once the required size has been + * parsed from the stream headers. If the + * allocation fails, xz_dec_run() will return + * XZ_MEM_ERROR. + * + * It is possible to enable support only for a subset of the above + * modes at compile time by defining XZ_DEC_SINGLE, XZ_DEC_PREALLOC, + * or XZ_DEC_DYNALLOC. The xz_dec kernel module is always compiled + * with support for all operation modes, but the preboot code may + * be built with fewer features to minimize code size. + */ +enum xz_mode { + XZ_SINGLE, + XZ_PREALLOC, + XZ_DYNALLOC +}; + +/** + * enum xz_ret - Return codes + * @XZ_OK: Everything is OK so far. More input or more + * output space is required to continue. This + * return code is possible only in multi-call mode + * (XZ_PREALLOC or XZ_DYNALLOC). + * @XZ_STREAM_END: Operation finished successfully. + * @XZ_UNSUPPORTED_CHECK: Integrity check type is not supported. Decoding + * is still possible in multi-call mode by simply + * calling xz_dec_run() again. + * Note that this return value is used only if + * XZ_DEC_ANY_CHECK was defined at build time, + * which is not used in the kernel. Unsupported + * check types return XZ_OPTIONS_ERROR if + * XZ_DEC_ANY_CHECK was not defined at build time. + * @XZ_MEM_ERROR: Allocating memory failed. This return code is + * possible only if the decoder was initialized + * with XZ_DYNALLOC. The amount of memory that was + * tried to be allocated was no more than the + * dict_max argument given to xz_dec_init(). + * @XZ_MEMLIMIT_ERROR: A bigger LZMA2 dictionary would be needed than + * allowed by the dict_max argument given to + * xz_dec_init(). This return value is possible + * only in multi-call mode (XZ_PREALLOC or + * XZ_DYNALLOC); the single-call mode (XZ_SINGLE) + * ignores the dict_max argument. + * @XZ_FORMAT_ERROR: File format was not recognized (wrong magic + * bytes). + * @XZ_OPTIONS_ERROR: This implementation doesn't support the requested + * compression options. In the decoder this means + * that the header CRC32 matches, but the header + * itself specifies something that we don't support. + * @XZ_DATA_ERROR: Compressed data is corrupt. + * @XZ_BUF_ERROR: Cannot make any progress. 
Details are slightly + * different between multi-call and single-call + * mode; more information below. + * + * In multi-call mode, XZ_BUF_ERROR is returned when two consecutive calls + * to XZ code cannot consume any input and cannot produce any new output. + * This happens when there is no new input available, or the output buffer + * is full while at least one output byte is still pending. Assuming your + * code is not buggy, you can get this error only when decoding a compressed + * stream that is truncated or otherwise corrupt. + * + * In single-call mode, XZ_BUF_ERROR is returned only when the output buffer + * is too small or the compressed input is corrupt in a way that makes the + * decoder produce more output than the caller expected. When it is + * (relatively) clear that the compressed input is truncated, XZ_DATA_ERROR + * is used instead of XZ_BUF_ERROR. + */ +enum xz_ret { + XZ_OK, + XZ_STREAM_END, + XZ_UNSUPPORTED_CHECK, + XZ_MEM_ERROR, + XZ_MEMLIMIT_ERROR, + XZ_FORMAT_ERROR, + XZ_OPTIONS_ERROR, + XZ_DATA_ERROR, + XZ_BUF_ERROR +}; + +/** + * struct xz_buf - Passing input and output buffers to XZ code + * @in: Beginning of the input buffer. This may be NULL if and only + * if in_pos is equal to in_size. + * @in_pos: Current position in the input buffer. This must not exceed + * in_size. + * @in_size: Size of the input buffer + * @out: Beginning of the output buffer. This may be NULL if and only + * if out_pos is equal to out_size. + * @out_pos: Current position in the output buffer. This must not exceed + * out_size. + * @out_size: Size of the output buffer + * + * Only the contents of the output buffer from out[out_pos] onward, and + * the variables in_pos and out_pos are modified by the XZ code. + */ +struct xz_buf { + const uint8_t *in; + size_t in_pos; + size_t in_size; + + uint8_t *out; + size_t out_pos; + size_t out_size; +}; + +/** + * struct xz_dec - Opaque type to hold the XZ decoder state + */ +struct xz_dec; + +/** + * xz_dec_init() - Allocate and initialize a XZ decoder state + * @mode: Operation mode + * @dict_max: Maximum size of the LZMA2 dictionary (history buffer) for + * multi-call decoding. This is ignored in single-call mode + * (mode == XZ_SINGLE). LZMA2 dictionary is always 2^n bytes + * or 2^n + 2^(n-1) bytes (the latter sizes are less common + * in practice), so other values for dict_max don't make sense. + * In the kernel, dictionary sizes of 64 KiB, 128 KiB, 256 KiB, + * 512 KiB, and 1 MiB are probably the only reasonable values, + * except for kernel and initramfs images where a bigger + * dictionary can be fine and useful. + * + * Single-call mode (XZ_SINGLE): xz_dec_run() decodes the whole stream at + * once. The caller must provide enough output space or the decoding will + * fail. The output space is used as the dictionary buffer, which is why + * there is no need to allocate the dictionary as part of the decoder's + * internal state. + * + * Because the output buffer is used as the workspace, streams encoded using + * a big dictionary are not a problem in single-call mode. It is enough that + * the output buffer is big enough to hold the actual uncompressed data; it + * can be smaller than the dictionary size stored in the stream headers. + * + * Multi-call mode with preallocated dictionary (XZ_PREALLOC): dict_max bytes + * of memory is preallocated for the LZMA2 dictionary. This way there is no + * risk that xz_dec_run() could run out of memory, since xz_dec_run() will + * never allocate any memory. 
Instead, if the preallocated dictionary is too + * small for decoding the given input stream, xz_dec_run() will return + * XZ_MEMLIMIT_ERROR. Thus, it is important to know what kind of data will be + * decoded to avoid allocating excessive amount of memory for the dictionary. + * + * Multi-call mode with dynamically allocated dictionary (XZ_DYNALLOC): + * dict_max specifies the maximum allowed dictionary size that xz_dec_run() + * may allocate once it has parsed the dictionary size from the stream + * headers. This way excessive allocations can be avoided while still + * limiting the maximum memory usage to a sane value to prevent running the + * system out of memory when decompressing streams from untrusted sources. + * + * On success, xz_dec_init() returns a pointer to struct xz_dec, which is + * ready to be used with xz_dec_run(). If memory allocation fails, + * xz_dec_init() returns NULL. + */ +XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max); + +/** + * xz_dec_run() - Run the XZ decoder + * @s: Decoder state allocated using xz_dec_init() + * @b: Input and output buffers + * + * The possible return values depend on build options and operation mode. + * See enum xz_ret for details. + * + * Note that if an error occurs in single-call mode (return value is not + * XZ_STREAM_END), b->in_pos and b->out_pos are not modified and the + * contents of the output buffer from b->out[b->out_pos] onward are + * undefined. This is true even after XZ_BUF_ERROR, because with some filter + * chains, there may be a second pass over the output buffer, and this pass + * cannot be properly done if the output buffer is truncated. Thus, you + * cannot give the single-call decoder a too small buffer and then expect to + * get that amount valid data from the beginning of the stream. You must use + * the multi-call decoder if you don't want to uncompress the whole stream. + */ +XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b); + +/** + * xz_dec_reset() - Reset an already allocated decoder state + * @s: Decoder state allocated using xz_dec_init() + * + * This function can be used to reset the multi-call decoder state without + * freeing and reallocating memory with xz_dec_end() and xz_dec_init(). + * + * In single-call mode, xz_dec_reset() is always called in the beginning of + * xz_dec_run(). Thus, explicit call to xz_dec_reset() is useful only in + * multi-call mode. + */ +XZ_EXTERN void xz_dec_reset(struct xz_dec *s); + +/** + * xz_dec_end() - Free the memory allocated for the decoder state + * @s: Decoder state allocated using xz_dec_init(). If s is NULL, + * this function does nothing. + */ +XZ_EXTERN void xz_dec_end(struct xz_dec *s); + +/* + * Standalone build (userspace build or in-kernel build for boot time use) + * needs a CRC32 implementation. For normal in-kernel use, kernel's own + * CRC32 module is used instead, and users of this module don't need to + * care about the functions below. + */ +#ifndef XZ_INTERNAL_CRC32 +# ifdef __KERNEL__ +# define XZ_INTERNAL_CRC32 0 +# else +# define XZ_INTERNAL_CRC32 1 +# endif +#endif + +#if XZ_INTERNAL_CRC32 +/* + * This must be called before any other xz_* function to initialize + * the CRC32 lookup table. + */ +XZ_EXTERN void xz_crc32_init(void); + +/* + * Update CRC32 value using the polynomial from IEEE-802.3. To start a new + * calculation, the third argument must be zero. To continue the calculation, + * the previously returned value is passed as the third argument. 
+ */ +XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc); +#endif +#endif diff --git a/lib/Kconfig b/lib/Kconfig index 3116aa631af6..2b8f8540d670 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -106,6 +106,8 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +source "lib/xz/Kconfig" + # # These all provide a common interface (hence the apparent duplication with # ZLIB_INFLATE; DECOMPRESS_GZIP is just a wrapper.) diff --git a/lib/Makefile b/lib/Makefile index 2f59e0a1dd8d..4df2d0297721 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig new file mode 100644 index 000000000000..e3b6e18fdac5 --- /dev/null +++ b/lib/xz/Kconfig @@ -0,0 +1,59 @@ +config XZ_DEC + tristate "XZ decompression support" + select CRC32 + help + LZMA2 compression algorithm and BCJ filters are supported using + the .xz file format as the container. For integrity checking, + CRC32 is supported. See Documentation/xz.txt for more information. + +config XZ_DEC_X86 + bool "x86 BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_POWERPC + bool "PowerPC BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_IA64 + bool "IA-64 BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_ARM + bool "ARM BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_ARMTHUMB + bool "ARM-Thumb BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_SPARC + bool "SPARC BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_BCJ + bool + default n + +config XZ_DEC_TEST + tristate "XZ decompressor tester" + default n + depends on XZ_DEC + help + This allows passing .xz files to the in-kernel XZ decoder via + a character special file. It calculates CRC32 of the decompressed + data and writes diagnostics to the system log. + + Unless you are developing the XZ decoder, you don't need this + and should say N. diff --git a/lib/xz/Makefile b/lib/xz/Makefile new file mode 100644 index 000000000000..a7fa7693f0f3 --- /dev/null +++ b/lib/xz/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_XZ_DEC) += xz_dec.o +xz_dec-y := xz_dec_syms.o xz_dec_stream.o xz_dec_lzma2.o +xz_dec-$(CONFIG_XZ_DEC_BCJ) += xz_dec_bcj.o + +obj-$(CONFIG_XZ_DEC_TEST) += xz_dec_test.o diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c new file mode 100644 index 000000000000..34532d14fd4c --- /dev/null +++ b/lib/xz/xz_crc32.c @@ -0,0 +1,59 @@ +/* + * CRC32 using the polynomial from IEEE-802.3 + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +/* + * This is not the fastest implementation, but it is pretty compact. + * The fastest versions of xz_crc32() on modern CPUs without hardware + * accelerated CRC instruction are 3-5 times as fast as this version, + * but they are bigger and use more memory for the lookup table. + */ + +#include "xz_private.h" + +/* + * STATIC_RW_DATA is used in the pre-boot environment on some architectures. + * See for details. 
+ */ +#ifndef STATIC_RW_DATA +# define STATIC_RW_DATA static +#endif + +STATIC_RW_DATA uint32_t xz_crc32_table[256]; + +XZ_EXTERN void xz_crc32_init(void) +{ + const uint32_t poly = 0xEDB88320; + + uint32_t i; + uint32_t j; + uint32_t r; + + for (i = 0; i < 256; ++i) { + r = i; + for (j = 0; j < 8; ++j) + r = (r >> 1) ^ (poly & ~((r & 1) - 1)); + + xz_crc32_table[i] = r; + } + + return; +} + +XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + + while (size != 0) { + crc = xz_crc32_table[*buf++ ^ (crc & 0xFF)] ^ (crc >> 8); + --size; + } + + return ~crc; +} diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c new file mode 100644 index 000000000000..e51e2558ca9d --- /dev/null +++ b/lib/xz/xz_dec_bcj.c @@ -0,0 +1,561 @@ +/* + * Branch/Call/Jump (BCJ) filter decoders + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" + +/* + * The rest of the file is inside this ifdef. It makes things a little more + * convenient when building without support for any BCJ filters. + */ +#ifdef XZ_DEC_BCJ + +struct xz_dec_bcj { + /* Type of the BCJ filter being used */ + enum { + BCJ_X86 = 4, /* x86 or x86-64 */ + BCJ_POWERPC = 5, /* Big endian only */ + BCJ_IA64 = 6, /* Big or little endian */ + BCJ_ARM = 7, /* Little endian only */ + BCJ_ARMTHUMB = 8, /* Little endian only */ + BCJ_SPARC = 9 /* Big or little endian */ + } type; + + /* + * Return value of the next filter in the chain. We need to preserve + * this information across calls, because we must not call the next + * filter anymore once it has returned XZ_STREAM_END. + */ + enum xz_ret ret; + + /* True if we are operating in single-call mode. */ + bool single_call; + + /* + * Absolute position relative to the beginning of the uncompressed + * data (in a single .xz Block). We care only about the lowest 32 + * bits so this doesn't need to be uint64_t even with big files. + */ + uint32_t pos; + + /* x86 filter state */ + uint32_t x86_prev_mask; + + /* Temporary space to hold the variables from struct xz_buf */ + uint8_t *out; + size_t out_pos; + size_t out_size; + + struct { + /* Amount of already filtered data in the beginning of buf */ + size_t filtered; + + /* Total amount of data currently stored in buf */ + size_t size; + + /* + * Buffer to hold a mix of filtered and unfiltered data. This + * needs to be big enough to hold Alignment + 2 * Look-ahead: + * + * Type Alignment Look-ahead + * x86 1 4 + * PowerPC 4 0 + * IA-64 16 0 + * ARM 4 0 + * ARM-Thumb 2 2 + * SPARC 4 0 + */ + uint8_t buf[16]; + } temp; +}; + +#ifdef XZ_DEC_X86 +/* + * This is used to test the most significant byte of a memory address + * in an x86 instruction. 
+ */ +static inline int bcj_x86_test_msbyte(uint8_t b) +{ + return b == 0x00 || b == 0xFF; +} + +static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static const bool mask_to_allowed_status[8] + = { true, true, true, false, true, false, false, false }; + + static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; + + size_t i; + size_t prev_pos = (size_t)-1; + uint32_t prev_mask = s->x86_prev_mask; + uint32_t src; + uint32_t dest; + uint32_t j; + uint8_t b; + + if (size <= 4) + return 0; + + size -= 4; + for (i = 0; i < size; ++i) { + if ((buf[i] & 0xFE) != 0xE8) + continue; + + prev_pos = i - prev_pos; + if (prev_pos > 3) { + prev_mask = 0; + } else { + prev_mask = (prev_mask << (prev_pos - 1)) & 7; + if (prev_mask != 0) { + b = buf[i + 4 - mask_to_bit_num[prev_mask]]; + if (!mask_to_allowed_status[prev_mask] + || bcj_x86_test_msbyte(b)) { + prev_pos = i; + prev_mask = (prev_mask << 1) | 1; + continue; + } + } + } + + prev_pos = i; + + if (bcj_x86_test_msbyte(buf[i + 4])) { + src = get_unaligned_le32(buf + i + 1); + while (true) { + dest = src - (s->pos + (uint32_t)i + 5); + if (prev_mask == 0) + break; + + j = mask_to_bit_num[prev_mask] * 8; + b = (uint8_t)(dest >> (24 - j)); + if (!bcj_x86_test_msbyte(b)) + break; + + src = dest ^ (((uint32_t)1 << (32 - j)) - 1); + } + + dest &= 0x01FFFFFF; + dest |= (uint32_t)0 - (dest & 0x01000000); + put_unaligned_le32(dest, buf + i + 1); + i += 4; + } else { + prev_mask = (prev_mask << 1) | 1; + } + } + + prev_pos = i - prev_pos; + s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); + return i; +} +#endif + +#ifdef XZ_DEC_POWERPC +static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr & 0xFC000003) == 0x48000001) { + instr &= 0x03FFFFFC; + instr -= s->pos + (uint32_t)i; + instr &= 0x03FFFFFC; + instr |= 0x48000001; + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_IA64 +static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static const uint8_t branch_table[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 6, 6, 0, 0, 7, 7, + 4, 4, 0, 0, 4, 4, 0, 0 + }; + + /* + * The local variables take a little bit stack space, but it's less + * than what LZMA2 decoder takes, so it doesn't make sense to reduce + * stack usage here without doing that for the LZMA2 decoder too. 
+ */ + + /* Loop counters */ + size_t i; + size_t j; + + /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ + uint32_t slot; + + /* Bitwise offset of the instruction indicated by slot */ + uint32_t bit_pos; + + /* bit_pos split into byte and bit parts */ + uint32_t byte_pos; + uint32_t bit_res; + + /* Address part of an instruction */ + uint32_t addr; + + /* Mask used to detect which instructions to convert */ + uint32_t mask; + + /* 41-bit instruction stored somewhere in the lowest 48 bits */ + uint64_t instr; + + /* Instruction normalized with bit_res for easier manipulation */ + uint64_t norm; + + for (i = 0; i + 16 <= size; i += 16) { + mask = branch_table[buf[i] & 0x1F]; + for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { + if (((mask >> slot) & 1) == 0) + continue; + + byte_pos = bit_pos >> 3; + bit_res = bit_pos & 7; + instr = 0; + for (j = 0; j < 6; ++j) + instr |= (uint64_t)(buf[i + j + byte_pos]) + << (8 * j); + + norm = instr >> bit_res; + + if (((norm >> 37) & 0x0F) == 0x05 + && ((norm >> 9) & 0x07) == 0) { + addr = (norm >> 13) & 0x0FFFFF; + addr |= ((uint32_t)(norm >> 36) & 1) << 20; + addr <<= 4; + addr -= s->pos + (uint32_t)i; + addr >>= 4; + + norm &= ~((uint64_t)0x8FFFFF << 13); + norm |= (uint64_t)(addr & 0x0FFFFF) << 13; + norm |= (uint64_t)(addr & 0x100000) + << (36 - 20); + + instr &= (1 << bit_res) - 1; + instr |= norm << bit_res; + + for (j = 0; j < 6; j++) + buf[i + j + byte_pos] + = (uint8_t)(instr >> (8 * j)); + } + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARM +static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 4) { + if (buf[i + 3] == 0xEB) { + addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) + | ((uint32_t)buf[i + 2] << 16); + addr <<= 2; + addr -= s->pos + (uint32_t)i + 8; + addr >>= 2; + buf[i] = (uint8_t)addr; + buf[i + 1] = (uint8_t)(addr >> 8); + buf[i + 2] = (uint8_t)(addr >> 16); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARMTHUMB +static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 2) { + if ((buf[i + 1] & 0xF8) == 0xF0 + && (buf[i + 3] & 0xF8) == 0xF8) { + addr = (((uint32_t)buf[i + 1] & 0x07) << 19) + | ((uint32_t)buf[i] << 11) + | (((uint32_t)buf[i + 3] & 0x07) << 8) + | (uint32_t)buf[i + 2]; + addr <<= 1; + addr -= s->pos + (uint32_t)i + 4; + addr >>= 1; + buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); + buf[i] = (uint8_t)(addr >> 11); + buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); + buf[i + 2] = (uint8_t)addr; + i += 2; + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_SPARC +static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { + instr <<= 2; + instr -= s->pos + (uint32_t)i; + instr >>= 2; + instr = ((uint32_t)0x40000000 - (instr & 0x400000)) + | 0x40000000 | (instr & 0x3FFFFF); + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +/* + * Apply the selected BCJ filter. Update *pos and s->pos to match the amount + * of data that got filtered. + * + * NOTE: This is implemented as a switch statement to avoid using function + * pointers, which could be problematic in the kernel boot code, which must + * avoid pointers to static data (at least on x86). 
+ */ +static void bcj_apply(struct xz_dec_bcj *s, + uint8_t *buf, size_t *pos, size_t size) +{ + size_t filtered; + + buf += *pos; + size -= *pos; + + switch (s->type) { +#ifdef XZ_DEC_X86 + case BCJ_X86: + filtered = bcj_x86(s, buf, size); + break; +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: + filtered = bcj_powerpc(s, buf, size); + break; +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: + filtered = bcj_ia64(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: + filtered = bcj_arm(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: + filtered = bcj_armthumb(s, buf, size); + break; +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: + filtered = bcj_sparc(s, buf, size); + break; +#endif + default: + /* Never reached but silence compiler warnings. */ + filtered = 0; + break; + } + + *pos += filtered; + s->pos += filtered; +} + +/* + * Flush pending filtered data from temp to the output buffer. + * Move the remaining mixture of possibly filtered and unfiltered + * data to the beginning of temp. + */ +static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) +{ + size_t copy_size; + + copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); + memcpy(b->out + b->out_pos, s->temp.buf, copy_size); + b->out_pos += copy_size; + + s->temp.filtered -= copy_size; + s->temp.size -= copy_size; + memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); +} + +/* + * The BCJ filter functions are primitive in sense that they process the + * data in chunks of 1-16 bytes. To hide this issue, this function does + * some buffering. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b) +{ + size_t out_start; + + /* + * Flush pending already filtered data to the output buffer. Return + * immediatelly if we couldn't flush everything, or if the next + * filter in the chain had already returned XZ_STREAM_END. + */ + if (s->temp.filtered > 0) { + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + } + + /* + * If we have more output space than what is currently pending in + * temp, copy the unfiltered data from temp to the output buffer + * and try to fill the output buffer by decoding more data from the + * next filter in the chain. Apply the BCJ filter on the new data + * in the output buffer. If everything cannot be filtered, copy it + * to temp and rewind the output buffer position accordingly. + */ + if (s->temp.size < b->out_size - b->out_pos) { + out_start = b->out_pos; + memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); + b->out_pos += s->temp.size; + + s->ret = xz_dec_lzma2_run(lzma2, b); + if (s->ret != XZ_STREAM_END + && (s->ret != XZ_OK || s->single_call)) + return s->ret; + + bcj_apply(s, b->out, &out_start, b->out_pos); + + /* + * As an exception, if the next filter returned XZ_STREAM_END, + * we can do that too, since the last few bytes that remain + * unfiltered are meant to remain unfiltered. + */ + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + + s->temp.size = b->out_pos - out_start; + b->out_pos -= s->temp.size; + memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); + } + + /* + * If we have unfiltered data in temp, try to fill by decoding more + * data from the next filter. Apply the BCJ filter on temp. Then we + * hopefully can fill the actual output buffer by copying filtered + * data from temp. 
A mix of filtered and unfiltered data may be left + * in temp; it will be taken care on the next call to this function. + */ + if (s->temp.size > 0) { + /* Make b->out{,_pos,_size} temporarily point to s->temp. */ + s->out = b->out; + s->out_pos = b->out_pos; + s->out_size = b->out_size; + b->out = s->temp.buf; + b->out_pos = s->temp.size; + b->out_size = sizeof(s->temp.buf); + + s->ret = xz_dec_lzma2_run(lzma2, b); + + s->temp.size = b->out_pos; + b->out = s->out; + b->out_pos = s->out_pos; + b->out_size = s->out_size; + + if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) + return s->ret; + + bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); + + /* + * If the next filter returned XZ_STREAM_END, we mark that + * everything is filtered, since the last unfiltered bytes + * of the stream are meant to be left as is. + */ + if (s->ret == XZ_STREAM_END) + s->temp.filtered = s->temp.size; + + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + } + + return s->ret; +} + +XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) +{ + struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s != NULL) + s->single_call = single_call; + + return s; +} + +XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) +{ + switch (id) { +#ifdef XZ_DEC_X86 + case BCJ_X86: +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: +#endif + break; + + default: + /* Unsupported Filter ID */ + return XZ_OPTIONS_ERROR; + } + + s->type = id; + s->ret = XZ_OK; + s->pos = 0; + s->x86_prev_mask = 0; + s->temp.filtered = 0; + s->temp.size = 0; + + return XZ_OK; +} + +#endif diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c new file mode 100644 index 000000000000..ea5fa4fe9d67 --- /dev/null +++ b/lib/xz/xz_dec_lzma2.c @@ -0,0 +1,1171 @@ +/* + * LZMA2 decoder + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" +#include "xz_lzma2.h" + +/* + * Range decoder initialization eats the first five bytes of each LZMA chunk. + */ +#define RC_INIT_BYTES 5 + +/* + * Minimum number of usable input buffer to safely decode one LZMA symbol. + * The worst case is that we decode 22 bits using probabilities and 26 + * direct bits. This may decode at maximum of 20 bytes of input. However, + * lzma_main() does an extra normalization before returning, thus we + * need to put 21 here. + */ +#define LZMA_IN_REQUIRED 21 + +/* + * Dictionary (history buffer) + * + * These are always true: + * start <= pos <= full <= end + * pos <= limit <= end + * + * In multi-call mode, also these are true: + * end == size + * size <= size_max + * allocated <= size + * + * Most of these variables are size_t to support single-call mode, + * in which the dictionary variables address the actual output + * buffer directly. + */ +struct dictionary { + /* Beginning of the history buffer */ + uint8_t *buf; + + /* Old position in buf (before decoding more data) */ + size_t start; + + /* Position in buf */ + size_t pos; + + /* + * How full dictionary is. This is used to detect corrupt input that + * would read beyond the beginning of the uncompressed stream. + */ + size_t full; + + /* Write limit; we don't write to buf[limit] or later bytes. */ + size_t limit; + + /* + * End of the dictionary buffer. 
In multi-call mode, this is + * the same as the dictionary size. In single-call mode, this + * indicates the size of the output buffer. + */ + size_t end; + + /* + * Size of the dictionary as specified in Block Header. This is used + * together with "full" to detect corrupt input that would make us + * read beyond the beginning of the uncompressed stream. + */ + uint32_t size; + + /* + * Maximum allowed dictionary size in multi-call mode. + * This is ignored in single-call mode. + */ + uint32_t size_max; + + /* + * Amount of memory currently allocated for the dictionary. + * This is used only with XZ_DYNALLOC. (With XZ_PREALLOC, + * size_max is always the same as the allocated size.) + */ + uint32_t allocated; + + /* Operation mode */ + enum xz_mode mode; +}; + +/* Range decoder */ +struct rc_dec { + uint32_t range; + uint32_t code; + + /* + * Number of initializing bytes remaining to be read + * by rc_read_init(). + */ + uint32_t init_bytes_left; + + /* + * Buffer from which we read our input. It can be either + * temp.buf or the caller-provided input buffer. + */ + const uint8_t *in; + size_t in_pos; + size_t in_limit; +}; + +/* Probabilities for a length decoder. */ +struct lzma_len_dec { + /* Probability of match length being at least 10 */ + uint16_t choice; + + /* Probability of match length being at least 18 */ + uint16_t choice2; + + /* Probabilities for match lengths 2-9 */ + uint16_t low[POS_STATES_MAX][LEN_LOW_SYMBOLS]; + + /* Probabilities for match lengths 10-17 */ + uint16_t mid[POS_STATES_MAX][LEN_MID_SYMBOLS]; + + /* Probabilities for match lengths 18-273 */ + uint16_t high[LEN_HIGH_SYMBOLS]; +}; + +struct lzma_dec { + /* Distances of latest four matches */ + uint32_t rep0; + uint32_t rep1; + uint32_t rep2; + uint32_t rep3; + + /* Types of the most recently seen LZMA symbols */ + enum lzma_state state; + + /* + * Length of a match. This is updated so that dict_repeat can + * be called again to finish repeating the whole match. + */ + uint32_t len; + + /* + * LZMA properties or related bit masks (number of literal + * context bits, a mask dervied from the number of literal + * position bits, and a mask dervied from the number + * position bits) + */ + uint32_t lc; + uint32_t literal_pos_mask; /* (1 << lp) - 1 */ + uint32_t pos_mask; /* (1 << pb) - 1 */ + + /* If 1, it's a match. Otherwise it's a single 8-bit literal. */ + uint16_t is_match[STATES][POS_STATES_MAX]; + + /* If 1, it's a repeated match. The distance is one of rep0 .. rep3. */ + uint16_t is_rep[STATES]; + + /* + * If 0, distance of a repeated match is rep0. + * Otherwise check is_rep1. + */ + uint16_t is_rep0[STATES]; + + /* + * If 0, distance of a repeated match is rep1. + * Otherwise check is_rep2. + */ + uint16_t is_rep1[STATES]; + + /* If 0, distance of a repeated match is rep2. Otherwise it is rep3. */ + uint16_t is_rep2[STATES]; + + /* + * If 1, the repeated match has length of one byte. Otherwise + * the length is decoded from rep_len_decoder. + */ + uint16_t is_rep0_long[STATES][POS_STATES_MAX]; + + /* + * Probability tree for the highest two bits of the match + * distance. There is a separate probability tree for match + * lengths of 2 (i.e. MATCH_LEN_MIN), 3, 4, and [5, 273]. + */ + uint16_t dist_slot[DIST_STATES][DIST_SLOTS]; + + /* + * Probility trees for additional bits for match distance + * when the distance is in the range [4, 127]. 
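Illustration (not part of the patch): the three-tier match-length layout documented for struct lzma_len_dec above is easy to check with plain arithmetic. toy_len() is a made-up helper that takes the choice/choice2 bits and the bittree symbol as if they had already been pulled out of the range decoder; the real combination of the pieces is done by lzma_len() further down in this file.

#include <stdint.h>
#include <stdio.h>

/* Mirror of the lzma_len() arithmetic, with the already-decoded bits and
 * the bittree symbol passed in directly instead of coming from rc_bit(). */
static uint32_t toy_len(int choice, int choice2, uint32_t symbol)
{
	if (!choice)
		return 2 + symbol;         /* low tier:  lengths 2..9     */
	if (!choice2)
		return 2 + 8 + symbol;     /* mid tier:  lengths 10..17   */
	return 2 + 8 + 8 + symbol;         /* high tier: lengths 18..273  */
}

int main(void)
{
	printf("low  symbol 0   -> len %u\n", (unsigned)toy_len(0, 0, 0));   /* 2   */
	printf("mid  symbol 7   -> len %u\n", (unsigned)toy_len(1, 0, 7));   /* 17  */
	printf("high symbol 255 -> len %u\n", (unsigned)toy_len(1, 1, 255)); /* 273 */
	return 0;
}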
+ */ + uint16_t dist_special[FULL_DISTANCES - DIST_MODEL_END]; + + /* + * Probability tree for the lowest four bits of a match + * distance that is equal to or greater than 128. + */ + uint16_t dist_align[ALIGN_SIZE]; + + /* Length of a normal match */ + struct lzma_len_dec match_len_dec; + + /* Length of a repeated match */ + struct lzma_len_dec rep_len_dec; + + /* Probabilities of literals */ + uint16_t literal[LITERAL_CODERS_MAX][LITERAL_CODER_SIZE]; +}; + +struct lzma2_dec { + /* Position in xz_dec_lzma2_run(). */ + enum lzma2_seq { + SEQ_CONTROL, + SEQ_UNCOMPRESSED_1, + SEQ_UNCOMPRESSED_2, + SEQ_COMPRESSED_0, + SEQ_COMPRESSED_1, + SEQ_PROPERTIES, + SEQ_LZMA_PREPARE, + SEQ_LZMA_RUN, + SEQ_COPY + } sequence; + + /* Next position after decoding the compressed size of the chunk. */ + enum lzma2_seq next_sequence; + + /* Uncompressed size of LZMA chunk (2 MiB at maximum) */ + uint32_t uncompressed; + + /* + * Compressed size of LZMA chunk or compressed/uncompressed + * size of uncompressed chunk (64 KiB at maximum) + */ + uint32_t compressed; + + /* + * True if dictionary reset is needed. This is false before + * the first chunk (LZMA or uncompressed). + */ + bool need_dict_reset; + + /* + * True if new LZMA properties are needed. This is false + * before the first LZMA chunk. + */ + bool need_props; +}; + +struct xz_dec_lzma2 { + /* + * The order below is important on x86 to reduce code size and + * it shouldn't hurt on other platforms. Everything up to and + * including lzma.pos_mask are in the first 128 bytes on x86-32, + * which allows using smaller instructions to access those + * variables. On x86-64, fewer variables fit into the first 128 + * bytes, but this is still the best order without sacrificing + * the readability by splitting the structures. + */ + struct rc_dec rc; + struct dictionary dict; + struct lzma2_dec lzma2; + struct lzma_dec lzma; + + /* + * Temporary buffer which holds small number of input bytes between + * decoder calls. See lzma2_lzma() for details. + */ + struct { + uint32_t size; + uint8_t buf[3 * LZMA_IN_REQUIRED]; + } temp; +}; + +/************** + * Dictionary * + **************/ + +/* + * Reset the dictionary state. When in single-call mode, set up the beginning + * of the dictionary to point to the actual output buffer. + */ +static void dict_reset(struct dictionary *dict, struct xz_buf *b) +{ + if (DEC_IS_SINGLE(dict->mode)) { + dict->buf = b->out + b->out_pos; + dict->end = b->out_size - b->out_pos; + } + + dict->start = 0; + dict->pos = 0; + dict->limit = 0; + dict->full = 0; +} + +/* Set dictionary write limit */ +static void dict_limit(struct dictionary *dict, size_t out_max) +{ + if (dict->end - dict->pos <= out_max) + dict->limit = dict->end; + else + dict->limit = dict->pos + out_max; +} + +/* Return true if at least one byte can be written into the dictionary. */ +static inline bool dict_has_space(const struct dictionary *dict) +{ + return dict->pos < dict->limit; +} + +/* + * Get a byte from the dictionary at the given distance. The distance is + * assumed to valid, or as a special case, zero when the dictionary is + * still empty. This special case is needed for single-call decoding to + * avoid writing a '\0' to the end of the destination buffer. + */ +static inline uint32_t dict_get(const struct dictionary *dict, uint32_t dist) +{ + size_t offset = dict->pos - dist - 1; + + if (dist >= dict->pos) + offset += dict->end; + + return dict->full > 0 ? dict->buf[offset] : 0; +} + +/* + * Put one byte into the dictionary. 
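Illustration (not part of the patch): the offset arithmetic of dict_get() above, run on a made-up 8-byte ring buffer in user space. The intermediate size_t wrap-around when dist >= pos is intentional and is compensated by the "+= end", exactly as in the kernel code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 8-byte ring after writing "abcdefgh": "gh" wrapped to the front */
	uint8_t buf[8] = { 'g', 'h', 'a', 'b', 'c', 'd', 'e', 'f' };
	size_t end = 8, pos = 2, full = 8;
	uint32_t dist;

	for (dist = 0; dist < 4; ++dist) {
		size_t offset = pos - dist - 1;  /* may wrap below zero... */
		if (dist >= pos)
			offset += end;           /* ...then lands back in the ring */
		printf("dist %u -> '%c'\n", (unsigned)dist,
		       full > 0 ? buf[offset] : 0);
	}
	return 0;                                /* prints h, g, f, e      */
}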
It is assumed that there is space for it. + */ +static inline void dict_put(struct dictionary *dict, uint8_t byte) +{ + dict->buf[dict->pos++] = byte; + + if (dict->full < dict->pos) + dict->full = dict->pos; +} + +/* + * Repeat given number of bytes from the given distance. If the distance is + * invalid, false is returned. On success, true is returned and *len is + * updated to indicate how many bytes were left to be repeated. + */ +static bool dict_repeat(struct dictionary *dict, uint32_t *len, uint32_t dist) +{ + size_t back; + uint32_t left; + + if (dist >= dict->full || dist >= dict->size) + return false; + + left = min_t(size_t, dict->limit - dict->pos, *len); + *len -= left; + + back = dict->pos - dist - 1; + if (dist >= dict->pos) + back += dict->end; + + do { + dict->buf[dict->pos++] = dict->buf[back++]; + if (back == dict->end) + back = 0; + } while (--left > 0); + + if (dict->full < dict->pos) + dict->full = dict->pos; + + return true; +} + +/* Copy uncompressed data as is from input to dictionary and output buffers. */ +static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, + uint32_t *left) +{ + size_t copy_size; + + while (*left > 0 && b->in_pos < b->in_size + && b->out_pos < b->out_size) { + copy_size = min(b->in_size - b->in_pos, + b->out_size - b->out_pos); + if (copy_size > dict->end - dict->pos) + copy_size = dict->end - dict->pos; + if (copy_size > *left) + copy_size = *left; + + *left -= copy_size; + + memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); + dict->pos += copy_size; + + if (dict->full < dict->pos) + dict->full = dict->pos; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, b->in + b->in_pos, + copy_size); + } + + dict->start = dict->pos; + + b->out_pos += copy_size; + b->in_pos += copy_size; + } +} + +/* + * Flush pending data from dictionary to b->out. It is assumed that there is + * enough space in b->out. This is guaranteed because caller uses dict_limit() + * before decoding data into the dictionary. + */ +static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) +{ + size_t copy_size = dict->pos - dict->start; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); + } + + dict->start = dict->pos; + b->out_pos += copy_size; + return copy_size; +} + +/***************** + * Range decoder * + *****************/ + +/* Reset the range decoder. */ +static void rc_reset(struct rc_dec *rc) +{ + rc->range = (uint32_t)-1; + rc->code = 0; + rc->init_bytes_left = RC_INIT_BYTES; +} + +/* + * Read the first five initial bytes into rc->code if they haven't been + * read already. (Yes, the first byte gets completely ignored.) + */ +static bool rc_read_init(struct rc_dec *rc, struct xz_buf *b) +{ + while (rc->init_bytes_left > 0) { + if (b->in_pos == b->in_size) + return false; + + rc->code = (rc->code << 8) + b->in[b->in_pos++]; + --rc->init_bytes_left; + } + + return true; +} + +/* Return true if there may not be enough input for the next decoding loop. */ +static inline bool rc_limit_exceeded(const struct rc_dec *rc) +{ + return rc->in_pos > rc->in_limit; +} + +/* + * Return true if it is possible (from point of view of range decoder) that + * we have reached the end of the LZMA chunk. + */ +static inline bool rc_is_finished(const struct rc_dec *rc) +{ + return rc->code == 0; +} + +/* Read the next input byte if needed. 
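Illustration (not part of the patch): why dict_repeat() above copies byte by byte instead of using memcpy(). With an overlapping match (here stored distance 0, i.e. "repeat the previous byte"), every byte written immediately becomes source data for the next iteration. The buffer contents and length are made up, and the ring wrap-around and limit handling are left out to keep the sketch short.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Toy linear "dictionary": two bytes already decoded, room for more */
	uint8_t buf[16] = "ab";
	size_t pos = 2;
	uint32_t dist = 0;        /* stored distance 0 == "previous byte"  */
	uint32_t len = 4;
	size_t back = pos - dist - 1;

	/* A whole-range memcpy() would read bytes that have not been
	 * written yet; the byte-by-byte loop gets overlapping matches
	 * (run-length style output) right. */
	do {
		buf[pos++] = buf[back++];
	} while (--len > 0);

	printf("%.*s\n", (int)pos, (const char *)buf);   /* "abbbbb"      */
	return 0;
}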
*/ +static __always_inline void rc_normalize(struct rc_dec *rc) +{ + if (rc->range < RC_TOP_VALUE) { + rc->range <<= RC_SHIFT_BITS; + rc->code = (rc->code << RC_SHIFT_BITS) + rc->in[rc->in_pos++]; + } +} + +/* + * Decode one bit. In some versions, this function has been splitted in three + * functions so that the compiler is supposed to be able to more easily avoid + * an extra branch. In this particular version of the LZMA decoder, this + * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 + * on x86). Using a non-splitted version results in nicer looking code too. + * + * NOTE: This must return an int. Do not make it return a bool or the speed + * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, + * and it generates 10-20 % faster code than GCC 3.x from this file anyway.) + */ +static __always_inline int rc_bit(struct rc_dec *rc, uint16_t *prob) +{ + uint32_t bound; + int bit; + + rc_normalize(rc); + bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * *prob; + if (rc->code < bound) { + rc->range = bound; + *prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS; + bit = 0; + } else { + rc->range -= bound; + rc->code -= bound; + *prob -= *prob >> RC_MOVE_BITS; + bit = 1; + } + + return bit; +} + +/* Decode a bittree starting from the most significant bit. */ +static __always_inline uint32_t rc_bittree(struct rc_dec *rc, + uint16_t *probs, uint32_t limit) +{ + uint32_t symbol = 1; + + do { + if (rc_bit(rc, &probs[symbol])) + symbol = (symbol << 1) + 1; + else + symbol <<= 1; + } while (symbol < limit); + + return symbol; +} + +/* Decode a bittree starting from the least significant bit. */ +static __always_inline void rc_bittree_reverse(struct rc_dec *rc, + uint16_t *probs, + uint32_t *dest, uint32_t limit) +{ + uint32_t symbol = 1; + uint32_t i = 0; + + do { + if (rc_bit(rc, &probs[symbol])) { + symbol = (symbol << 1) + 1; + *dest += 1 << i; + } else { + symbol <<= 1; + } + } while (++i < limit); +} + +/* Decode direct bits (fixed fifty-fifty probability) */ +static inline void rc_direct(struct rc_dec *rc, uint32_t *dest, uint32_t limit) +{ + uint32_t mask; + + do { + rc_normalize(rc); + rc->range >>= 1; + rc->code -= rc->range; + mask = (uint32_t)0 - (rc->code >> 31); + rc->code += rc->range & mask; + *dest = (*dest << 1) + (mask + 1); + } while (--limit > 0); +} + +/******** + * LZMA * + ********/ + +/* Get pointer to literal coder probability array. 
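Illustration (not part of the patch): a user-space rerun of rc_normalize()/rc_bit() above, to make the probability adaptation visible. The RC_* constants are copied from xz_lzma2.h later in this patch, the toy_* names are invented, and the input bytes are arbitrary; the first byte is effectively ignored, as noted at rc_read_init().

#include <stdint.h>
#include <stdio.h>

#define RC_SHIFT_BITS 8
#define RC_TOP_VALUE (1 << 24)
#define RC_BIT_MODEL_TOTAL_BITS 11
#define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS)
#define RC_MOVE_BITS 5

struct toy_rc {
	uint32_t range;
	uint32_t code;
	const uint8_t *in;
	size_t in_pos;
};

static void toy_normalize(struct toy_rc *rc)
{
	if (rc->range < RC_TOP_VALUE) {
		rc->range <<= RC_SHIFT_BITS;
		rc->code = (rc->code << RC_SHIFT_BITS) + rc->in[rc->in_pos++];
	}
}

static int toy_bit(struct toy_rc *rc, uint16_t *prob)
{
	uint32_t bound;
	int bit;

	toy_normalize(rc);
	bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * *prob;
	if (rc->code < bound) {
		rc->range = bound;
		*prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS;
		bit = 0;
	} else {
		rc->range -= bound;
		rc->code -= bound;
		*prob -= *prob >> RC_MOVE_BITS;
		bit = 1;
	}
	return bit;
}

int main(void)
{
	/* Made-up "compressed" bytes; any values work for watching the math */
	static const uint8_t in[] = { 0x00, 0x12, 0x34, 0x56, 0x78,
				      0x9A, 0xBC, 0xDE, 0xF0, 0x11 };
	struct toy_rc rc = { .range = (uint32_t)-1, .code = 0, .in = in };
	uint16_t prob = RC_BIT_MODEL_TOTAL / 2;   /* same as lzma_reset()  */
	int i;

	/* rc_read_init() equivalent: load the first five bytes */
	for (i = 0; i < 5; ++i)
		rc.code = (rc.code << 8) + rc.in[rc.in_pos++];

	for (i = 0; i < 4; ++i) {
		int bit = toy_bit(&rc, &prob);
		printf("bit=%d prob=%u range=0x%08X\n",
		       bit, (unsigned)prob, (unsigned)rc.range);
	}
	return 0;
}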
*/ +static uint16_t *lzma_literal_probs(struct xz_dec_lzma2 *s) +{ + uint32_t prev_byte = dict_get(&s->dict, 0); + uint32_t low = prev_byte >> (8 - s->lzma.lc); + uint32_t high = (s->dict.pos & s->lzma.literal_pos_mask) << s->lzma.lc; + return s->lzma.literal[low + high]; +} + +/* Decode a literal (one 8-bit byte) */ +static void lzma_literal(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + uint32_t symbol; + uint32_t match_byte; + uint32_t match_bit; + uint32_t offset; + uint32_t i; + + probs = lzma_literal_probs(s); + + if (lzma_state_is_literal(s->lzma.state)) { + symbol = rc_bittree(&s->rc, probs, 0x100); + } else { + symbol = 1; + match_byte = dict_get(&s->dict, s->lzma.rep0) << 1; + offset = 0x100; + + do { + match_bit = match_byte & offset; + match_byte <<= 1; + i = offset + match_bit + symbol; + + if (rc_bit(&s->rc, &probs[i])) { + symbol = (symbol << 1) + 1; + offset &= match_bit; + } else { + symbol <<= 1; + offset &= ~match_bit; + } + } while (symbol < 0x100); + } + + dict_put(&s->dict, (uint8_t)symbol); + lzma_state_literal(&s->lzma.state); +} + +/* Decode the length of the match into s->lzma.len. */ +static void lzma_len(struct xz_dec_lzma2 *s, struct lzma_len_dec *l, + uint32_t pos_state) +{ + uint16_t *probs; + uint32_t limit; + + if (!rc_bit(&s->rc, &l->choice)) { + probs = l->low[pos_state]; + limit = LEN_LOW_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN; + } else { + if (!rc_bit(&s->rc, &l->choice2)) { + probs = l->mid[pos_state]; + limit = LEN_MID_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS; + } else { + probs = l->high; + limit = LEN_HIGH_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS + + LEN_MID_SYMBOLS; + } + } + + s->lzma.len += rc_bittree(&s->rc, probs, limit) - limit; +} + +/* Decode a match. The distance will be stored in s->lzma.rep0. */ +static void lzma_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint16_t *probs; + uint32_t dist_slot; + uint32_t limit; + + lzma_state_match(&s->lzma.state); + + s->lzma.rep3 = s->lzma.rep2; + s->lzma.rep2 = s->lzma.rep1; + s->lzma.rep1 = s->lzma.rep0; + + lzma_len(s, &s->lzma.match_len_dec, pos_state); + + probs = s->lzma.dist_slot[lzma_get_dist_state(s->lzma.len)]; + dist_slot = rc_bittree(&s->rc, probs, DIST_SLOTS) - DIST_SLOTS; + + if (dist_slot < DIST_MODEL_START) { + s->lzma.rep0 = dist_slot; + } else { + limit = (dist_slot >> 1) - 1; + s->lzma.rep0 = 2 + (dist_slot & 1); + + if (dist_slot < DIST_MODEL_END) { + s->lzma.rep0 <<= limit; + probs = s->lzma.dist_special + s->lzma.rep0 + - dist_slot - 1; + rc_bittree_reverse(&s->rc, probs, + &s->lzma.rep0, limit); + } else { + rc_direct(&s->rc, &s->lzma.rep0, limit - ALIGN_BITS); + s->lzma.rep0 <<= ALIGN_BITS; + rc_bittree_reverse(&s->rc, s->lzma.dist_align, + &s->lzma.rep0, ALIGN_BITS); + } + } +} + +/* + * Decode a repeated match. The distance is one of the four most recently + * seen matches. The distance will be stored in s->lzma.rep0. 
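Illustration (not part of the patch): what the distance-slot arithmetic in lzma_match() above works out to. slot_range() is a made-up helper; it reproduces only the base and the width of each slot and ignores how the low bits get filled in (reverse bittree vs. direct plus align bits). The printed numbers are the raw rep0 values; the actual copy distance is rep0 + 1, as used by dict_repeat().

#include <stdint.h>
#include <stdio.h>

static void slot_range(uint32_t slot, uint32_t *lo, uint32_t *hi)
{
	uint32_t limit;

	if (slot < 4) {                       /* DIST_MODEL_START         */
		*lo = *hi = slot;             /* slot is the value itself */
	} else {
		limit = (slot >> 1) - 1;
		*lo = (2 + (slot & 1)) << limit;
		*hi = *lo + ((uint32_t)1 << limit) - 1;
	}
}

int main(void)
{
	uint32_t slot, lo, hi;

	for (slot = 0; slot < 16; ++slot) {
		slot_range(slot, &lo, &hi);
		printf("slot %2u -> rep0 %u..%u\n",
		       (unsigned)slot, (unsigned)lo, (unsigned)hi);
	}
	return 0;    /* e.g. slot 13 -> 96..127, slot 14 -> 128..191      */
}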
+ */ +static void lzma_rep_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint32_t tmp; + + if (!rc_bit(&s->rc, &s->lzma.is_rep0[s->lzma.state])) { + if (!rc_bit(&s->rc, &s->lzma.is_rep0_long[ + s->lzma.state][pos_state])) { + lzma_state_short_rep(&s->lzma.state); + s->lzma.len = 1; + return; + } + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep1[s->lzma.state])) { + tmp = s->lzma.rep1; + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep2[s->lzma.state])) { + tmp = s->lzma.rep2; + } else { + tmp = s->lzma.rep3; + s->lzma.rep3 = s->lzma.rep2; + } + + s->lzma.rep2 = s->lzma.rep1; + } + + s->lzma.rep1 = s->lzma.rep0; + s->lzma.rep0 = tmp; + } + + lzma_state_long_rep(&s->lzma.state); + lzma_len(s, &s->lzma.rep_len_dec, pos_state); +} + +/* LZMA decoder core */ +static bool lzma_main(struct xz_dec_lzma2 *s) +{ + uint32_t pos_state; + + /* + * If the dictionary was reached during the previous call, try to + * finish the possibly pending repeat in the dictionary. + */ + if (dict_has_space(&s->dict) && s->lzma.len > 0) + dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0); + + /* + * Decode more LZMA symbols. One iteration may consume up to + * LZMA_IN_REQUIRED - 1 bytes. + */ + while (dict_has_space(&s->dict) && !rc_limit_exceeded(&s->rc)) { + pos_state = s->dict.pos & s->lzma.pos_mask; + + if (!rc_bit(&s->rc, &s->lzma.is_match[ + s->lzma.state][pos_state])) { + lzma_literal(s); + } else { + if (rc_bit(&s->rc, &s->lzma.is_rep[s->lzma.state])) + lzma_rep_match(s, pos_state); + else + lzma_match(s, pos_state); + + if (!dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0)) + return false; + } + } + + /* + * Having the range decoder always normalized when we are outside + * this function makes it easier to correctly handle end of the chunk. + */ + rc_normalize(&s->rc); + + return true; +} + +/* + * Reset the LZMA decoder and range decoder state. Dictionary is nore reset + * here, because LZMA state may be reset without resetting the dictionary. + */ +static void lzma_reset(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + size_t i; + + s->lzma.state = STATE_LIT_LIT; + s->lzma.rep0 = 0; + s->lzma.rep1 = 0; + s->lzma.rep2 = 0; + s->lzma.rep3 = 0; + + /* + * All probabilities are initialized to the same value. This hack + * makes the code smaller by avoiding a separate loop for each + * probability array. + * + * This could be optimized so that only that part of literal + * probabilities that are actually required. In the common case + * we would write 12 KiB less. + */ + probs = s->lzma.is_match[0]; + for (i = 0; i < PROBS_TOTAL; ++i) + probs[i] = RC_BIT_MODEL_TOTAL / 2; + + rc_reset(&s->rc); +} + +/* + * Decode and validate LZMA properties (lc/lp/pb) and calculate the bit masks + * from the decoded lp and pb values. On success, the LZMA decoder state is + * reset and true is returned. 
+ */ +static bool lzma_props(struct xz_dec_lzma2 *s, uint8_t props) +{ + if (props > (4 * 5 + 4) * 9 + 8) + return false; + + s->lzma.pos_mask = 0; + while (props >= 9 * 5) { + props -= 9 * 5; + ++s->lzma.pos_mask; + } + + s->lzma.pos_mask = (1 << s->lzma.pos_mask) - 1; + + s->lzma.literal_pos_mask = 0; + while (props >= 9) { + props -= 9; + ++s->lzma.literal_pos_mask; + } + + s->lzma.lc = props; + + if (s->lzma.lc + s->lzma.literal_pos_mask > 4) + return false; + + s->lzma.literal_pos_mask = (1 << s->lzma.literal_pos_mask) - 1; + + lzma_reset(s); + + return true; +} + +/********* + * LZMA2 * + *********/ + +/* + * The LZMA decoder assumes that if the input limit (s->rc.in_limit) hasn't + * been exceeded, it is safe to read up to LZMA_IN_REQUIRED bytes. This + * wrapper function takes care of making the LZMA decoder's assumption safe. + * + * As long as there is plenty of input left to be decoded in the current LZMA + * chunk, we decode directly from the caller-supplied input buffer until + * there's LZMA_IN_REQUIRED bytes left. Those remaining bytes are copied into + * s->temp.buf, which (hopefully) gets filled on the next call to this + * function. We decode a few bytes from the temporary buffer so that we can + * continue decoding from the caller-supplied input buffer again. + */ +static bool lzma2_lzma(struct xz_dec_lzma2 *s, struct xz_buf *b) +{ + size_t in_avail; + uint32_t tmp; + + in_avail = b->in_size - b->in_pos; + if (s->temp.size > 0 || s->lzma2.compressed == 0) { + tmp = 2 * LZMA_IN_REQUIRED - s->temp.size; + if (tmp > s->lzma2.compressed - s->temp.size) + tmp = s->lzma2.compressed - s->temp.size; + if (tmp > in_avail) + tmp = in_avail; + + memcpy(s->temp.buf + s->temp.size, b->in + b->in_pos, tmp); + + if (s->temp.size + tmp == s->lzma2.compressed) { + memzero(s->temp.buf + s->temp.size + tmp, + sizeof(s->temp.buf) + - s->temp.size - tmp); + s->rc.in_limit = s->temp.size + tmp; + } else if (s->temp.size + tmp < LZMA_IN_REQUIRED) { + s->temp.size += tmp; + b->in_pos += tmp; + return true; + } else { + s->rc.in_limit = s->temp.size + tmp - LZMA_IN_REQUIRED; + } + + s->rc.in = s->temp.buf; + s->rc.in_pos = 0; + + if (!lzma_main(s) || s->rc.in_pos > s->temp.size + tmp) + return false; + + s->lzma2.compressed -= s->rc.in_pos; + + if (s->rc.in_pos < s->temp.size) { + s->temp.size -= s->rc.in_pos; + memmove(s->temp.buf, s->temp.buf + s->rc.in_pos, + s->temp.size); + return true; + } + + b->in_pos += s->rc.in_pos - s->temp.size; + s->temp.size = 0; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail >= LZMA_IN_REQUIRED) { + s->rc.in = b->in; + s->rc.in_pos = b->in_pos; + + if (in_avail >= s->lzma2.compressed + LZMA_IN_REQUIRED) + s->rc.in_limit = b->in_pos + s->lzma2.compressed; + else + s->rc.in_limit = b->in_size - LZMA_IN_REQUIRED; + + if (!lzma_main(s)) + return false; + + in_avail = s->rc.in_pos - b->in_pos; + if (in_avail > s->lzma2.compressed) + return false; + + s->lzma2.compressed -= in_avail; + b->in_pos = s->rc.in_pos; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail < LZMA_IN_REQUIRED) { + if (in_avail > s->lzma2.compressed) + in_avail = s->lzma2.compressed; + + memcpy(s->temp.buf, b->in + b->in_pos, in_avail); + s->temp.size = in_avail; + b->in_pos += in_avail; + } + + return true; +} + +/* + * Take care of the LZMA2 control layer, and forward the job of actual LZMA + * decoding or copying of uncompressed chunks to other functions. 
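Illustration (not part of the patch): lzma_props() above boiled down to a user-space calculation for one example byte. 0x5D is just a commonly seen value (it decodes to lc=3, lp=0, pb=2); nothing here is taken from a real stream.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t props = 0x5D;
	uint32_t pb = 0, lp = 0, lc;

	if (props > (4 * 5 + 4) * 9 + 8)
		return 1;                 /* invalid, as in lzma_props()  */

	while (props >= 9 * 5) {
		props -= 9 * 5;
		++pb;
	}
	while (props >= 9) {
		props -= 9;
		++lp;
	}
	lc = props;

	if (lc + lp > 4)
		return 1;                 /* not supported by this decoder */

	printf("lc=%u lp=%u pb=%u pos_mask=0x%X literal_pos_mask=0x%X\n",
	       (unsigned)lc, (unsigned)lp, (unsigned)pb,
	       (1u << pb) - 1, (1u << lp) - 1);
	return 0;                         /* lc=3 lp=0 pb=2 0x3 0x0       */
}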
+ */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b) +{ + uint32_t tmp; + + while (b->in_pos < b->in_size || s->lzma2.sequence == SEQ_LZMA_RUN) { + switch (s->lzma2.sequence) { + case SEQ_CONTROL: + /* + * LZMA2 control byte + * + * Exact values: + * 0x00 End marker + * 0x01 Dictionary reset followed by + * an uncompressed chunk + * 0x02 Uncompressed chunk (no dictionary reset) + * + * Highest three bits (s->control & 0xE0): + * 0xE0 Dictionary reset, new properties and state + * reset, followed by LZMA compressed chunk + * 0xC0 New properties and state reset, followed + * by LZMA compressed chunk (no dictionary + * reset) + * 0xA0 State reset using old properties, + * followed by LZMA compressed chunk (no + * dictionary reset) + * 0x80 LZMA chunk (no dictionary or state reset) + * + * For LZMA compressed chunks, the lowest five bits + * (s->control & 1F) are the highest bits of the + * uncompressed size (bits 16-20). + * + * A new LZMA2 stream must begin with a dictionary + * reset. The first LZMA chunk must set new + * properties and reset the LZMA state. + * + * Values that don't match anything described above + * are invalid and we return XZ_DATA_ERROR. + */ + tmp = b->in[b->in_pos++]; + + if (tmp >= 0xE0 || tmp == 0x01) { + s->lzma2.need_props = true; + s->lzma2.need_dict_reset = false; + dict_reset(&s->dict, b); + } else if (s->lzma2.need_dict_reset) { + return XZ_DATA_ERROR; + } + + if (tmp >= 0x80) { + s->lzma2.uncompressed = (tmp & 0x1F) << 16; + s->lzma2.sequence = SEQ_UNCOMPRESSED_1; + + if (tmp >= 0xC0) { + /* + * When there are new properties, + * state reset is done at + * SEQ_PROPERTIES. + */ + s->lzma2.need_props = false; + s->lzma2.next_sequence + = SEQ_PROPERTIES; + + } else if (s->lzma2.need_props) { + return XZ_DATA_ERROR; + + } else { + s->lzma2.next_sequence + = SEQ_LZMA_PREPARE; + if (tmp >= 0xA0) + lzma_reset(s); + } + } else { + if (tmp == 0x00) + return XZ_STREAM_END; + + if (tmp > 0x02) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_COMPRESSED_0; + s->lzma2.next_sequence = SEQ_COPY; + } + + break; + + case SEQ_UNCOMPRESSED_1: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_UNCOMPRESSED_2; + break; + + case SEQ_UNCOMPRESSED_2: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = SEQ_COMPRESSED_0; + break; + + case SEQ_COMPRESSED_0: + s->lzma2.compressed + = (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_COMPRESSED_1; + break; + + case SEQ_COMPRESSED_1: + s->lzma2.compressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = s->lzma2.next_sequence; + break; + + case SEQ_PROPERTIES: + if (!lzma_props(s, b->in[b->in_pos++])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + + case SEQ_LZMA_PREPARE: + if (s->lzma2.compressed < RC_INIT_BYTES) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + case SEQ_LZMA_RUN: + /* + * Set dictionary limit to indicate how much we want + * to be encoded at maximum. Decode new data into the + * dictionary. Flush the new data from dictionary to + * b->out. Check if we finished decoding this chunk. + * In case the dictionary got full but we didn't fill + * the output buffer yet, we may run this loop + * multiple times without changing s->lzma2.sequence. 
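Illustration (not part of the patch): the size fields handled by SEQ_UNCOMPRESSED_1/2 and SEQ_COMPRESSED_0/1 above, parsed from one made-up LZMA2 chunk header. The layout is: control byte (its low five bits are bits 16-20 of the uncompressed size), two more bytes of uncompressed size minus one, two bytes of compressed size minus one, and a props byte when the control byte is >= 0xC0.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 0xE0 = dictionary reset + new props + state reset */
	static const uint8_t hdr[] = { 0xE0, 0x00, 0x1F, 0x00, 0x2A, 0x5D };
	size_t pos = 0;
	uint8_t control = hdr[pos++];
	uint32_t uncompressed, compressed;

	if (control < 0x80) {
		/* 0x00 = end marker, 0x01/0x02 = uncompressed chunk */
		printf("not an LZMA chunk\n");
		return 0;
	}

	uncompressed = (uint32_t)(control & 0x1F) << 16;
	uncompressed += (uint32_t)hdr[pos++] << 8;
	uncompressed += (uint32_t)hdr[pos++] + 1;

	compressed = (uint32_t)hdr[pos++] << 8;
	compressed += (uint32_t)hdr[pos++] + 1;

	printf("uncompressed=%u compressed=%u%s\n",
	       (unsigned)uncompressed, (unsigned)compressed,
	       control >= 0xC0 ? " (props byte follows)" : "");
	return 0;                 /* uncompressed=32 compressed=43        */
}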
+ */ + dict_limit(&s->dict, min_t(size_t, + b->out_size - b->out_pos, + s->lzma2.uncompressed)); + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + + rc_reset(&s->rc); + s->lzma2.sequence = SEQ_CONTROL; + + } else if (b->out_pos == b->out_size + || (b->in_pos == b->in_size + && s->temp.size + < s->lzma2.compressed)) { + return XZ_OK; + } + + break; + + case SEQ_COPY: + dict_uncompressed(&s->dict, b, &s->lzma2.compressed); + if (s->lzma2.compressed > 0) + return XZ_OK; + + s->lzma2.sequence = SEQ_CONTROL; + break; + } + } + + return XZ_OK; +} + +XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max) +{ + struct xz_dec_lzma2 *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->dict.mode = mode; + s->dict.size_max = dict_max; + + if (DEC_IS_PREALLOC(mode)) { + s->dict.buf = vmalloc(dict_max); + if (s->dict.buf == NULL) { + kfree(s); + return NULL; + } + } else if (DEC_IS_DYNALLOC(mode)) { + s->dict.buf = NULL; + s->dict.allocated = 0; + } + + return s; +} + +XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) +{ + /* This limits dictionary size to 3 GiB to keep parsing simpler. */ + if (props > 39) + return XZ_OPTIONS_ERROR; + + s->dict.size = 2 + (props & 1); + s->dict.size <<= (props >> 1) + 11; + + if (DEC_IS_MULTI(s->dict.mode)) { + if (s->dict.size > s->dict.size_max) + return XZ_MEMLIMIT_ERROR; + + s->dict.end = s->dict.size; + + if (DEC_IS_DYNALLOC(s->dict.mode)) { + if (s->dict.allocated < s->dict.size) { + vfree(s->dict.buf); + s->dict.buf = vmalloc(s->dict.size); + if (s->dict.buf == NULL) { + s->dict.allocated = 0; + return XZ_MEM_ERROR; + } + } + } + } + + s->lzma.len = 0; + + s->lzma2.sequence = SEQ_CONTROL; + s->lzma2.need_dict_reset = true; + + s->temp.size = 0; + + return XZ_OK; +} + +XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) +{ + if (DEC_IS_MULTI(s->dict.mode)) + vfree(s->dict.buf); + + kfree(s); +} diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c new file mode 100644 index 000000000000..ac809b1e64f7 --- /dev/null +++ b/lib/xz/xz_dec_stream.c @@ -0,0 +1,821 @@ +/* + * .xz Stream decoder + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" +#include "xz_stream.h" + +/* Hash used to validate the Index field */ +struct xz_dec_hash { + vli_type unpadded; + vli_type uncompressed; + uint32_t crc32; +}; + +struct xz_dec { + /* Position in dec_main() */ + enum { + SEQ_STREAM_HEADER, + SEQ_BLOCK_START, + SEQ_BLOCK_HEADER, + SEQ_BLOCK_UNCOMPRESS, + SEQ_BLOCK_PADDING, + SEQ_BLOCK_CHECK, + SEQ_INDEX, + SEQ_INDEX_PADDING, + SEQ_INDEX_CRC32, + SEQ_STREAM_FOOTER + } sequence; + + /* Position in variable-length integers and Check fields */ + uint32_t pos; + + /* Variable-length integer decoded by dec_vli() */ + vli_type vli; + + /* Saved in_pos and out_pos */ + size_t in_start; + size_t out_start; + + /* CRC32 value in Block or Index */ + uint32_t crc32; + + /* Type of the integrity check calculated from uncompressed data */ + enum xz_check check_type; + + /* Operation mode */ + enum xz_mode mode; + + /* + * True if the next call to xz_dec_run() is allowed to return + * XZ_BUF_ERROR. 
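Illustration (not part of the patch): the dictionary-size encoding used by xz_dec_lzma2_reset() above, evaluated for a few example Filter Properties bytes. The values are arbitrary; anything above 39 is rejected, which is what caps the dictionary at 3 GiB.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	static const uint8_t props[] = { 0, 1, 22, 23, 39 };
	size_t i;

	for (i = 0; i < sizeof(props); ++i) {
		uint32_t size = 2 + (props[i] & 1);
		size <<= (props[i] >> 1) + 11;
		printf("props %2u -> dictionary %u bytes (%u KiB)\n",
		       (unsigned)props[i], (unsigned)size,
		       (unsigned)(size >> 10));
	}
	return 0;        /* 4 KiB, 6 KiB, 8 MiB, 12 MiB, 3 GiB            */
}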
+ */ + bool allow_buf_error; + + /* Information stored in Block Header */ + struct { + /* + * Value stored in the Compressed Size field, or + * VLI_UNKNOWN if Compressed Size is not present. + */ + vli_type compressed; + + /* + * Value stored in the Uncompressed Size field, or + * VLI_UNKNOWN if Uncompressed Size is not present. + */ + vli_type uncompressed; + + /* Size of the Block Header field */ + uint32_t size; + } block_header; + + /* Information collected when decoding Blocks */ + struct { + /* Observed compressed size of the current Block */ + vli_type compressed; + + /* Observed uncompressed size of the current Block */ + vli_type uncompressed; + + /* Number of Blocks decoded so far */ + vli_type count; + + /* + * Hash calculated from the Block sizes. This is used to + * validate the Index field. + */ + struct xz_dec_hash hash; + } block; + + /* Variables needed when verifying the Index field */ + struct { + /* Position in dec_index() */ + enum { + SEQ_INDEX_COUNT, + SEQ_INDEX_UNPADDED, + SEQ_INDEX_UNCOMPRESSED + } sequence; + + /* Size of the Index in bytes */ + vli_type size; + + /* Number of Records (matches block.count in valid files) */ + vli_type count; + + /* + * Hash calculated from the Records (matches block.hash in + * valid files). + */ + struct xz_dec_hash hash; + } index; + + /* + * Temporary buffer needed to hold Stream Header, Block Header, + * and Stream Footer. The Block Header is the biggest (1 KiB) + * so we reserve space according to that. buf[] has to be aligned + * to a multiple of four bytes; the size_t variables before it + * should guarantee this. + */ + struct { + size_t pos; + size_t size; + uint8_t buf[1024]; + } temp; + + struct xz_dec_lzma2 *lzma2; + +#ifdef XZ_DEC_BCJ + struct xz_dec_bcj *bcj; + bool bcj_active; +#endif +}; + +#ifdef XZ_DEC_ANY_CHECK +/* Sizes of the Check field with different Check IDs */ +static const uint8_t check_sizes[16] = { + 0, + 4, 4, 4, + 8, 8, 8, + 16, 16, 16, + 32, 32, 32, + 64, 64, 64 +}; +#endif + +/* + * Fill s->temp by copying data starting from b->in[b->in_pos]. Caller + * must have set s->temp.pos to indicate how much data we are supposed + * to copy into s->temp.buf. Return true once s->temp.pos has reached + * s->temp.size. + */ +static bool fill_temp(struct xz_dec *s, struct xz_buf *b) +{ + size_t copy_size = min_t(size_t, + b->in_size - b->in_pos, s->temp.size - s->temp.pos); + + memcpy(s->temp.buf + s->temp.pos, b->in + b->in_pos, copy_size); + b->in_pos += copy_size; + s->temp.pos += copy_size; + + if (s->temp.pos == s->temp.size) { + s->temp.pos = 0; + return true; + } + + return false; +} + +/* Decode a variable-length integer (little-endian base-128 encoding) */ +static enum xz_ret dec_vli(struct xz_dec *s, const uint8_t *in, + size_t *in_pos, size_t in_size) +{ + uint8_t byte; + + if (s->pos == 0) + s->vli = 0; + + while (*in_pos < in_size) { + byte = in[*in_pos]; + ++*in_pos; + + s->vli |= (vli_type)(byte & 0x7F) << s->pos; + + if ((byte & 0x80) == 0) { + /* Don't allow non-minimal encodings. */ + if (byte == 0 && s->pos != 0) + return XZ_DATA_ERROR; + + s->pos = 0; + return XZ_STREAM_END; + } + + s->pos += 7; + if (s->pos == 7 * VLI_BYTES_MAX) + return XZ_DATA_ERROR; + } + + return XZ_OK; +} + +/* + * Decode the Compressed Data field from a Block. Update and validate + * the observed compressed and uncompressed sizes of the Block so that + * they don't exceed the values possibly stored in the Block Header + * (validation assumes that no integer overflow occurs, since vli_type + * is normally uint64_t). 
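Illustration (not part of the patch): the variable-length integers handled by dec_vli() above are plain little-endian base-128 numbers. The three input bytes below are made up (they encode 624485); the sketch skips the non-minimal-encoding and VLI_BYTES_MAX checks that the real function performs.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	static const uint8_t in[] = { 0xE5, 0x8E, 0x26 };
	uint64_t vli = 0;
	uint32_t shift = 0;
	size_t i;

	for (i = 0; i < sizeof(in); ++i) {
		vli |= (uint64_t)(in[i] & 0x7F) << shift;
		if ((in[i] & 0x80) == 0)
			break;        /* last byte of this integer        */
		shift += 7;           /* dec_vli() keeps this in s->pos   */
	}

	printf("decoded 0x%llX (%llu)\n",
	       (unsigned long long)vli, (unsigned long long)vli);
	return 0;                     /* 0x98765 (624485)                 */
}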
Update the CRC32 if presence of the CRC32 + * field was indicated in Stream Header. + * + * Once the decoding is finished, validate that the observed sizes match + * the sizes possibly stored in the Block Header. Update the hash and + * Block count, which are later used to validate the Index field. + */ +static enum xz_ret dec_block(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + s->in_start = b->in_pos; + s->out_start = b->out_pos; + +#ifdef XZ_DEC_BCJ + if (s->bcj_active) + ret = xz_dec_bcj_run(s->bcj, s->lzma2, b); + else +#endif + ret = xz_dec_lzma2_run(s->lzma2, b); + + s->block.compressed += b->in_pos - s->in_start; + s->block.uncompressed += b->out_pos - s->out_start; + + /* + * There is no need to separately check for VLI_UNKNOWN, since + * the observed sizes are always smaller than VLI_UNKNOWN. + */ + if (s->block.compressed > s->block_header.compressed + || s->block.uncompressed + > s->block_header.uncompressed) + return XZ_DATA_ERROR; + + if (s->check_type == XZ_CHECK_CRC32) + s->crc32 = xz_crc32(b->out + s->out_start, + b->out_pos - s->out_start, s->crc32); + + if (ret == XZ_STREAM_END) { + if (s->block_header.compressed != VLI_UNKNOWN + && s->block_header.compressed + != s->block.compressed) + return XZ_DATA_ERROR; + + if (s->block_header.uncompressed != VLI_UNKNOWN + && s->block_header.uncompressed + != s->block.uncompressed) + return XZ_DATA_ERROR; + + s->block.hash.unpadded += s->block_header.size + + s->block.compressed; + +#ifdef XZ_DEC_ANY_CHECK + s->block.hash.unpadded += check_sizes[s->check_type]; +#else + if (s->check_type == XZ_CHECK_CRC32) + s->block.hash.unpadded += 4; +#endif + + s->block.hash.uncompressed += s->block.uncompressed; + s->block.hash.crc32 = xz_crc32( + (const uint8_t *)&s->block.hash, + sizeof(s->block.hash), s->block.hash.crc32); + + ++s->block.count; + } + + return ret; +} + +/* Update the Index size and the CRC32 value. */ +static void index_update(struct xz_dec *s, const struct xz_buf *b) +{ + size_t in_used = b->in_pos - s->in_start; + s->index.size += in_used; + s->crc32 = xz_crc32(b->in + s->in_start, in_used, s->crc32); +} + +/* + * Decode the Number of Records, Unpadded Size, and Uncompressed Size + * fields from the Index field. That is, Index Padding and CRC32 are not + * decoded by this function. + * + * This can return XZ_OK (more input needed), XZ_STREAM_END (everything + * successfully decoded), or XZ_DATA_ERROR (input is corrupt). + */ +static enum xz_ret dec_index(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + do { + ret = dec_vli(s, b->in, &b->in_pos, b->in_size); + if (ret != XZ_STREAM_END) { + index_update(s, b); + return ret; + } + + switch (s->index.sequence) { + case SEQ_INDEX_COUNT: + s->index.count = s->vli; + + /* + * Validate that the Number of Records field + * indicates the same number of Records as + * there were Blocks in the Stream. + */ + if (s->index.count != s->block.count) + return XZ_DATA_ERROR; + + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + + case SEQ_INDEX_UNPADDED: + s->index.hash.unpadded += s->vli; + s->index.sequence = SEQ_INDEX_UNCOMPRESSED; + break; + + case SEQ_INDEX_UNCOMPRESSED: + s->index.hash.uncompressed += s->vli; + s->index.hash.crc32 = xz_crc32( + (const uint8_t *)&s->index.hash, + sizeof(s->index.hash), + s->index.hash.crc32); + --s->index.count; + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + } + } while (s->index.count > 0); + + return XZ_STREAM_END; +} + +/* + * Validate that the next four input bytes match the value of s->crc32. 
+ * s->pos must be zero when starting to validate the first byte. + */ +static enum xz_ret crc32_validate(struct xz_dec *s, struct xz_buf *b) +{ + do { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (((s->crc32 >> s->pos) & 0xFF) != b->in[b->in_pos++]) + return XZ_DATA_ERROR; + + s->pos += 8; + + } while (s->pos < 32); + + s->crc32 = 0; + s->pos = 0; + + return XZ_STREAM_END; +} + +#ifdef XZ_DEC_ANY_CHECK +/* + * Skip over the Check field when the Check ID is not supported. + * Returns true once the whole Check field has been skipped over. + */ +static bool check_skip(struct xz_dec *s, struct xz_buf *b) +{ + while (s->pos < check_sizes[s->check_type]) { + if (b->in_pos == b->in_size) + return false; + + ++b->in_pos; + ++s->pos; + } + + s->pos = 0; + + return true; +} +#endif + +/* Decode the Stream Header field (the first 12 bytes of the .xz Stream). */ +static enum xz_ret dec_stream_header(struct xz_dec *s) +{ + if (!memeq(s->temp.buf, HEADER_MAGIC, HEADER_MAGIC_SIZE)) + return XZ_FORMAT_ERROR; + + if (xz_crc32(s->temp.buf + HEADER_MAGIC_SIZE, 2, 0) + != get_le32(s->temp.buf + HEADER_MAGIC_SIZE + 2)) + return XZ_DATA_ERROR; + + if (s->temp.buf[HEADER_MAGIC_SIZE] != 0) + return XZ_OPTIONS_ERROR; + + /* + * Of integrity checks, we support only none (Check ID = 0) and + * CRC32 (Check ID = 1). However, if XZ_DEC_ANY_CHECK is defined, + * we will accept other check types too, but then the check won't + * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. + */ + s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; + +#ifdef XZ_DEC_ANY_CHECK + if (s->check_type > XZ_CHECK_MAX) + return XZ_OPTIONS_ERROR; + + if (s->check_type > XZ_CHECK_CRC32) + return XZ_UNSUPPORTED_CHECK; +#else + if (s->check_type > XZ_CHECK_CRC32) + return XZ_OPTIONS_ERROR; +#endif + + return XZ_OK; +} + +/* Decode the Stream Footer field (the last 12 bytes of the .xz Stream) */ +static enum xz_ret dec_stream_footer(struct xz_dec *s) +{ + if (!memeq(s->temp.buf + 10, FOOTER_MAGIC, FOOTER_MAGIC_SIZE)) + return XZ_DATA_ERROR; + + if (xz_crc32(s->temp.buf + 4, 6, 0) != get_le32(s->temp.buf)) + return XZ_DATA_ERROR; + + /* + * Validate Backward Size. Note that we never added the size of the + * Index CRC32 field to s->index.size, thus we use s->index.size / 4 + * instead of s->index.size / 4 - 1. + */ + if ((s->index.size >> 2) != get_le32(s->temp.buf + 4)) + return XZ_DATA_ERROR; + + if (s->temp.buf[8] != 0 || s->temp.buf[9] != s->check_type) + return XZ_DATA_ERROR; + + /* + * Use XZ_STREAM_END instead of XZ_OK to be more convenient + * for the caller. + */ + return XZ_STREAM_END; +} + +/* Decode the Block Header and initialize the filter chain. */ +static enum xz_ret dec_block_header(struct xz_dec *s) +{ + enum xz_ret ret; + + /* + * Validate the CRC32. We know that the temp buffer is at least + * eight bytes so this is safe. + */ + s->temp.size -= 4; + if (xz_crc32(s->temp.buf, s->temp.size, 0) + != get_le32(s->temp.buf + s->temp.size)) + return XZ_DATA_ERROR; + + s->temp.pos = 2; + + /* + * Catch unsupported Block Flags. We support only one or two filters + * in the chain, so we catch that with the same test. 
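Illustration (not part of the patch): crc32_validate() above compares the running CRC32 against the stream one byte at a time, least significant byte first. The value below is made up; the point is only the byte order it implies.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t crc = 0x1A2B3C4D;    /* stands in for s->crc32           */
	uint32_t pos;

	/* The four bytes crc32_validate() expects to find in the stream: */
	for (pos = 0; pos < 32; pos += 8)
		printf("%02X ", (unsigned)((crc >> pos) & 0xFF));
	printf("\n");                 /* 4D 3C 2B 1A (LSB first)          */
	return 0;
}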
+ */ +#ifdef XZ_DEC_BCJ + if (s->temp.buf[1] & 0x3E) +#else + if (s->temp.buf[1] & 0x3F) +#endif + return XZ_OPTIONS_ERROR; + + /* Compressed Size */ + if (s->temp.buf[1] & 0x40) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.compressed = s->vli; + } else { + s->block_header.compressed = VLI_UNKNOWN; + } + + /* Uncompressed Size */ + if (s->temp.buf[1] & 0x80) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.uncompressed = s->vli; + } else { + s->block_header.uncompressed = VLI_UNKNOWN; + } + +#ifdef XZ_DEC_BCJ + /* If there are two filters, the first one must be a BCJ filter. */ + s->bcj_active = s->temp.buf[1] & 0x01; + if (s->bcj_active) { + if (s->temp.size - s->temp.pos < 2) + return XZ_OPTIONS_ERROR; + + ret = xz_dec_bcj_reset(s->bcj, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* + * We don't support custom start offset, + * so Size of Properties must be zero. + */ + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + } +#endif + + /* Valid Filter Flags always take at least two bytes. */ + if (s->temp.size - s->temp.pos < 2) + return XZ_DATA_ERROR; + + /* Filter ID = LZMA2 */ + if (s->temp.buf[s->temp.pos++] != 0x21) + return XZ_OPTIONS_ERROR; + + /* Size of Properties = 1-byte Filter Properties */ + if (s->temp.buf[s->temp.pos++] != 0x01) + return XZ_OPTIONS_ERROR; + + /* Filter Properties contains LZMA2 dictionary size. */ + if (s->temp.size - s->temp.pos < 1) + return XZ_DATA_ERROR; + + ret = xz_dec_lzma2_reset(s->lzma2, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* The rest must be Header Padding. */ + while (s->temp.pos < s->temp.size) + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + + s->temp.pos = 0; + s->block.compressed = 0; + s->block.uncompressed = 0; + + return XZ_OK; +} + +static enum xz_ret dec_main(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + /* + * Store the start position for the case when we are in the middle + * of the Index field. + */ + s->in_start = b->in_pos; + + while (true) { + switch (s->sequence) { + case SEQ_STREAM_HEADER: + /* + * Stream Header is copied to s->temp, and then + * decoded from there. This way if the caller + * gives us only little input at a time, we can + * still keep the Stream Header decoding code + * simple. Similar approach is used in many places + * in this file. + */ + if (!fill_temp(s, b)) + return XZ_OK; + + /* + * If dec_stream_header() returns + * XZ_UNSUPPORTED_CHECK, it is still possible + * to continue decoding if working in multi-call + * mode. Thus, update s->sequence before calling + * dec_stream_header(). + */ + s->sequence = SEQ_BLOCK_START; + + ret = dec_stream_header(s); + if (ret != XZ_OK) + return ret; + + case SEQ_BLOCK_START: + /* We need one byte of input to continue. */ + if (b->in_pos == b->in_size) + return XZ_OK; + + /* See if this is the beginning of the Index field. */ + if (b->in[b->in_pos] == 0) { + s->in_start = b->in_pos++; + s->sequence = SEQ_INDEX; + break; + } + + /* + * Calculate the size of the Block Header and + * prepare to decode it. 
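Illustration (not part of the patch): the Block Header layout that dec_block_header() above expects, for the minimal case of one LZMA2 filter and no optional size fields. The bytes are hand-built for the demo and the trailing CRC32 is a placeholder that is not verified here, so this is not a valid header as-is; size fields (flags 0x40/0x80) and an extra BCJ filter (flags 0x01) would add fields in front of the LZMA2 entry.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	static const uint8_t hdr[12] = {
		0x02,                   /* (0x02 + 1) * 4 = 12 header bytes */
		0x00,                   /* Block Flags: one filter, no sizes */
		0x21, 0x01, 0x16,       /* Filter ID LZMA2, one props byte  */
		0x00, 0x00, 0x00,       /* Header Padding                   */
		0x00, 0x00, 0x00, 0x00  /* CRC32 placeholder (not checked)  */
	};
	size_t pos = 2;

	if (hdr[1] & 0x3F) {            /* sketch handles only this case   */
		printf("unsupported Block Flags\n");
		return 1;
	}
	if (hdr[pos++] != 0x21 || hdr[pos++] != 0x01) {
		printf("not a single LZMA2 filter\n");
		return 1;
	}
	printf("LZMA2 dictionary properties byte: 0x%02X\n",
	       (unsigned)hdr[pos]);     /* 0x16                            */
	return 0;
}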
+ */ + s->block_header.size + = ((uint32_t)b->in[b->in_pos] + 1) * 4; + + s->temp.size = s->block_header.size; + s->temp.pos = 0; + s->sequence = SEQ_BLOCK_HEADER; + + case SEQ_BLOCK_HEADER: + if (!fill_temp(s, b)) + return XZ_OK; + + ret = dec_block_header(s); + if (ret != XZ_OK) + return ret; + + s->sequence = SEQ_BLOCK_UNCOMPRESS; + + case SEQ_BLOCK_UNCOMPRESS: + ret = dec_block(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_BLOCK_PADDING; + + case SEQ_BLOCK_PADDING: + /* + * Size of Compressed Data + Block Padding + * must be a multiple of four. We don't need + * s->block.compressed for anything else + * anymore, so we use it here to test the size + * of the Block Padding field. + */ + while (s->block.compressed & 3) { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + + ++s->block.compressed; + } + + s->sequence = SEQ_BLOCK_CHECK; + + case SEQ_BLOCK_CHECK: + if (s->check_type == XZ_CHECK_CRC32) { + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + } +#ifdef XZ_DEC_ANY_CHECK + else if (!check_skip(s, b)) { + return XZ_OK; + } +#endif + + s->sequence = SEQ_BLOCK_START; + break; + + case SEQ_INDEX: + ret = dec_index(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_INDEX_PADDING; + + case SEQ_INDEX_PADDING: + while ((s->index.size + (b->in_pos - s->in_start)) + & 3) { + if (b->in_pos == b->in_size) { + index_update(s, b); + return XZ_OK; + } + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + } + + /* Finish the CRC32 value and Index size. */ + index_update(s, b); + + /* Compare the hashes to validate the Index field. */ + if (!memeq(&s->block.hash, &s->index.hash, + sizeof(s->block.hash))) + return XZ_DATA_ERROR; + + s->sequence = SEQ_INDEX_CRC32; + + case SEQ_INDEX_CRC32: + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->temp.size = STREAM_HEADER_SIZE; + s->sequence = SEQ_STREAM_FOOTER; + + case SEQ_STREAM_FOOTER: + if (!fill_temp(s, b)) + return XZ_OK; + + return dec_stream_footer(s); + } + } + + /* Never reached */ +} + +/* + * xz_dec_run() is a wrapper for dec_main() to handle some special cases in + * multi-call and single-call decoding. + * + * In multi-call mode, we must return XZ_BUF_ERROR when it seems clear that we + * are not going to make any progress anymore. This is to prevent the caller + * from calling us infinitely when the input file is truncated or otherwise + * corrupt. Since zlib-style API allows that the caller fills the input buffer + * only when the decoder doesn't produce any new output, we have to be careful + * to avoid returning XZ_BUF_ERROR too easily: XZ_BUF_ERROR is returned only + * after the second consecutive call to xz_dec_run() that makes no progress. + * + * In single-call mode, if we couldn't decode everything and no error + * occurred, either the input is truncated or the output buffer is too small. + * Since we know that the last input byte never produces any output, we know + * that if all the input was consumed and decoding wasn't finished, the file + * must be corrupt. Otherwise the output buffer has to be too small or the + * file is corrupt in a way that decoding it produces too big output. + * + * If single-call decoding fails, we reset b->in_pos and b->out_pos back to + * their original values. 
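Illustration (not part of the patch): a minimal multi-call caller matching the contract described here, in the style of xz_dec_test.c further down. It assumes a kernel context with <linux/xz.h> and <linux/errno.h> available; example_decompress(), fill_input() and drain_output() are made-up names, and real-world error handling is omitted. A truncated stream ends the loop with XZ_BUF_ERROR after two calls that make no progress, as documented above.

static int example_decompress(size_t (*fill_input)(uint8_t *buf, size_t max),
			      void (*drain_output)(const uint8_t *buf, size_t n))
{
	uint8_t in[4096], out[4096];
	struct xz_buf b = {
		.in = in, .in_pos = 0, .in_size = 0,
		.out = out, .out_pos = 0, .out_size = sizeof(out)
	};
	struct xz_dec *s;
	enum xz_ret ret;

	s = xz_dec_init(XZ_PREALLOC, 1 << 20);    /* 1 MiB dict_max       */
	if (s == NULL)
		return -ENOMEM;

	do {
		if (b.in_pos == b.in_size) {
			/* Refill only after everything was consumed. */
			b.in_pos = 0;
			b.in_size = fill_input(in, sizeof(in));
		}

		ret = xz_dec_run(s, &b);

		drain_output(out, b.out_pos);
		b.out_pos = 0;
	} while (ret == XZ_OK);

	xz_dec_end(s);
	return ret == XZ_STREAM_END ? 0 : -EINVAL;
}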
This is because with some filter chains there won't + * be any valid uncompressed data in the output buffer unless the decoding + * actually succeeds (that's the price to pay of using the output buffer as + * the workspace). + */ +XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) +{ + size_t in_start; + size_t out_start; + enum xz_ret ret; + + if (DEC_IS_SINGLE(s->mode)) + xz_dec_reset(s); + + in_start = b->in_pos; + out_start = b->out_pos; + ret = dec_main(s, b); + + if (DEC_IS_SINGLE(s->mode)) { + if (ret == XZ_OK) + ret = b->in_pos == b->in_size + ? XZ_DATA_ERROR : XZ_BUF_ERROR; + + if (ret != XZ_STREAM_END) { + b->in_pos = in_start; + b->out_pos = out_start; + } + + } else if (ret == XZ_OK && in_start == b->in_pos + && out_start == b->out_pos) { + if (s->allow_buf_error) + ret = XZ_BUF_ERROR; + + s->allow_buf_error = true; + } else { + s->allow_buf_error = false; + } + + return ret; +} + +XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max) +{ + struct xz_dec *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->mode = mode; + +#ifdef XZ_DEC_BCJ + s->bcj = xz_dec_bcj_create(DEC_IS_SINGLE(mode)); + if (s->bcj == NULL) + goto error_bcj; +#endif + + s->lzma2 = xz_dec_lzma2_create(mode, dict_max); + if (s->lzma2 == NULL) + goto error_lzma2; + + xz_dec_reset(s); + return s; + +error_lzma2: +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +error_bcj: +#endif + kfree(s); + return NULL; +} + +XZ_EXTERN void xz_dec_reset(struct xz_dec *s) +{ + s->sequence = SEQ_STREAM_HEADER; + s->allow_buf_error = false; + s->pos = 0; + s->crc32 = 0; + memzero(&s->block, sizeof(s->block)); + memzero(&s->index, sizeof(s->index)); + s->temp.pos = 0; + s->temp.size = STREAM_HEADER_SIZE; +} + +XZ_EXTERN void xz_dec_end(struct xz_dec *s) +{ + if (s != NULL) { + xz_dec_lzma2_end(s->lzma2); +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +#endif + kfree(s); + } +} diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c new file mode 100644 index 000000000000..32eb3c03aede --- /dev/null +++ b/lib/xz/xz_dec_syms.c @@ -0,0 +1,26 @@ +/* + * XZ decoder module information + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include +#include + +EXPORT_SYMBOL(xz_dec_init); +EXPORT_SYMBOL(xz_dec_reset); +EXPORT_SYMBOL(xz_dec_run); +EXPORT_SYMBOL(xz_dec_end); + +MODULE_DESCRIPTION("XZ decompressor"); +MODULE_VERSION("1.0"); +MODULE_AUTHOR("Lasse Collin and Igor Pavlov"); + +/* + * This code is in the public domain, but in Linux it's simplest to just + * say it's GPL and consider the authors as the copyright holders. + */ +MODULE_LICENSE("GPL"); diff --git a/lib/xz/xz_dec_test.c b/lib/xz/xz_dec_test.c new file mode 100644 index 000000000000..da28a19d6c98 --- /dev/null +++ b/lib/xz/xz_dec_test.c @@ -0,0 +1,220 @@ +/* + * XZ decoder tester + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include +#include +#include +#include +#include +#include + +/* Maximum supported dictionary size */ +#define DICT_MAX (1 << 20) + +/* Device name to pass to register_chrdev(). */ +#define DEVICE_NAME "xz_dec_test" + +/* Dynamically allocated device major number */ +static int device_major; + +/* + * We reuse the same decoder state, and thus can decode only one + * file at a time. + */ +static bool device_is_open; + +/* XZ decoder state */ +static struct xz_dec *state; + +/* + * Return value of xz_dec_run(). 
We need to avoid calling xz_dec_run() after + * it has returned XZ_STREAM_END, so we make this static. + */ +static enum xz_ret ret; + +/* + * Input and output buffers. The input buffer is used as a temporary safe + * place for the data coming from the userspace. + */ +static uint8_t buffer_in[1024]; +static uint8_t buffer_out[1024]; + +/* + * Structure to pass the input and output buffers to the XZ decoder. + * A few of the fields are never modified so we initialize them here. + */ +static struct xz_buf buffers = { + .in = buffer_in, + .out = buffer_out, + .out_size = sizeof(buffer_out) +}; + +/* + * CRC32 of uncompressed data. This is used to give the user a simple way + * to check that the decoder produces correct output. + */ +static uint32_t crc; + +static int xz_dec_test_open(struct inode *i, struct file *f) +{ + if (device_is_open) + return -EBUSY; + + device_is_open = true; + + xz_dec_reset(state); + ret = XZ_OK; + crc = 0xFFFFFFFF; + + buffers.in_pos = 0; + buffers.in_size = 0; + buffers.out_pos = 0; + + printk(KERN_INFO DEVICE_NAME ": opened\n"); + return 0; +} + +static int xz_dec_test_release(struct inode *i, struct file *f) +{ + device_is_open = false; + + if (ret == XZ_OK) + printk(KERN_INFO DEVICE_NAME ": input was truncated\n"); + + printk(KERN_INFO DEVICE_NAME ": closed\n"); + return 0; +} + +/* + * Decode the data given to us from the userspace. CRC32 of the uncompressed + * data is calculated and is printed at the end of successful decoding. The + * uncompressed data isn't stored anywhere for further use. + * + * The .xz file must have exactly one Stream and no Stream Padding. The data + * after the first Stream is considered to be garbage. + */ +static ssize_t xz_dec_test_write(struct file *file, const char __user *buf, + size_t size, loff_t *pos) +{ + size_t remaining; + + if (ret != XZ_OK) { + if (size > 0) + printk(KERN_INFO DEVICE_NAME ": %zu bytes of " + "garbage at the end of the file\n", + size); + + return -ENOSPC; + } + + printk(KERN_INFO DEVICE_NAME ": decoding %zu bytes of input\n", + size); + + remaining = size; + while ((remaining > 0 || buffers.out_pos == buffers.out_size) + && ret == XZ_OK) { + if (buffers.in_pos == buffers.in_size) { + buffers.in_pos = 0; + buffers.in_size = min(remaining, sizeof(buffer_in)); + if (copy_from_user(buffer_in, buf, buffers.in_size)) + return -EFAULT; + + buf += buffers.in_size; + remaining -= buffers.in_size; + } + + buffers.out_pos = 0; + ret = xz_dec_run(state, &buffers); + crc = crc32(crc, buffer_out, buffers.out_pos); + } + + switch (ret) { + case XZ_OK: + printk(KERN_INFO DEVICE_NAME ": XZ_OK\n"); + return size; + + case XZ_STREAM_END: + printk(KERN_INFO DEVICE_NAME ": XZ_STREAM_END, " + "CRC32 = 0x%08X\n", ~crc); + return size - remaining - (buffers.in_size - buffers.in_pos); + + case XZ_MEMLIMIT_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_MEMLIMIT_ERROR\n"); + break; + + case XZ_FORMAT_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_FORMAT_ERROR\n"); + break; + + case XZ_OPTIONS_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_OPTIONS_ERROR\n"); + break; + + case XZ_DATA_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_DATA_ERROR\n"); + break; + + case XZ_BUF_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_BUF_ERROR\n"); + break; + + default: + printk(KERN_INFO DEVICE_NAME ": Bug detected!\n"); + break; + } + + return -EIO; +} + +/* Allocate the XZ decoder state and register the character device. 
*/ +static int __init xz_dec_test_init(void) +{ + static const struct file_operations fileops = { + .owner = THIS_MODULE, + .open = &xz_dec_test_open, + .release = &xz_dec_test_release, + .write = &xz_dec_test_write + }; + + state = xz_dec_init(XZ_PREALLOC, DICT_MAX); + if (state == NULL) + return -ENOMEM; + + device_major = register_chrdev(0, DEVICE_NAME, &fileops); + if (device_major < 0) { + xz_dec_end(state); + return device_major; + } + + printk(KERN_INFO DEVICE_NAME ": module loaded\n"); + printk(KERN_INFO DEVICE_NAME ": Create a device node with " + "'mknod " DEVICE_NAME " c %d 0' and write .xz files " + "to it.\n", device_major); + return 0; +} + +static void __exit xz_dec_test_exit(void) +{ + unregister_chrdev(device_major, DEVICE_NAME); + xz_dec_end(state); + printk(KERN_INFO DEVICE_NAME ": module unloaded\n"); +} + +module_init(xz_dec_test_init); +module_exit(xz_dec_test_exit); + +MODULE_DESCRIPTION("XZ decompressor tester"); +MODULE_VERSION("1.0"); +MODULE_AUTHOR("Lasse Collin "); + +/* + * This code is in the public domain, but in Linux it's simplest to just + * say it's GPL and consider the authors as the copyright holders. + */ +MODULE_LICENSE("GPL"); diff --git a/lib/xz/xz_lzma2.h b/lib/xz/xz_lzma2.h new file mode 100644 index 000000000000..071d67bee9f5 --- /dev/null +++ b/lib/xz/xz_lzma2.h @@ -0,0 +1,204 @@ +/* + * LZMA2 definitions + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_LZMA2_H +#define XZ_LZMA2_H + +/* Range coder constants */ +#define RC_SHIFT_BITS 8 +#define RC_TOP_BITS 24 +#define RC_TOP_VALUE (1 << RC_TOP_BITS) +#define RC_BIT_MODEL_TOTAL_BITS 11 +#define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS) +#define RC_MOVE_BITS 5 + +/* + * Maximum number of position states. A position state is the lowest pb + * number of bits of the current uncompressed offset. In some places there + * are different sets of probabilities for different position states. + */ +#define POS_STATES_MAX (1 << 4) + +/* + * This enum is used to track which LZMA symbols have occurred most recently + * and in which order. This information is used to predict the next symbol. + * + * Symbols: + * - Literal: One 8-bit byte + * - Match: Repeat a chunk of data at some distance + * - Long repeat: Multi-byte match at a recently seen distance + * - Short repeat: One-byte repeat at a recently seen distance + * + * The symbol names are in from STATE_oldest_older_previous. REP means + * either short or long repeated match, and NONLIT means any non-literal. + */ +enum lzma_state { + STATE_LIT_LIT, + STATE_MATCH_LIT_LIT, + STATE_REP_LIT_LIT, + STATE_SHORTREP_LIT_LIT, + STATE_MATCH_LIT, + STATE_REP_LIT, + STATE_SHORTREP_LIT, + STATE_LIT_MATCH, + STATE_LIT_LONGREP, + STATE_LIT_SHORTREP, + STATE_NONLIT_MATCH, + STATE_NONLIT_REP +}; + +/* Total number of states */ +#define STATES 12 + +/* The lowest 7 states indicate that the previous state was a literal. */ +#define LIT_STATES 7 + +/* Indicate that the latest symbol was a literal. */ +static inline void lzma_state_literal(enum lzma_state *state) +{ + if (*state <= STATE_SHORTREP_LIT_LIT) + *state = STATE_LIT_LIT; + else if (*state <= STATE_LIT_SHORTREP) + *state -= 3; + else + *state -= 6; +} + +/* Indicate that the latest symbol was a match. */ +static inline void lzma_state_match(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? 
STATE_LIT_MATCH : STATE_NONLIT_MATCH; +} + +/* Indicate that the latest state was a long repeated match. */ +static inline void lzma_state_long_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP; +} + +/* Indicate that the latest symbol was a short match. */ +static inline void lzma_state_short_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP; +} + +/* Test if the previous symbol was a literal. */ +static inline bool lzma_state_is_literal(enum lzma_state state) +{ + return state < LIT_STATES; +} + +/* Each literal coder is divided in three sections: + * - 0x001-0x0FF: Without match byte + * - 0x101-0x1FF: With match byte; match bit is 0 + * - 0x201-0x2FF: With match byte; match bit is 1 + * + * Match byte is used when the previous LZMA symbol was something else than + * a literal (that is, it was some kind of match). + */ +#define LITERAL_CODER_SIZE 0x300 + +/* Maximum number of literal coders */ +#define LITERAL_CODERS_MAX (1 << 4) + +/* Minimum length of a match is two bytes. */ +#define MATCH_LEN_MIN 2 + +/* Match length is encoded with 4, 5, or 10 bits. + * + * Length Bits + * 2-9 4 = Choice=0 + 3 bits + * 10-17 5 = Choice=1 + Choice2=0 + 3 bits + * 18-273 10 = Choice=1 + Choice2=1 + 8 bits + */ +#define LEN_LOW_BITS 3 +#define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS) +#define LEN_MID_BITS 3 +#define LEN_MID_SYMBOLS (1 << LEN_MID_BITS) +#define LEN_HIGH_BITS 8 +#define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS) +#define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS) + +/* + * Maximum length of a match is 273 which is a result of the encoding + * described above. + */ +#define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1) + +/* + * Different sets of probabilities are used for match distances that have + * very short match length: Lengths of 2, 3, and 4 bytes have a separate + * set of probabilities for each length. The matches with longer length + * use a shared set of probabilities. + */ +#define DIST_STATES 4 + +/* + * Get the index of the appropriate probability array for decoding + * the distance slot. + */ +static inline uint32_t lzma_get_dist_state(uint32_t len) +{ + return len < DIST_STATES + MATCH_LEN_MIN + ? len - MATCH_LEN_MIN : DIST_STATES - 1; +} + +/* + * The highest two bits of a 32-bit match distance are encoded using six bits. + * This six-bit value is called a distance slot. This way encoding a 32-bit + * value takes 6-36 bits, larger values taking more bits. + */ +#define DIST_SLOT_BITS 6 +#define DIST_SLOTS (1 << DIST_SLOT_BITS) + +/* Match distances up to 127 are fully encoded using probabilities. Since + * the highest two bits (distance slot) are always encoded using six bits, + * the distances 0-3 don't need any additional bits to encode, since the + * distance slot itself is the same as the actual distance. DIST_MODEL_START + * indicates the first distance slot where at least one additional bit is + * needed. + */ +#define DIST_MODEL_START 4 + +/* + * Match distances greater than 127 are encoded in three pieces: + * - distance slot: the highest two bits + * - direct bits: 2-26 bits below the highest two bits + * - alignment bits: four lowest bits + * + * Direct bits don't use any probabilities. + * + * The distance slot value of 14 is for distances 128-191. + */ +#define DIST_MODEL_END 14 + +/* Distance slots that indicate a distance <= 127. 
*/ +#define FULL_DISTANCES_BITS (DIST_MODEL_END / 2) +#define FULL_DISTANCES (1 << FULL_DISTANCES_BITS) + +/* + * For match distances greater than 127, only the highest two bits and the + * lowest four bits (alignment) is encoded using probabilities. + */ +#define ALIGN_BITS 4 +#define ALIGN_SIZE (1 << ALIGN_BITS) +#define ALIGN_MASK (ALIGN_SIZE - 1) + +/* Total number of all probability variables */ +#define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE) + +/* + * LZMA remembers the four most recent match distances. Reusing these + * distances tends to take less space than re-encoding the actual + * distance value. + */ +#define REPS 4 + +#endif diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h new file mode 100644 index 000000000000..a65633e06962 --- /dev/null +++ b/lib/xz/xz_private.h @@ -0,0 +1,156 @@ +/* + * Private includes and definitions + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_PRIVATE_H +#define XZ_PRIVATE_H + +#ifdef __KERNEL__ +# include +# include +# include + /* XZ_PREBOOT may be defined only via decompress_unxz.c. */ +# ifndef XZ_PREBOOT +# include +# include +# include +# ifdef CONFIG_XZ_DEC_X86 +# define XZ_DEC_X86 +# endif +# ifdef CONFIG_XZ_DEC_POWERPC +# define XZ_DEC_POWERPC +# endif +# ifdef CONFIG_XZ_DEC_IA64 +# define XZ_DEC_IA64 +# endif +# ifdef CONFIG_XZ_DEC_ARM +# define XZ_DEC_ARM +# endif +# ifdef CONFIG_XZ_DEC_ARMTHUMB +# define XZ_DEC_ARMTHUMB +# endif +# ifdef CONFIG_XZ_DEC_SPARC +# define XZ_DEC_SPARC +# endif +# define memeq(a, b, size) (memcmp(a, b, size) == 0) +# define memzero(buf, size) memset(buf, 0, size) +# endif +# define get_le32(p) le32_to_cpup((const uint32_t *)(p)) +#else + /* + * For userspace builds, use a separate header to define the required + * macros and functions. This makes it easier to adapt the code into + * different environments and avoids clutter in the Linux kernel tree. + */ +# include "xz_config.h" +#endif + +/* If no specific decoding mode is requested, enable support for all modes. */ +#if !defined(XZ_DEC_SINGLE) && !defined(XZ_DEC_PREALLOC) \ + && !defined(XZ_DEC_DYNALLOC) +# define XZ_DEC_SINGLE +# define XZ_DEC_PREALLOC +# define XZ_DEC_DYNALLOC +#endif + +/* + * The DEC_IS_foo(mode) macros are used in "if" statements. If only some + * of the supported modes are enabled, these macros will evaluate to true or + * false at compile time and thus allow the compiler to omit unneeded code. + */ +#ifdef XZ_DEC_SINGLE +# define DEC_IS_SINGLE(mode) ((mode) == XZ_SINGLE) +#else +# define DEC_IS_SINGLE(mode) (false) +#endif + +#ifdef XZ_DEC_PREALLOC +# define DEC_IS_PREALLOC(mode) ((mode) == XZ_PREALLOC) +#else +# define DEC_IS_PREALLOC(mode) (false) +#endif + +#ifdef XZ_DEC_DYNALLOC +# define DEC_IS_DYNALLOC(mode) ((mode) == XZ_DYNALLOC) +#else +# define DEC_IS_DYNALLOC(mode) (false) +#endif + +#if !defined(XZ_DEC_SINGLE) +# define DEC_IS_MULTI(mode) (true) +#elif defined(XZ_DEC_PREALLOC) || defined(XZ_DEC_DYNALLOC) +# define DEC_IS_MULTI(mode) ((mode) != XZ_SINGLE) +#else +# define DEC_IS_MULTI(mode) (false) +#endif + +/* + * If any of the BCJ filter decoders are wanted, define XZ_DEC_BCJ. + * XZ_DEC_BCJ is used to enable generic support for BCJ decoders. 
+ */ +#ifndef XZ_DEC_BCJ +# if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ + || defined(XZ_DEC_IA64) || defined(XZ_DEC_ARM) \ + || defined(XZ_DEC_ARM) || defined(XZ_DEC_ARMTHUMB) \ + || defined(XZ_DEC_SPARC) +# define XZ_DEC_BCJ +# endif +#endif + +/* + * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used + * before calling xz_dec_lzma2_run(). + */ +XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max); + +/* + * Decode the LZMA2 properties (one byte) and reset the decoder. Return + * XZ_OK on success, XZ_MEMLIMIT_ERROR if the preallocated dictionary is not + * big enough, and XZ_OPTIONS_ERROR if props indicates something that this + * decoder doesn't support. + */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, + uint8_t props); + +/* Decode raw LZMA2 stream from b->in to b->out. */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b); + +/* Free the memory allocated for the LZMA2 decoder. */ +XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s); + +#ifdef XZ_DEC_BCJ +/* + * Allocate memory for BCJ decoders. xz_dec_bcj_reset() must be used before + * calling xz_dec_bcj_run(). + */ +XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call); + +/* + * Decode the Filter ID of a BCJ filter. This implementation doesn't + * support custom start offsets, so no decoding of Filter Properties + * is needed. Returns XZ_OK if the given Filter ID is supported. + * Otherwise XZ_OPTIONS_ERROR is returned. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id); + +/* + * Decode raw BCJ + LZMA2 stream. This must be used only if there actually is + * a BCJ filter in the chain. If the chain has only LZMA2, xz_dec_lzma2_run() + * must be called directly. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b); + +/* Free the memory allocated for the BCJ filters. */ +#define xz_dec_bcj_end(s) kfree(s) +#endif + +#endif diff --git a/lib/xz/xz_stream.h b/lib/xz/xz_stream.h new file mode 100644 index 000000000000..66cb5a7055ec --- /dev/null +++ b/lib/xz/xz_stream.h @@ -0,0 +1,62 @@ +/* + * Definitions for handling the .xz file format + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_STREAM_H +#define XZ_STREAM_H + +#if defined(__KERNEL__) && !XZ_INTERNAL_CRC32 +# include +# undef crc32 +# define xz_crc32(buf, size, crc) \ + (~crc32_le(~(uint32_t)(crc), buf, size)) +#endif + +/* + * See the .xz file format specification at + * http://tukaani.org/xz/xz-file-format.txt + * to understand the container format. + */ + +#define STREAM_HEADER_SIZE 12 + +#define HEADER_MAGIC "\3757zXZ" +#define HEADER_MAGIC_SIZE 6 + +#define FOOTER_MAGIC "YZ" +#define FOOTER_MAGIC_SIZE 2 + +/* + * Variable-length integer can hold a 63-bit unsigned integer or a special + * value indicating that the value is unknown. + * + * Experimental: vli_type can be defined to uint32_t to save a few bytes + * in code size (no effect on speed). Doing so limits the uncompressed and + * compressed size of the file to less than 256 MiB and may also weaken + * error detection slightly. 
+ */ +typedef uint64_t vli_type; + +#define VLI_MAX ((vli_type)-1 / 2) +#define VLI_UNKNOWN ((vli_type)-1) + +/* Maximum encoded size of a VLI */ +#define VLI_BYTES_MAX (sizeof(vli_type) * 8 / 7) + +/* Integrity Check types */ +enum xz_check { + XZ_CHECK_NONE = 0, + XZ_CHECK_CRC32 = 1, + XZ_CHECK_CRC64 = 4, + XZ_CHECK_SHA256 = 10 +}; + +/* Maximum possible Check ID */ +#define XZ_CHECK_MAX 15 + +#endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 396da16aabf8..1c702ca8aac8 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -262,6 +262,34 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) +# XZ +# --------------------------------------------------------------------------- +# Use xzkern to compress the kernel image and xzmisc to compress other things. +# +# xzkern uses a big LZMA2 dictionary since it doesn't increase memory usage +# of the kernel decompressor. A BCJ filter is used if it is available for +# the target architecture. xzkern also appends uncompressed size of the data +# using size_append. The .xz format has the size information available at +# the end of the file too, but it's in more complex format and it's good to +# avoid changing the part of the boot code that reads the uncompressed size. +# Note that the bytes added by size_append will make the xz tool think that +# the file is corrupt. This is expected. +# +# xzmisc doesn't use size_append, so it can be used to create normal .xz +# files. xzmisc uses smaller LZMA2 dictionary than xzkern, because a very +# big dictionary would increase the memory usage too much in the multi-call +# decompression mode. A BCJ filter isn't used either. +quiet_cmd_xzkern = XZKERN $@ +cmd_xzkern = (cat $(filter-out FORCE,$^) | \ + sh $(srctree)/scripts/xz_wrap.sh && \ + $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) + +quiet_cmd_xzmisc = XZMISC $@ +cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ + xz --check=crc32 --lzma2=dict=1MiB) > $@ || \ + (rm -f $@ ; false) + # misc stuff # --------------------------------------------------------------------------- quote:=" diff --git a/scripts/xz_wrap.sh b/scripts/xz_wrap.sh new file mode 100644 index 000000000000..17a5798c29da --- /dev/null +++ b/scripts/xz_wrap.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# +# This is a wrapper for xz to compress the kernel image using appropriate +# compression options depending on the architecture. +# +# Author: Lasse Collin +# +# This file has been put into the public domain. +# You can do whatever you want with this file. +# + +BCJ= +LZMA2OPTS= + +case $ARCH in + x86|x86_64) BCJ=--x86 ;; + powerpc) BCJ=--powerpc ;; + ia64) BCJ=--ia64; LZMA2OPTS=pb=4 ;; + arm) BCJ=--arm ;; + sparc) BCJ=--sparc ;; +esac + +exec xz --check=crc32 $BCJ --lzma2=$LZMA2OPTS,dict=32MiB -- cgit v1.2.3-71-gd317 From 3ebe12439ba7fc62e1d6ecb569b7287771716ca1 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 12 Jan 2011 17:01:23 -0800 Subject: decompressors: add boot-time XZ support This implements the API defined in which is used for kernel, initramfs, and initrd decompression. This patch together with the first patch is enough for XZ-compressed initramfs and initrd; XZ-compressed kernel will need arch-specific changes. The buffering requirements described in decompress_unxz.c are stricter than with gzip, so the relevant changes should be done to the arch-specific code when adding support for XZ-compressed kernel. 
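To put the stricter buffering requirement into numbers, here is a small worked example. It is only an illustration: the 20 MiB image size is an assumed figure, and the two formulas are the ones derived in the decompress_unxz.c header comment further down (XZ) and quoted there from arch/x86/boot/compressed/misc.c (Deflate).

/*
 * Illustrative safety-margin comparison for an assumed 20 MiB
 * uncompressed image; not part of the patch itself.
 */
#include <stdio.h>

int main(void)
{
        unsigned long uncompressed_size = 20UL * 1024 * 1024;  /* assumed */
        unsigned long xz_margin      = 128 + (uncompressed_size >> 12) + 65536;
        unsigned long deflate_margin =  18 + (uncompressed_size >> 12) + 32768;

        /* Prints 70784 and 37906 for this input. */
        printf("xz: %lu bytes, deflate: %lu bytes\n", xz_margin, deflate_margin);
        return 0;
}

For this example the XZ in-place margin is nearly twice the Deflate one, which is the practical meaning of "stricter buffering requirements" for the arch-specific boot code.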
Similarly, the heap size in arch-specific pre-boot code may need to be increased (30 KiB is enough). The XZ decompressor needs memmove(), memeq() (memcmp() == 0), and memzero() (memset(ptr, 0, size)), which aren't available in all arch-specific pre-boot environments. I'm including simple versions in decompress_unxz.c, but a cleaner solution would naturally be nicer. Signed-off-by: Lasse Collin Cc: "H. Peter Anvin" Cc: Alain Knaff Cc: Albin Tonnerre Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/unxz.h | 19 ++ init/Kconfig | 20 +- lib/Kconfig | 4 + lib/Makefile | 1 + lib/decompress.c | 5 + lib/decompress_unxz.c | 397 ++++++++++++++++++++++++++++++++++++++++ scripts/gen_initramfs_list.sh | 2 + usr/Kconfig | 18 ++ usr/Makefile | 5 +- 9 files changed, 469 insertions(+), 2 deletions(-) create mode 100644 include/linux/decompress/unxz.h create mode 100644 lib/decompress_unxz.c (limited to 'include/linux') diff --git a/include/linux/decompress/unxz.h b/include/linux/decompress/unxz.h new file mode 100644 index 000000000000..41728fc6c8a1 --- /dev/null +++ b/include/linux/decompress/unxz.h @@ -0,0 +1,19 @@ +/* + * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef DECOMPRESS_UNXZ_H +#define DECOMPRESS_UNXZ_H + +int unxz(unsigned char *in, int in_size, + int (*fill)(void *dest, unsigned int size), + int (*flush)(void *src, unsigned int size), + unsigned char *out, int *in_used, + void (*error)(char *x)); + +#endif diff --git a/init/Kconfig b/init/Kconfig index 8dfd094e6875..ea176e8edbdd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -130,13 +130,16 @@ config HAVE_KERNEL_BZIP2 config HAVE_KERNEL_LZMA bool +config HAVE_KERNEL_XZ + bool + config HAVE_KERNEL_LZO bool choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_LZO + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -181,6 +184,21 @@ config KERNEL_LZMA two. Compression is slowest. The kernel size is about 33% smaller with LZMA in comparison to gzip. +config KERNEL_XZ + bool "XZ" + depends on HAVE_KERNEL_XZ + help + XZ uses the LZMA2 algorithm and instruction set specific + BCJ filters which can improve compression ratio of executable + code. The size of the kernel is about 30% smaller with XZ in + comparison to gzip. On architectures for which there is a BCJ + filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ + will create a few percent smaller kernel than plain LZMA. + + The speed is about the same as with LZMA: The decompression + speed of XZ is better than that of bzip2 but worse than gzip + and LZO. Compression is slow. 
+ config KERNEL_LZO bool "LZO" depends on HAVE_KERNEL_LZO diff --git a/lib/Kconfig b/lib/Kconfig index 2b8f8540d670..0ee67e08ad3e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -122,6 +122,10 @@ config DECOMPRESS_BZIP2 config DECOMPRESS_LZMA tristate +config DECOMPRESS_XZ + select XZ_DEC + tristate + config DECOMPRESS_LZO select LZO_DECOMPRESS tristate diff --git a/lib/Makefile b/lib/Makefile index 4df2d0297721..cbb774f7d41d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -75,6 +75,7 @@ obj-$(CONFIG_RAID6_PQ) += raid6/ lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o +lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o diff --git a/lib/decompress.c b/lib/decompress.c index a7606815541f..3d766b7f60ab 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -23,6 +24,9 @@ #ifndef CONFIG_DECOMPRESS_LZMA # define unlzma NULL #endif +#ifndef CONFIG_DECOMPRESS_XZ +# define unxz NULL +#endif #ifndef CONFIG_DECOMPRESS_LZO # define unlzo NULL #endif @@ -36,6 +40,7 @@ static const struct compress_format { { {037, 0236}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, { {0x5d, 0x00}, "lzma", unlzma }, + { {0xfd, 0x37}, "xz", unxz }, { {0x89, 0x4c}, "lzo", unlzo }, { {0, 0}, NULL, NULL } }; diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c new file mode 100644 index 000000000000..cecd23df2b9a --- /dev/null +++ b/lib/decompress_unxz.c @@ -0,0 +1,397 @@ +/* + * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +/* + * Important notes about in-place decompression + * + * At least on x86, the kernel is decompressed in place: the compressed data + * is placed to the end of the output buffer, and the decompressor overwrites + * most of the compressed data. There must be enough safety margin to + * guarantee that the write position is always behind the read position. + * + * The safety margin for XZ with LZMA2 or BCJ+LZMA2 is calculated below. + * Note that the margin with XZ is bigger than with Deflate (gzip)! + * + * The worst case for in-place decompression is that the beginning of + * the file is compressed extremely well, and the rest of the file is + * uncompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding uncompressible data. + * + * The structure of the .xz file in case of a compresed kernel is as follows. + * Sizes (as bytes) of the fields are in parenthesis. + * + * Stream Header (12) + * Block Header: + * Block Header (8-12) + * Compressed Data (N) + * Block Padding (0-3) + * CRC32 (4) + * Index (8-20) + * Stream Footer (12) + * + * Normally there is exactly one Block, but let's assume that there are + * 2-4 Blocks just in case. Because Stream Header and also Block Header + * of the first Block don't make the decompressor produce any uncompressed + * data, we can ignore them from our calculations. Block Headers of possible + * additional Blocks have to be taken into account still. With these + * assumptions, it is safe to assume that the total header overhead is + * less than 128 bytes. + * + * Compressed Data contains LZMA2 or BCJ+LZMA2 encoded data. 
Since BCJ + * doesn't change the size of the data, it is enough to calculate the + * safety margin for LZMA2. + * + * LZMA2 stores the data in chunks. Each chunk has a header whose size is + * a maximum of 6 bytes, but to get round 2^n numbers, let's assume that + * the maximum chunk header size is 8 bytes. After the chunk header, there + * may be up to 64 KiB of actual payload in the chunk. Often the payload is + * quite a bit smaller though; to be safe, let's assume that an average + * chunk has only 32 KiB of payload. + * + * The maximum uncompressed size of the payload is 2 MiB. The minimum + * uncompressed size of the payload is in practice never less than the + * payload size itself. The LZMA2 format would allow uncompressed size + * to be less than the payload size, but no sane compressor creates such + * files. LZMA2 supports storing uncompressible data in uncompressed form, + * so there's never a need to create payloads whose uncompressed size is + * smaller than the compressed size. + * + * The assumption, that the uncompressed size of the payload is never + * smaller than the payload itself, is valid only when talking about + * the payload as a whole. It is possible that the payload has parts where + * the decompressor consumes more input than it produces output. Calculating + * the worst case for this would be tricky. Instead of trying to do that, + * let's simply make sure that the decompressor never overwrites any bytes + * of the payload which it is currently reading. + * + * Now we have enough information to calculate the safety margin. We need + * - 128 bytes for the .xz file format headers; + * - 8 bytes per every 32 KiB of uncompressed size (one LZMA2 chunk header + * per chunk, each chunk having average payload size of 32 KiB); and + * - 64 KiB (biggest possible LZMA2 chunk payload size) to make sure that + * the decompressor never overwrites anything from the LZMA2 chunk + * payload it is currently reading. + * + * We get the following formula: + * + * safety_margin = 128 + uncompressed_size * 8 / 32768 + 65536 + * = 128 + (uncompressed_size >> 12) + 65536 + * + * For comparision, according to arch/x86/boot/compressed/misc.c, the + * equivalent formula for Deflate is this: + * + * safety_margin = 18 + (uncompressed_size >> 12) + 32768 + * + * Thus, when updating Deflate-only in-place kernel decompressor to + * support XZ, the fixed overhead has to be increased from 18+32768 bytes + * to 128+65536 bytes. + */ + +/* + * STATIC is defined to "static" if we are being built for kernel + * decompression (pre-boot code). will define + * STATIC to empty if it wasn't already defined. Since we will need to + * know later if we are being used for kernel decompression, we define + * XZ_PREBOOT here. + */ +#ifdef STATIC +# define XZ_PREBOOT +#endif +#ifdef __KERNEL__ +# include +#endif +#define XZ_EXTERN STATIC + +#ifndef XZ_PREBOOT +# include +# include +#else +/* + * Use the internal CRC32 code instead of kernel's CRC32 module, which + * is not available in early phase of booting. + */ +#define XZ_INTERNAL_CRC32 1 + +/* + * For boot time use, we enable only the BCJ filter of the current + * architecture or none if no BCJ filter is available for the architecture. 
+ */ +#ifdef CONFIG_X86 +# define XZ_DEC_X86 +#endif +#ifdef CONFIG_PPC +# define XZ_DEC_POWERPC +#endif +#ifdef CONFIG_ARM +# define XZ_DEC_ARM +#endif +#ifdef CONFIG_IA64 +# define XZ_DEC_IA64 +#endif +#ifdef CONFIG_SPARC +# define XZ_DEC_SPARC +#endif + +/* + * This will get the basic headers so that memeq() and others + * can be defined. + */ +#include "xz/xz_private.h" + +/* + * Replace the normal allocation functions with the versions from + * . vfree() needs to support vfree(NULL) + * when XZ_DYNALLOC is used, but the pre-boot free() doesn't support it. + * Workaround it here because the other decompressors don't need it. + */ +#undef kmalloc +#undef kfree +#undef vmalloc +#undef vfree +#define kmalloc(size, flags) malloc(size) +#define kfree(ptr) free(ptr) +#define vmalloc(size) malloc(size) +#define vfree(ptr) do { if (ptr != NULL) free(ptr); } while (0) + +/* + * FIXME: Not all basic memory functions are provided in architecture-specific + * files (yet). We define our own versions here for now, but this should be + * only a temporary solution. + * + * memeq and memzero are not used much and any remotely sane implementation + * is fast enough. memcpy/memmove speed matters in multi-call mode, but + * the kernel image is decompressed in single-call mode, in which only + * memcpy speed can matter and only if there is a lot of uncompressible data + * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the + * functions below should just be kept small; it's probably not worth + * optimizing for speed. + */ + +#ifndef memeq +static bool memeq(const void *a, const void *b, size_t size) +{ + const uint8_t *x = a; + const uint8_t *y = b; + size_t i; + + for (i = 0; i < size; ++i) + if (x[i] != y[i]) + return false; + + return true; +} +#endif + +#ifndef memzero +static void memzero(void *buf, size_t size) +{ + uint8_t *b = buf; + uint8_t *e = b + size; + + while (b != e) + *b++ = '\0'; +} +#endif + +#ifndef memmove +/* Not static to avoid a conflict with the prototype in the Linux headers. */ +void *memmove(void *dest, const void *src, size_t size) +{ + uint8_t *d = dest; + const uint8_t *s = src; + size_t i; + + if (d < s) { + for (i = 0; i < size; ++i) + d[i] = s[i]; + } else if (d > s) { + i = size; + while (i-- > 0) + d[i] = s[i]; + } + + return dest; +} +#endif + +/* + * Since we need memmove anyway, would use it as memcpy too. + * Commented out for now to avoid breaking things. + */ +/* +#ifndef memcpy +# define memcpy memmove +#endif +*/ + +#include "xz/xz_crc32.c" +#include "xz/xz_dec_stream.c" +#include "xz/xz_dec_lzma2.c" +#include "xz/xz_dec_bcj.c" + +#endif /* XZ_PREBOOT */ + +/* Size of the input and output buffers in multi-call mode */ +#define XZ_IOBUF_SIZE 4096 + +/* + * This function implements the API defined in . + * + * This wrapper will automatically choose single-call or multi-call mode + * of the native XZ decoder API. The single-call mode can be used only when + * both input and output buffers are available as a single chunk, i.e. when + * fill() and flush() won't be used. 
+ */ +STATIC int INIT unxz(unsigned char *in, int in_size, + int (*fill)(void *dest, unsigned int size), + int (*flush)(void *src, unsigned int size), + unsigned char *out, int *in_used, + void (*error)(char *x)) +{ + struct xz_buf b; + struct xz_dec *s; + enum xz_ret ret; + bool must_free_in = false; + +#if XZ_INTERNAL_CRC32 + xz_crc32_init(); +#endif + + if (in_used != NULL) + *in_used = 0; + + if (fill == NULL && flush == NULL) + s = xz_dec_init(XZ_SINGLE, 0); + else + s = xz_dec_init(XZ_DYNALLOC, (uint32_t)-1); + + if (s == NULL) + goto error_alloc_state; + + if (flush == NULL) { + b.out = out; + b.out_size = (size_t)-1; + } else { + b.out_size = XZ_IOBUF_SIZE; + b.out = malloc(XZ_IOBUF_SIZE); + if (b.out == NULL) + goto error_alloc_out; + } + + if (in == NULL) { + must_free_in = true; + in = malloc(XZ_IOBUF_SIZE); + if (in == NULL) + goto error_alloc_in; + } + + b.in = in; + b.in_pos = 0; + b.in_size = in_size; + b.out_pos = 0; + + if (fill == NULL && flush == NULL) { + ret = xz_dec_run(s, &b); + } else { + do { + if (b.in_pos == b.in_size && fill != NULL) { + if (in_used != NULL) + *in_used += b.in_pos; + + b.in_pos = 0; + + in_size = fill(in, XZ_IOBUF_SIZE); + if (in_size < 0) { + /* + * This isn't an optimal error code + * but it probably isn't worth making + * a new one either. + */ + ret = XZ_BUF_ERROR; + break; + } + + b.in_size = in_size; + } + + ret = xz_dec_run(s, &b); + + if (flush != NULL && (b.out_pos == b.out_size + || (ret != XZ_OK && b.out_pos > 0))) { + /* + * Setting ret here may hide an error + * returned by xz_dec_run(), but probably + * it's not too bad. + */ + if (flush(b.out, b.out_pos) != (int)b.out_pos) + ret = XZ_BUF_ERROR; + + b.out_pos = 0; + } + } while (ret == XZ_OK); + + if (must_free_in) + free(in); + + if (flush != NULL) + free(b.out); + } + + if (in_used != NULL) + *in_used += b.in_pos; + + xz_dec_end(s); + + switch (ret) { + case XZ_STREAM_END: + return 0; + + case XZ_MEM_ERROR: + /* This can occur only in multi-call mode. */ + error("XZ decompressor ran out of memory"); + break; + + case XZ_FORMAT_ERROR: + error("Input is not in the XZ format (wrong magic bytes)"); + break; + + case XZ_OPTIONS_ERROR: + error("Input was encoded with settings that are not " + "supported by this XZ decoder"); + break; + + case XZ_DATA_ERROR: + case XZ_BUF_ERROR: + error("XZ-compressed data is corrupt"); + break; + + default: + error("Bug in the XZ decompressor"); + break; + } + + return -1; + +error_alloc_in: + if (flush != NULL) + free(b.out); + +error_alloc_out: + xz_dec_end(s); + +error_alloc_state: + error("XZ decompressor ran out of memory"); + return -1; +} + +/* + * This macro is used by architecture-specific files to decompress + * the kernel image. 
+ */ +#define decompress unxz diff --git a/scripts/gen_initramfs_list.sh b/scripts/gen_initramfs_list.sh index 5958fffb2114..55caecdad995 100644 --- a/scripts/gen_initramfs_list.sh +++ b/scripts/gen_initramfs_list.sh @@ -243,6 +243,8 @@ case "$arg" in echo "$output_file" | grep -q "\.gz$" && compr="gzip -9 -f" echo "$output_file" | grep -q "\.bz2$" && compr="bzip2 -9 -f" echo "$output_file" | grep -q "\.lzma$" && compr="lzma -9 -f" + echo "$output_file" | grep -q "\.xz$" && \ + compr="xz --check=crc32 --lzma2=dict=1MiB" echo "$output_file" | grep -q "\.lzo$" && compr="lzop -9 -f" echo "$output_file" | grep -q "\.cpio$" && compr="cat" shift diff --git a/usr/Kconfig b/usr/Kconfig index c2c7fe2f717d..4780deac5974 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -72,6 +72,15 @@ config RD_LZMA Support loading of a LZMA encoded initial ramdisk or cpio buffer If unsure, say N. +config RD_XZ + bool "Support initial ramdisks compressed using XZ" if EMBEDDED + default !EMBEDDED + depends on BLK_DEV_INITRD + select DECOMPRESS_XZ + help + Support loading of a XZ encoded initial ramdisk or cpio buffer. + If unsure, say N. + config RD_LZO bool "Support initial ramdisks compressed using LZO" if EMBEDDED default !EMBEDDED @@ -139,6 +148,15 @@ config INITRAMFS_COMPRESSION_LZMA three. Compression is slowest. The initramfs size is about 33% smaller with LZMA in comparison to gzip. +config INITRAMFS_COMPRESSION_XZ + bool "XZ" + depends on RD_XZ + help + XZ uses the LZMA2 algorithm. The initramfs size is about 30% + smaller with XZ in comparison to gzip. Decompression speed + is better than that of bzip2 but worse than gzip and LZO. + Compression is slow. + config INITRAMFS_COMPRESSION_LZO bool "LZO" depends on RD_LZO diff --git a/usr/Makefile b/usr/Makefile index 6faa444b7087..029ffe6cd0d8 100644 --- a/usr/Makefile +++ b/usr/Makefile @@ -15,6 +15,9 @@ suffix_$(CONFIG_INITRAMFS_COMPRESSION_BZIP2) = .bz2 # Lzma suffix_$(CONFIG_INITRAMFS_COMPRESSION_LZMA) = .lzma +# XZ +suffix_$(CONFIG_INITRAMFS_COMPRESSION_XZ) = .xz + # Lzo suffix_$(CONFIG_INITRAMFS_COMPRESSION_LZO) = .lzo @@ -50,7 +53,7 @@ endif quiet_cmd_initfs = GEN $@ cmd_initfs = $(initramfs) -o $@ $(ramfs-args) $(ramfs-input) -targets := initramfs_data.cpio.gz initramfs_data.cpio.bz2 initramfs_data.cpio.lzma initramfs_data.cpio.lzo initramfs_data.cpio +targets := initramfs_data.cpio.gz initramfs_data.cpio.bz2 initramfs_data.cpio.lzma initramfs_data.cpio.xz initramfs_data.cpio.lzo initramfs_data.cpio # do not try to update files included in initramfs $(deps_initramfs): ; -- cgit v1.2.3-71-gd317 From 5cb2fad28fd8f95e911bed8c9342435a5b8e67de Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 12 Jan 2011 17:01:26 -0800 Subject: decompressors: remove unused constant from inflate.h Signed-off-by: Lasse Collin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/inflate.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/decompress/inflate.h b/include/linux/decompress/inflate.h index f9b06ccc3e5c..8c0aef1ba5f5 100644 --- a/include/linux/decompress/inflate.h +++ b/include/linux/decompress/inflate.h @@ -1,9 +1,6 @@ #ifndef INFLATE_H #define INFLATE_H -/* Other housekeeping constants */ -#define INBUFSIZ 4096 - int gunzip(unsigned char *inbuf, int len, int(*fill)(void*, unsigned int), int(*flush)(void*, unsigned int), -- cgit v1.2.3-71-gd317 From 84c89557a302e18414a011cc52b1abd034860743 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Thu, 13 Jan 2011 19:59:47 +0000 Subject: dm ioctl: 
allow rename to fill empty uuid Allow the uuid of a mapped device to be set after device creation. Previously the uuid (which is optional) could only be set by DM_DEV_CREATE. If no uuid was supplied it could not be set later. Sometimes it's necessary to create the device before the uuid is known, and in such cases the uuid must be filled in after the creation. This patch extends DM_DEV_RENAME to accept a uuid accompanied by a new flag DM_UUID_FLAG. This can only be done once and if no uuid was previously supplied. It cannot be used to change an existing uuid. DM_VERSION_MINOR is also bumped to 19 to indicate this interface extension is available. Signed-off-by: Peter Jones Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 103 +++++++++++++++++++++++++++++++++++------------ include/linux/dm-ioctl.h | 12 ++++-- 2 files changed, 87 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4b54618b4159..f3481d922206 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -295,19 +295,55 @@ retry: DMWARN("remove_all left %d open device(s)", dev_skipped); } +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; +} + static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, const char *new) { - char *new_name, *old_name; + char *new_data, *old_name = NULL; struct hash_cell *hc; struct dm_table *table; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; /* * duplicate new. */ - new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Is new free ? */ - hc = __get_name_cell(new); + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + if (hc) { - DMWARN("asked to rename to an already-existing name %s -> %s", + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", param->name, new); dm_put(hc->md); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-EBUSY); } @@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, */ hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non-existent device %s -> %s", - param->name, new); + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-ENXIO); } /* - * rename and move the name cell. + * Does this device already have a uuid? 
*/ - list_del(&hc->name_list); - old_name = hc->name; - mutex_lock(&dm_hash_cells_mutex); - hc->name = new_name; - mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); /* * Wake up any dm event waiters. @@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; - char *new_name = (char *) param + param->data_start; + char *new_data = (char *) param + param->data_start; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size) || - strlen(new_name) > DM_NAME_LEN - 1) { - DMWARN("Invalid new logical volume name supplied."); + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || + strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } - r = check_name(new_name); - if (r) - return r; + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } - md = dm_hash_rename(param, new_name); + md = dm_hash_rename(param, new_data); if (IS_ERR(md)) return PTR_ERR(md); diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 49eab360d5d4..e453e1174ae1 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -44,7 +44,7 @@ * Remove a device, destroy any tables. * * DM_DEV_RENAME: - * Rename a device. + * Rename a device or set its uuid if none was previously supplied. * * DM_SUSPEND: * This performs both suspend and resume, depending which flag is @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 18 +#define DM_VERSION_MINOR 19 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" +#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -322,4 +322,10 @@ enum { */ #define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ -- cgit v1.2.3-71-gd317 From 86a54a4802df10d23ccd655e2083e812fe990243 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:52 +0000 Subject: dm log userspace: add version number to comms This patch adds a 'version' field to the 'dm_ulog_request' structure. The 'version' field is taken from a portion of the unused 'padding' field in the 'dm_ulog_request' structure. This was done to avoid changing the size of the structure and possibly disrupting backwards compatibility. The version number will help notify user-space daemons when a change has been made to the kernel/userspace log API. 
Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 6 ++++-- drivers/md/dm-log-userspace-transfer.c | 1 + include/linux/dm-log-userspace.h | 13 ++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 31e1687e7bf6..aa2e0c374ab3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -12,6 +12,8 @@ #include "dm-log-userspace-transfer.h" +#define DM_LOG_USERSPACE_VSN "1.1.0" + struct flush_entry { int type; region_t region; @@ -765,7 +767,7 @@ static int __init userspace_dirty_log_init(void) return r; } - DMINFO("version 1.0.0 loaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); return 0; } @@ -775,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) dm_ulog_tfr_exit(); mempool_destroy(flush_entry_pool); - DMINFO("version 1.0.0 unloaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf8a9f5..049eaf12aaab 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -198,6 +198,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 0c3c3a2110c4..eeace7d3ff15 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -370,6 +370,16 @@ #define DM_ULOG_REQUEST_TYPE(request_type) \ (DM_ULOG_REQUEST_MASK & (request_type)) +/* + * DM_ULOG_REQUEST_VERSION is incremented when there is a + * change to the way information is passed between kernel + * and userspace. This could be a structure change of + * dm_ulog_request or a change in the way requests are + * issued/handled. Changes are outlined here: + * version 1: Initial implementation + */ +#define DM_ULOG_REQUEST_VERSION 1 + struct dm_ulog_request { /* * The local unique identifier (luid) and the universally unique @@ -383,8 +393,9 @@ struct dm_ulog_request { */ uint64_t luid; char uuid[DM_UUID_LEN]; - char padding[7]; /* Padding because DM_UUID_LEN = 129 */ + char padding[3]; /* Padding because DM_UUID_LEN = 129 */ + uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ int32_t error; /* Used to report back processing errors */ uint32_t seq; /* Sequence number for request */ -- cgit v1.2.3-71-gd317 From 9c4376de98719d2768dd919553843de34bb094a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:58 +0000 Subject: dm: use non reentrant workqueues if equivalent kmirrord_wq, kcopyd_work and md->wq are created per dm instance and serve only a single work item from the dm instance, so non-reentrant workqueues would provide the same ordering guarantees as ordered ones while allowing CPU affinity and use of the workqueues for other purposes. Switch them to non-reentrant workqueues. 
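The mechanical change is easiest to see out of context. A minimal sketch of the conversion pattern applied in the hunks below, with the workqueue name and init function as placeholders:

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
        /* Before: example_wq = alloc_ordered_workqueue("example", WQ_MEM_RECLAIM); */
        example_wq = alloc_workqueue("example",
                                     WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
        if (!example_wq)
                return -ENOMEM;
        return 0;
}

The final 0 asks for the default max_active limit; the non-reentrant flag keeps a given work item from running on two CPUs at once, which is all these single-work-item queues relied on.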
Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 3 ++- drivers/md/dm-raid1.c | 5 +++-- drivers/md/dm.c | 3 ++- include/linux/dm-ioctl.h | 4 ++-- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 63d67169c7f4..924f5f0084c2 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -672,7 +672,8 @@ int dm_kcopyd_client_create(unsigned int nr_pages, goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = alloc_ordered_workqueue("kcopyd", WQ_MEM_RECLAIM); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 39917430f9f8..dee326775c60 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1085,7 +1085,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_requests = 1; ti->num_discard_requests = 1; - ms->kmirrord_wq = alloc_ordered_workqueue("kmirrord", WQ_MEM_RECLAIM); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1414,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 12, 0}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 39aaa9290128..e504bb40d60e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1883,7 +1883,8 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = alloc_ordered_workqueue("kdmflush", WQ_MEM_RECLAIM); + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index e453e1174ae1..78bbf47bbb96 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -268,8 +268,8 @@ enum { #define DM_VERSION_MAJOR 4 #define DM_VERSION_MINOR 19 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3-71-gd317 From 9d357b0787bb3c91835d5e658c3bda178f9ca419 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:01 +0000 Subject: dm: introduce target callbacks and congestion callback DM currently implements congestion checking by checking on congestion in each component device. For raid456 we need to also check if the stripe cache is congested. Add per-target congestion checker callback support. Extending the target_callbacks structure with additional callback functions allows for establishing multiple callbacks per-target (a callback is also needed for unplug). 
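A hedged sketch of how a target constructor could hook into this: struct dm_target_callbacks and dm_table_add_target_callbacks() are taken from the hunks below, while the my_* names, the private context layout, and the assumption that the target reaches its table via ti->table are illustrative.

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>

struct my_target_ctx {
        struct dm_target_callbacks callbacks;
        /* ... target-private state, e.g. an internal cache ... */
};

static int my_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
        struct my_target_ctx *ctx =
                container_of(cb, struct my_target_ctx, callbacks);

        /* A real target would test ctx's own resources here (e.g. a
         * stripe cache); returning 0 means "not congested". */
        (void)ctx;
        return 0;
}

static int my_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct my_target_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;

        ctx->callbacks.congested_fn = my_congested;
        dm_table_add_target_callbacks(ti->table, &ctx->callbacks);

        ti->private = ctx;
        return 0;
}

dm_table_any_congested() then ORs the callback's answer into the result it already computes from the table's component devices, which is exactly what a stripe-cache-backed target such as raid456 needs.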
Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 14 ++++++++++++++ include/linux/device-mapper.h | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 985c20a4f30e..7e2ec3c05550 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -71,6 +71,8 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); t->discards_supported = 1; @@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { @@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2970022faa63..4b1c63d478ab 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -193,6 +193,12 @@ struct dm_target { char *error; }; +/* Each target can link one of these into the table */ +struct dm_target_callbacks { + struct list_head list; + int (*congested_fn) (struct dm_target_callbacks *, int); +}; + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); @@ -268,6 +274,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params); +/* + * Target_ctr should call this if it needs to add any callbacks. + */ +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb); + /* * Finally call this to make the table ready for use. */ -- cgit v1.2.3-71-gd317 From 99d03c141b40914b67d63c9d23b8da4386422ed7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:02 +0000 Subject: dm: per target unplug callback support Add per-target unplug callback support. 
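Continuing the illustrative sketch from the previous commit: a target that plugs I/O internally could additionally supply the new hook. Only the unplug_fn field comes from the hunk below; my_unplug() and the surrounding context are assumptions.

static void my_unplug(struct dm_target_callbacks *cb)
{
        /* Kick any I/O the target has plugged internally; resolve the
         * enclosing context with container_of() as in my_congested(). */
}

/* In my_ctr(), alongside congested_fn: ctx->callbacks.unplug_fn = my_unplug; */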
Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 5 +++++ include/linux/device-mapper.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7e2ec3c05550..dffa0ac7c4f0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1278,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); @@ -1290,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t) dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->unplug_fn) + cb->unplug_fn(cb); } struct mapped_device *dm_table_get_md(struct dm_table *t) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4b1c63d478ab..272496d1fae4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -197,6 +197,7 @@ struct dm_target { struct dm_target_callbacks { struct list_head list; int (*congested_fn) (struct dm_target_callbacks *, int); + void (*unplug_fn)(struct dm_target_callbacks *); }; int dm_register_target(struct target_type *t); -- cgit v1.2.3-71-gd317 From d8a3515e2a9523f8ed56d1f4537d16338bda2bc2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Jan 2011 17:26:46 -0800 Subject: Revert "gpiolib: annotate gpio-intialization with __must_check" This reverts commit 0fdae42d361bbb431ca0ab0efed5126a94821177, which wasn't really supposed to go in, and causes lots of annoying warnings. Quoth Andrew: "Complete brainfart - I meant to drop that patch ages ago." Quoth Greg: "Ick, yeah, that patch isn't ok to go in as-is, all of the callers need to be fixed up first, which is what I thought we had agreed on..." Reported-by: Stephen Rothwell Acked-by: Andrew Morton Acked-by: Greg KH Signed-off-by: Linus Torvalds --- Documentation/gpio.txt | 2 +- include/asm-generic/gpio.h | 10 +++++----- include/linux/gpio.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index a492d92bb098..792faa3c06cf 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction: int gpio_direction_input(unsigned gpio); int gpio_direction_output(unsigned gpio, int value); -The return value is zero for success, else a negative errno. It must +The return value is zero for success, else a negative errno. It should be checked, since the get/set calls don't have error returns and since misconfiguration is possible. You should normally issue these calls from a task context. However, for spinlock-safe GPIOs it's OK to use them diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 6098cae2af8e..ff5c66080c8c 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data, /* Always use the library code for GPIO management calls, * or when sleeping may be involved. 
*/ -extern int __must_check gpio_request(unsigned gpio, const char *label); +extern int gpio_request(unsigned gpio, const char *label); extern void gpio_free(unsigned gpio); -extern int __must_check gpio_direction_input(unsigned gpio); -extern int __must_check gpio_direction_output(unsigned gpio, int value); +extern int gpio_direction_input(unsigned gpio); +extern int gpio_direction_output(unsigned gpio, int value); extern int gpio_set_debounce(unsigned gpio, unsigned debounce); @@ -192,8 +192,8 @@ struct gpio { const char *label; }; -extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -extern int __must_check gpio_request_array(struct gpio *array, size_t num); +extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); +extern int gpio_request_array(struct gpio *array, size_t num); extern void gpio_free_array(struct gpio *array, size_t num); #ifdef CONFIG_GPIO_SYSFS diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f79d67f413e4..4b47ed96f131 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number) return 0; } -static inline int __must_check gpio_request(unsigned gpio, const char *label) +static inline int gpio_request(unsigned gpio, const char *label) { return -ENOSYS; } @@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num) WARN_ON(1); } -static inline int __must_check gpio_direction_input(unsigned gpio) +static inline int gpio_direction_input(unsigned gpio) { return -ENOSYS; } -static inline int __must_check gpio_direction_output(unsigned gpio, int value) +static inline int gpio_direction_output(unsigned gpio, int value) { return -ENOSYS; } -- cgit v1.2.3-71-gd317 From 6c9ae009b298753a3baf71298d676a68b5a10c8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Jan 2011 15:45:38 -0800 Subject: irq: use per_cpu kstat_irqs Use modern per_cpu API to increment {soft|hard}irq counters, and use per_cpu allocation for (struct irq_desc)->kstats_irq instead of an array. This gives better SMP/NUMA locality and saves few instructions per irq. With small nr_cpuids values (8 for example), kstats_irq was a small array (less than L1_CACHE_BYTES), potentially source of false sharing. In the !CONFIG_SPARSE_IRQ case, remove the huge, NUMA/cache unfriendly kstat_irqs_all[NR_IRQS][NR_CPUS] array. Note: we still populate kstats_irq for all possible irqs in early_irq_init(). We probably could use on-demand allocations. (Code included in alloc_descs()). Problem is not all IRQS are used with a prior alloc_descs() call. kstat_irqs_this_cpu() is not used anymore, remove it. 
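A minimal sketch of the before/after access pattern, detached from the real irq code; "counter" stands in for desc->kstat_irqs and the example_* functions are only illustrative.

#include <linux/errno.h>
#include <linux/percpu.h>

static unsigned int __percpu *counter;

static int example_alloc(void)
{
        counter = alloc_percpu(unsigned int);   /* replaces a [NR_CPUS] array */
        return counter ? 0 : -ENOMEM;
}

static void example_hit(void)
{
        __this_cpu_inc(*counter);       /* was: array[smp_processor_id()]++ */
}

static unsigned int example_sum(void)
{
        unsigned int sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                sum += *per_cpu_ptr(counter, cpu);
        return sum;
}

The increment mirrors the __this_cpu_inc() calls the patch adds to kstat_incr_irqs_this_cpu(), and the summation loop mirrors the new kstat_irqs().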
Signed-off-by: Eric Dumazet Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Andi Kleen Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irqdesc.h | 2 +- include/linux/kernel_stat.h | 19 +++++++++---------- kernel/irq/irqdesc.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 979c68cc7458..6a64c6fa81af 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -57,7 +57,7 @@ struct irq_desc { #endif struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44e83ba12b5b..0cce2db580c3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); #ifndef CONFIG_GENERIC_HARDIRQS -#define kstat_irqs_this_cpu(irq) \ - (this_cpu_read(kstat.irqs[irq]) struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { - kstat_this_cpu.irqs[irq]++; - kstat_this_cpu.irqs_sum++; + __this_cpu_inc(kstat.irqs[irq]); + __this_cpu_inc(kstat.irqs_sum); } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) #else #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ - ((DESC)->kstat_irqs[smp_processor_id()]++);\ - kstat_this_cpu.irqs_sum++; } while (0) + +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +do { \ + __this_cpu_inc(*(DESC)->kstat_irqs); \ + __this_cpu_inc(kstat.irqs_sum); \ +} while (0) #endif static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { - kstat_this_cpu.softirqs[irq]++; + __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f5..282f20230e67 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: 
kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... */ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3-71-gd317 From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 13 Jan 2011 15:45:40 -0800 Subject: sched: remove long deprecated CLONE_STOPPED flag This warning was added in commit bdff746a3915 ("clone: prepare to recycle CLONE_STOPPED") three years ago. 2.6.26 came and went. As far as I know, no-one is actually using CLONE_STOPPED. Signed-off-by: Dave Jones Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- kernel/fork.c | 28 +--------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 96e23215e276..07402530fc70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,7 +21,8 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? 
*/ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f20b6b0..76a1fdd80bdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,23 +1409,6 @@ long do_fork(unsigned long clone_flags, return -EPERM; } - /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - /* * When called from kernel_thread, don't do user tracing stuff. */ @@ -1464,16 +1447,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); -- cgit v1.2.3-71-gd317 From 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:41 -0800 Subject: mm: page allocator: adjust the per-cpu counter threshold when memory is low Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when the per-cpu delta rises above a threshold. On large CPU systems, the difference between the estimate and real value of NR_FREE_PAGES can be very high. The system can get into a case where pages are allocated far below the min watermark, potentially causing livelock issues. The commit solved the problem by taking a better reading of NR_FREE_PAGES when memory was low. Unfortunately, as reported by Shaohua Li, this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing. This patch takes a different approach. For large machines where counter drift might be unsafe and while kswapd is awake, the per-cpu thresholds for the target pgdat are reduced to limit the level of drift to what should be a safe level. This incurs a performance penalty in heavy memory pressure by a factor that depends on the workload and the machine, but the machine should function correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps, but the event is not expected to be frequent - in Shaohua's test case, at least, there was one recorded sleep and wake event. To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES when called from wakeup_kswapd, when deciding whether it is really safe to go back to sleep in sleeping_prematurely() and when deciding if a zone is really balanced or not in balance_pgdat(). We are still using an expensive function but limiting how often it is called. When the test case is reproduced, the time spent in the watermark functions is reduced.
The following report is on the percentage of time spent cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(), zone_page_state(). vanilla 11.6615% disable-threshold 0.2584% David said: : We had to pull aa454840 "mm: page allocator: calculate a better estimate : of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36 : internally because tests showed that it would cause the machine to stall : as the result of heavy kswapd activity. I merged it back with this fix as : it is pending in the -mm tree and it solves the issue we were seeing, so I : definitely think this should be pushed to -stable (and I would seriously : consider it for 2.6.37 inclusion even at this late date). Signed-off-by: Mel Gorman Reported-by: Shaohua Li Reviewed-by: Christoph Lameter Tested-by: Nicolas Bareil Cc: David Rientjes Cc: Kyle McMartin Cc: [2.6.37.1, 2.6.36.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 +++----- include/linux/vmstat.h | 5 ++++ mm/mmzone.c | 21 ---------------- mm/page_alloc.c | 35 ++++++++++++++++++++------ mm/vmscan.c | 23 +++++++++-------- mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 115 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39c24ebe9cfd..48906629335c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -#ifdef CONFIG_SMP -unsigned long zone_nr_free_pages(struct zone *zone); -#else -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) -#endif /* CONFIG_SMP */ - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. 
A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -661,7 +655,9 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37b3b75..e4cc21cf5870 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); +void restore_pgdat_percpu_threshold(pg_data_t *pgdat); #else /* CONFIG_SMP */ /* @@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } +static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 826ba6922e84..22a1bb7723e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. 
*/ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -2442,7 +2461,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c69274..5da4295e7d67 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (zone->all_unreclaimable) continue; - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) return 1; } @@ -2230,7 +2230,7 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; break; @@ -2276,7 +2276,7 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2297,7 +2297,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2305,7 +2305,7 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! 
*/ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; } else { @@ -2448,7 +2448,9 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + restore_pgdat_percpu_threshold(pgdat); schedule(); + reduce_pgdat_percpu_threshold(pgdat); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); @@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order) if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - return; if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 312d728976f1..bc0f095791b4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP +static int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + static int calculate_threshold(struct zone *zone) { int threshold; @@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void) } } +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_pressure_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + +void restore_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + /* * For use when we know that interrupts are disabled. 
*/ @@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), -- cgit v1.2.3-71-gd317 From b44129b30652c8771db2265939bb8b463724043d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:43 -0800 Subject: mm: vmstat: use a single setter function and callback for adjusting percpu thresholds reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid errors due to counter drift. The functions duplicate some code so this patch replaces them with a single set_pgdat_percpu_threshold() that takes a callback function to calculate the desired threshold as a parameter. [akpm@linux-foundation.org: readability tweak] [kosaki.motohiro@jp.fujitsu.com: set_pgdat_percpu_threshold(): don't use for_each_online_cpu] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 ++++++---- mm/vmscan.c | 19 +++++++++++++++++-- mm/vmstat.c | 36 +++++++----------------------------- 3 files changed, 30 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4cc21cf5870..833e676d6d92 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); -void restore_pgdat_percpu_threshold(pg_data_t *pgdat); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state -static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } -static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } +#define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 5da4295e7d67..86f8c3418795 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2448,9 +2448,24 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - restore_pgdat_percpu_threshold(pgdat); + + /* + * vmstat counters are not perfectly + * accurate and the estimated value + * for counters such as NR_FREE_PAGES + * can deviate from the true value by + * nr_online_cpus * threshold. To + * avoid the zone watermarks being + * breached while under pressure, we + * reduce the per-cpu vmstat threshold + * while kswapd is awake and restore + * them before going back to sleep. 
+ */ + set_pgdat_percpu_threshold(pgdat, + calculate_normal_threshold); schedule(); - reduce_pgdat_percpu_threshold(pgdat); + set_pgdat_percpu_threshold(pgdat, + calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); diff --git a/mm/vmstat.c b/mm/vmstat.c index bc0f095791b4..751a65e00aac 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_pressure_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) { int threshold; int watermark_distance; @@ -107,7 +107,7 @@ static int calculate_pressure_threshold(struct zone *zone) return threshold; } -static int calculate_threshold(struct zone *zone) +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -166,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -185,46 +185,24 @@ static void refresh_zone_stat_thresholds(void) } } -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) { struct zone *zone; int cpu; int threshold; int i; - get_online_cpus(); - for (i = 0; i < pgdat->nr_zones; i++) { - zone = &pgdat->node_zones[i]; - if (!zone->percpu_drift_mark) - continue; - - threshold = calculate_pressure_threshold(zone); - for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->stat_threshold - = threshold; - } - put_online_cpus(); -} - -void restore_pgdat_percpu_threshold(pg_data_t *pgdat) -{ - struct zone *zone; - int cpu; - int threshold; - int i; - - get_online_cpus(); for (i = 0; i < pgdat->nr_zones; i++) { zone = &pgdat->node_zones[i]; if (!zone->percpu_drift_mark) continue; - threshold = calculate_threshold(zone); - for_each_online_cpu(cpu) + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } - put_online_cpus(); } /* -- cgit v1.2.3-71-gd317 From 3e7d344970673c5334cf7b5bb27c8c0942b06126 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:56 -0800 Subject: mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim Lumpy reclaim is disruptive. It reclaims a large number of pages and ignores the age of the pages it reclaims. This can incur significant stalls and potentially increase the number of major faults. Compaction has reached the point where it is considered reasonably stable (meaning it has passed a lot of testing) and is a potential candidate for displacing lumpy reclaim. This patch introduces an alternative to lumpy reclaim, called reclaim/compaction, which is used when compaction is available. The basic operation is very simple - instead of selecting a contiguous range of pages to reclaim, a number of order-0 pages are reclaimed and compaction is then run later by either kswapd (compact_zone_order()) or direct compaction (__alloc_pages_direct_compact()).
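For illustration only (not from the patch): a rough sketch of the reclaim-then-compact loop described above. Every demo_* name and struct demo_zone is a hypothetical placeholder, not a kernel symbol; the real flow lives in shrink_zone()/should_continue_reclaim() and compact_zone_order().

#include <linux/types.h>

struct demo_zone;					/* opaque stand-in for struct zone */
bool demo_high_order_page_available(struct demo_zone *zone, int order);
unsigned long demo_reclaim_order0_pages(struct demo_zone *zone, unsigned long nr);
void demo_compact_zone(struct demo_zone *zone, int order);

static bool demo_make_high_order_block(struct demo_zone *zone, int order)
{
	/* roughly 2^(order+1) base pages gives compaction room to work with */
	unsigned long batch = 2UL << order;

	while (!demo_high_order_page_available(zone, order)) {
		/* reclaim a small batch of order-0 pages, not a contiguous range */
		if (demo_reclaim_order0_pages(zone, batch) == 0)
			return false;			/* nothing left to reclaim */
		demo_compact_zone(zone, order);		/* defragment what was freed */
	}
	return true;
}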
[akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: use conventional task_struct naming] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 14 +++++++ include/linux/kernel.h | 7 ++++ mm/compaction.c | 89 ++++++++++++++++++++++++--------------- mm/migrate.c | 17 ++++++++ mm/page_alloc.c | 16 +++++++ mm/vmscan.c | 102 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 196 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac51552d908..2592883d862d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask) +{ + return 0; +} + static inline void defer_compaction(struct zone *zone) { } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac7022b63..5a9d9059520b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/mm/compaction.c b/mm/compaction.c index 20011a850fef..8fe917ec7c11 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. 
+ * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed dependingon watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, +unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask) { struct compact_control cc = { @@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. 
- */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - status = compact_zone_order(zone, order, gfp_mask); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66a7045..94875b265928 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22a1bb7723e4..03a66a31bfcd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int migratetype, unsigned long *did_some_progress) { struct page *page; + struct task_struct *tsk = current; if (!order || compaction_deferred(preferred_zone)) return NULL; + tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask); + tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -2121,6 +2124,19 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; } nopage: diff --git a/mm/vmscan.c b/mm/vmscan.c index 3464312bde07..10ebd74a423c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -59,12 +60,15 @@ * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference * page from the LRU and reclaim all pages within a * naturally aligned range + * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone */ typedef unsigned __bitwise__ lumpy_mode; #define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) #define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) #define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) #define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) +#define LUMPY_MODE_COMPACTION ((__force lumpy_mode)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were 
scanned */ @@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) - return; + if (COMPACTION_BUILD) + sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ - sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; if (sc->order > PAGE_ALLOC_COSTLY_ORDER) sc->lumpy_reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) @@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1814,6 +1820,57 @@ out: } } +/* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION)) + return false; + + /* + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation. 
If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + unsigned long nr_scanned = sc->nr_scanned; +restart: + nr_reclaimed = 0; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -2307,6 +2371,14 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + /* + * Compact the zone for higher orders to reduce + * latencies for higher-order allocations that + * would ordinarily call try_to_compact_pages() + */ + if (sc.order > PAGE_ALLOC_COSTLY_ORDER) + compact_zone_order(zone, sc.order, sc.gfp_mask); + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit v1.2.3-71-gd317 From 77f1fe6b08b13a87391549c8a820ddc817b6f50e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:57 -0800 Subject: mm: migration: allow migration to operate asynchronously and avoid synchronous compaction in the faster path Migration synchronously waits for writeback if the initial passes fail. Callers of memory compaction do not necessarily want this behaviour if the caller is latency sensitive or expects that synchronous migration is not going to have a significantly better success rate. This patch adds a sync parameter to migrate_pages() allowing the caller to indicate if wait_on_page_writeback() is allowed within migration or not. For reclaim/compaction, try_to_compact_pages() is first called asynchronously, direct reclaim runs and then try_to_compact_pages() is called synchronously as there is a greater expectation that it'll succeed.
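For illustration only (not from the patch): the async-first, sync-after-reclaim calling pattern described above. The demo_* helpers and struct demo_zone/demo_page are hypothetical stand-ins for the allocator slow path; the sync flag mirrors the bool passed down to migrate_pages().

#include <linux/types.h>
#include <stddef.h>

struct demo_zone;
struct demo_page;
struct demo_page *demo_compact_and_alloc(struct demo_zone *zone, int order, bool sync);
void demo_direct_reclaim(struct demo_zone *zone, int order);

static struct demo_page *demo_alloc_slowpath(struct demo_zone *zone, int order)
{
	struct demo_page *page;

	/* first pass: asynchronous, so migration never waits on writeback */
	page = demo_compact_and_alloc(zone, order, false);
	if (page)
		return page;

	demo_direct_reclaim(zone, order);

	/* after reclaim: synchronous, where success is more likely to pay off */
	return demo_compact_and_alloc(zone, order, true);
}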
[akpm@linux-foundation.org: build/merge fix] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 ++++++---- include/linux/migrate.h | 12 ++++++++---- mm/compaction.c | 14 ++++++++++---- mm/memory-failure.c | 8 +++++--- mm/memory_hotplug.c | 3 ++- mm/mempolicy.c | 4 ++-- mm/migrate.c | 22 +++++++++++++--------- mm/page_alloc.c | 21 +++++++++++++++------ mm/vmscan.c | 3 ++- 9 files changed, 63 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 2592883d862d..72cba4034785 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,10 +21,11 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask); + int order, gfp_t gfp_mask, nodemask_t *mask, + bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, bool sync); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -57,7 +58,8 @@ static inline bool compaction_deferred(struct zone *zone) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { return COMPACT_CONTINUE; } @@ -68,7 +70,7 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, bool sync) { return 0; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 085527fb8261..fa31902803fd 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 8fe917ec7c11..47fca1069343 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -33,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* 
Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -455,7 +456,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + (unsigned long)cc, 0, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -482,7 +484,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) + int order, gfp_t gfp_mask, + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -490,6 +493,7 @@ unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -505,11 +509,13 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -533,7 +539,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c044b0e..2323a8039a98 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1290,9 +1290,10 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. 
*/ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); if (ret) { - putback_lru_pages(&pagelist); + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1413,7 +1414,8 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af5473..584fc5588fdd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -733,7 +733,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + 1, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260fb282..9db274593086 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0); + err = migrate_pages(&pagelist, new_node_page, dest, 0, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1155,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, 0, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 94875b265928..dc47f6c40353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -682,7 +682,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force) + if (!force || !sync) goto uncharge; wait_on_page_writeback(page); } @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining) + int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -841,7 +841,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force) + if (!force || !sync) goto out; lock_page(hpage); } @@ -909,7 +909,8 @@ out: * Return: Number of pages not migrated or error code. 
*/ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -929,7 +930,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -958,7 +960,8 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -974,7 +977,8 @@ int migrate_huge_pages(struct list_head *from, cond_resched(); rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining); + private, page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -1107,7 +1111,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a66a31bfcd..0fd486467b4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1812,7 +1812,8 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; struct task_struct *tsk = current; @@ -1822,7 +1823,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -1859,7 +1860,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -2001,6 +2003,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2063,14 +2066,19 @@ rebalance: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. 
Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2134,7 +2142,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 10ebd74a423c..8320d115c85d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2377,7 +2377,8 @@ loop_again: * would ordinarily call try_to_compact_pages() */ if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask); + compact_zone_order(zone, sc.order, sc.gfp_mask, + false); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { -- cgit v1.2.3-71-gd317 From 7f0f24967b0349798803260b2e4bf347cffa1990 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:58 -0800 Subject: mm: migration: cleanup migrate_pages API by matching types for offlining and sync With the introduction of the boolean sync parameter, the API looks a little inconsistent as offlining is still an int. Convert offlining to a bool for the sake of being tidy. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 8 ++++---- mm/compaction.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 6 ++++-- mm/migrate.c | 8 ++++---- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa31902803fd..e39aeecfe9a2 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,10 +13,10 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int fail_migrate_page(struct address_space *, @@ -35,10 +35,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 47fca1069343..e005a30e968c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -456,7 +456,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0, + (unsigned long)cc, false, cc->sync); update_nr_listpages(cc); nr_remaining = 
cc->nr_migratepages; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 584fc5588fdd..a2832c092509 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -734,7 +734,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - 1, true); + true, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9db274593086..7ee55af8d79c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0, true); + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1156,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0, true); + (unsigned long)vma, + false, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index dc47f6c40353..690d0de993af 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining, bool sync) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining, bool sync) + int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -909,7 +909,7 @@ out: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; @@ -960,7 +960,7 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; -- cgit v1.2.3-71-gd317 From e5a5623b28198aa91ea71ee5d3846757fc76bc87 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: remove unused get_vm_area_node get_vm_area_node() is unused in the kernel and can thus be removed. 
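For context only (not part of this patch): a caller that wants a node-local, page-backed allocation already has a simpler path through vmalloc_node(), which is presumably why get_vm_area_node() ended up with no users. A minimal sketch, with a hypothetical helper name:

#include <linux/vmalloc.h>

/* Sketch only: node-aware allocation without reserving a vm_struct by hand. */
static void *alloc_node_buffer(unsigned long size, int node)
{
	return vmalloc_node(size, node);	/* NULL on failure; release with vfree() */
}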
Signed-off-by: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- mm/vmalloc.c | 7 ------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 44b54f619ac6..cb73c755fac8 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -90,9 +90,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node, - gfp_t gfp_mask); extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 48245af07b10..78ec9d8bc57c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; -- cgit v1.2.3-71-gd317 From ec3f64fc9c196a304c4b7db3e1ff56d640628509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:01 -0800 Subject: mm: remove gfp mask from pcpu_get_vm_areas pcpu_get_vm_areas() only uses GFP_KERNEL allocations, so remove the gfp_t formal and use the mask internally. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 21 +++++++++------------ 3 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index cb73c755fac8..c7348b8d0a81 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -117,7 +117,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #ifdef CONFIG_SMP struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask); + size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); #endif diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3f..ea534960a04b 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78ec9d8bc57c..f67546636322 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2196,17 +2196,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. 
This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans areas from the end looking for @@ -2217,7 +2216,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2227,8 +2226,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2261,14 +2258,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } -- cgit v1.2.3-71-gd317 From d0a21265dfb5fa8ae54e90d0fb6d1c215b10a28a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:02 -0800 Subject: mm: unify module_alloc code for vmalloc Four architectures (arm, mips, sparc, x86) use __vmalloc_area() for module_init(). Much of the code is duplicated and can be generalized in a globally accessible function, __vmalloc_node_range(). __vmalloc_node() now calls into __vmalloc_node_range() with a range of [VMALLOC_START, VMALLOC_END) for functionally equivalent behavior. Each architecture may then use __vmalloc_node_range() directly to remove the duplication of code. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Russell King Cc: Ralf Baechle Cc: "David S. Miller" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 14 +++---------- arch/mips/kernel/module.c | 14 +++---------- arch/sparc/kernel/module.c | 14 ++++--------- arch/x86/kernel/module.c | 17 ++++------------ include/linux/vmalloc.h | 5 +++-- mm/vmalloc.c | 50 +++++++++++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 0c1bb68ff4a8..2cfe8161b478 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -38,17 +38,9 @@ #ifdef CONFIG_MMU void *module_alloc(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + __builtin_return_address(0)); } #else /* CONFIG_MMU */ void *module_alloc(unsigned long size) diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 6f51dda87fce..d87a72e9fac7 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { #ifdef MODULE_START - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); #else if (size == 0) return NULL; diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index ee3c7dde8d9f..8d348c474a2f 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -23,17 +23,11 @@ static void *module_map(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); } static char *dot2underscore(char *name) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f2956091735..ab23f1ad4bf1 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -37,20 +37,11 @@ void *module_alloc(unsigned long size) { - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } /* Free memory returned from module_alloc */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c7348b8d0a81..4ed6fcd6b726 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size); extern void 
*vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); +extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller); extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f67546636322..284346ee0e91 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1530,25 +1530,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. - */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1558,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1570,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1588,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. 
+ */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, -- cgit v1.2.3-71-gd317 From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to or its inherited value at fork. Assuming the thread that has forked it has oom_score_adj of 0, each process could decrease it back from 0 upon activation unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternative considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't wan't all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch updated from original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/base.c | 4 +++- include/linux/sched.h | 2 ++ kernel/fork.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ef757fca470b..23cae6548d3a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1323,6 +1323,10 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +The value of /proc//oom_score_adj may be reduced no lower than the last +value set by a CAP_SYS_RESOURCE process. To reduce the value any lower +requires CAP_SYS_RESOURCE. + NOTICE: /proc//oom_adj is deprecated and will be removed, please see Documentation/feature-removal-schedule.txt. diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f1cdd5d3d7..9d096e82b201 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. 
diff --git a/include/linux/sched.h b/include/linux/sched.h index 07402530fc70..f23b5bb6f52e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -634,6 +634,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. + * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/kernel/fork.c b/kernel/fork.c index 76a1fdd80bdf..1499607e4da2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); -- cgit v1.2.3-71-gd317 From 212260aa07135b327752dc02625c68cf4ce04caf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:46:06 -0800 Subject: mm: clear PageError bit in msync & fsync Temporary IO failures, eg. due to loss of both multipath paths, can permanently leave the PageError bit set on a page, resulting in msync or fsync returning -EIO over and over again, even if IO is now getting to the disk correctly. We already clear the AS_ENOSPC and AS_IO bits in mapping->flags in the filemap_fdatawait_range function. Also clearing the PageError bit on the page allows subsequent msync or fsync calls on this file to return without an error, if the subsequent IO succeeds. Unfortunately data written out in the msync or fsync call that returned -EIO can still get lost, because the page dirty bit appears to not get restored on IO error. However, the alternative could be potentially all of memory filling up with uncleanable dirty pages, hanging the system, so there is no nice choice here... Signed-off-by: Rik van Riel Acked-by: Valerie Aurora Acked-by: Jeff Layton Cc: Theodore Ts'o Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- mm/filemap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5f38c460367e..4805fdec8d6e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,7 +198,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) -PAGEFLAG(Error, error) +PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) diff --git a/mm/filemap.c b/mm/filemap.c index 1a3dd5914726..b4ad8e36c81a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); -- cgit v1.2.3-71-gd317 From 110d74a921f4d272b47ef6104fcf937df808f4c8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:11 -0800 Subject: mm: add FOLL_MLOCK follow_page flag. Move the code to mlock pages from __mlock_vma_pages_range() to follow_page(). 
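Roughly, the caller side then shrinks to a single gup call; the sketch below condenses the resulting __mlock_vma_pages_range() body from the diff further down (names taken from that diff, write-fault handling and error paths trimmed):

	/* Sketch: mlock an address range by letting follow_page() do the locking. */
	int gup_flags = FOLL_TOUCH | FOLL_MLOCK;	/* FOLL_WRITE handling elided */
	int ret;

	ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags,
			       NULL, NULL);		/* no pages[] batch needed any more */
	return max(ret, 0);				/* 0 or negative error code */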
This allows __mlock_vma_pages_range() to not have to break down work into 16-page batches. An additional motivation for doing this within the present patch series is that it'll make it easier for a later chagne to drop mmap_sem when blocking on disk (we'd like to be able to resume at the page that was read from disk instead of at the start of a 16-page batch). Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 22 ++++++++++++++++++ mm/mlock.c | 65 +++++------------------------------------------------- 3 files changed, 28 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 721f451c3029..9ade803bcc1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1415,6 +1415,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_MLOCK 0x40 /* mark page as mlocked */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index b8f97b8575b7..15e1f19a3b10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1310,6 +1310,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if (flags & FOLL_MLOCK) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: diff --git a/mm/mlock.c b/mm/mlock.c index 67b3dd8616dc..25cc9e88c540 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -159,10 +159,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; + int ret; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -170,7 +169,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -185,63 +184,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, nr_pages--; } - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. 
However, page may - * still be truncated out from under us. - */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. - */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } - - return ret; /* 0 or negative error code */ + ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL); + return max(ret, 0); /* 0 or negative error code */ } /* -- cgit v1.2.3-71-gd317 From 088e54658f559a13c2b988086512d76fe9e8f846 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:16 -0800 Subject: mm: remove likely() from mapping_unevictable() The mapping_unevictable() has a likely() around the mapping parameter. This mapping parameter comes from page_mapping() which has an unlikely() that the page will be set as PAGE_MAPPING_ANON, and if so, it will return NULL. One would think that this unlikely() means that the mapping returned by page_mapping() would not be NULL, but where page_mapping() is used just above mapping_unevictable(), that unlikely() is incorrect most of the time. This means that the "likely(mapping)" in mapping_unevictable() is incorrect most of the time. Running the annotated branch profiler on my main box which runs firefox, evolution, xchat and is part of my distcc farm, I had this: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 12872836 1269443893 98 mapping_unevictable pagemap.h 51 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The mapping_unevictable() is also a static inline but seems to be used only once in my setup. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. Perhaps this is enough to remove the unlikely from page_mapping() as well. 
Signed-off-by: Steven Rostedt Reviewed-by: KOSAKI Motohiro Acked-by: Nick Piggin Acked-by: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2d1ffe3cf1ee..9c66e994540f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) static inline int mapping_unevictable(struct address_space *mapping) { - if (likely(mapping)) + if (mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -- cgit v1.2.3-71-gd317 From e20e87795834f2f14cb53baf657b91d9c39f92c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:17 -0800 Subject: mm: remove unlikely() from page_mapping() page_mapping() has a unlikely that the mapping has PAGE_MAPPING_ANON set. But running the annotated branch profiler on a normal desktop system doing vairous tasks (xchat, evolution, firefox, distcc), it is not really that unlikely that the mapping here will have the PAGE_MAPPING_ANON flag set: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. With this much of an error, it's best to simply remove the unlikely and have the compiler and branch prediction figure this out. Signed-off-by: Steven Rostedt Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9ade803bcc1b..57e11b2a18ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -657,7 +657,7 @@ static inline struct address_space *page_mapping(struct page *page) VM_BUG_ON(PageSlab(page)); if (unlikely(PageSwapCache(page))) mapping = &swapper_space; - else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) + else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } -- cgit v1.2.3-71-gd317 From 9950474883e027e6e728cbcff25f7f2bf0c96530 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:20 -0800 Subject: mm: kswapd: stop high-order balancing when any suitable zone is balanced Simon Kirby reported the following problem We're seeing cases on a number of servers where cache never fully grows to use all available memory. Sometimes we see servers with 4 GB of memory that never seem to have less than 1.5 GB free, even with a constantly-active VM. In some cases, these servers also swap out while this happens, even though they are constantly reading the working set into memory. 
We have been seeing this happening for a long time; I don't think it's anything recent, and it still happens on 2.6.36. After some debugging work by Simon, Dave Hansen and others, the prevaling theory became that kswapd is reclaiming order-3 pages requested by SLUB too aggressive about it. There are two apparent problems here. On the target machine, there is a small Normal zone in comparison to DMA32. As kswapd tries to balance all zones, it would continually try reclaiming for Normal even though DMA32 was balanced enough for callers. The second problem is that sleeping_prematurely() does not use the same logic as balance_pgdat() when deciding whether to sleep or not. This keeps kswapd artifically awake. A number of tests were run and the figures from previous postings will look very different for a few reasons. One, the old figures were forcing my network card to use GFP_ATOMIC in attempt to replicate Simon's problem. Second, I previous specified slub_min_order=3 again in an attempt to reproduce Simon's problem. In this posting, I'm depending on Simon to say whether his problem is fixed or not and these figures are to show the impact to the ordinary cases. Finally, the "vmscan" figures are taken from /proc/vmstat instead of the tracepoints. There is less information but recording is less disruptive. The first test of relevance was postmark with a process running in the background reading a large amount of anonymous memory in blocks. The objective was to vaguely simulate what was happening on Simon's machine and it's memory intensive enough to have kswapd awake. POSTMARK traceonly kanyzone Transactions per second: 156.00 ( 0.00%) 153.00 (-1.96%) Data megabytes read per second: 21.51 ( 0.00%) 21.52 ( 0.05%) Data megabytes written per second: 29.28 ( 0.00%) 29.11 (-0.58%) Files created alone per second: 250.00 ( 0.00%) 416.00 (39.90%) Files create/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) Files deleted alone per second: 520.00 ( 0.00%) 420.00 (-23.81%) Files delete/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 16.58 17.4 Total Elapsed Time (seconds) 218.48 222.47 VMstat Reclaim Statistics: vmscan Direct reclaims 0 4 Direct reclaim pages scanned 0 203 Direct reclaim pages reclaimed 0 184 Kswapd pages scanned 326631 322018 Kswapd pages reclaimed 312632 309784 Kswapd low wmark quickly 1 4 Kswapd high wmark quickly 122 475 Kswapd skip congestion_wait 1 0 Pages activated 700040 705317 Pages deactivated 212113 203922 Pages written 9875 6363 Total pages scanned 326631 322221 Total pages reclaimed 312632 309968 %age total pages scanned/reclaimed 95.71% 96.20% %age total pages scanned/written 3.02% 1.97% proc vmstat: Faults Major Faults 300 254 Minor Faults 645183 660284 Page ins 493588 486704 Page outs 4960088 4986704 Swap ins 1230 661 Swap outs 9869 6355 Performance is mildly affected because kswapd is no longer doing as much work and the background memory consumer process is getting in the way. Note that kswapd scanned and reclaimed fewer pages as it's less aggressive and overall fewer pages were scanned and reclaimed. Swap in/out is particularly reduced again reflecting kswapd throwing out fewer pages. The slight performance impact is unfortunate here but it looks like a direct result of kswapd being less aggressive. As the bug report is about too many pages being freed by kswapd, it may have to be accepted for now. 
The second test is a streaming IO benchmark that was previously used by Johannes to show regressions in page reclaim. MICRO traceonly kanyzone User/Sys Time Running Test (seconds) 29.29 28.87 Total Elapsed Time (seconds) 492.18 488.79 VMstat Reclaim Statistics: vmscan Direct reclaims 2128 1460 Direct reclaim pages scanned 2284822 1496067 Direct reclaim pages reclaimed 148919 110937 Kswapd pages scanned 15450014 16202876 Kswapd pages reclaimed 8503697 8537897 Kswapd low wmark quickly 3100 3397 Kswapd high wmark quickly 1860 7243 Kswapd skip congestion_wait 708 801 Pages activated 9635 9573 Pages deactivated 1432 1271 Pages written 223 1130 Total pages scanned 17734836 17698943 Total pages reclaimed 8652616 8648834 %age total pages scanned/reclaimed 48.79% 48.87% %age total pages scanned/written 0.00% 0.01% proc vmstat: Faults Major Faults 165 221 Minor Faults 9655785 9656506 Page ins 3880 7228 Page outs 37692940 37480076 Swap ins 0 69 Swap outs 19 15 Again fewer pages are scanned and reclaimed as expected and this time the test completed faster. Note that kswapd is hitting its watermarks faster (low and high wmark quickly) which I expect is due to kswapd reclaiming fewer pages. I also ran fs-mark, iozone and sysbench but there is nothing interesting to report in the figures. Performance is not significantly changed and the reclaim statistics look reasonable. Tgis patch: When the allocator enters its slow path, kswapd is woken up to balance the node. It continues working until all zones within the node are balanced. For order-0 allocations, this makes perfect sense but for higher orders it can have unintended side-effects. If the zone sizes are imbalanced, kswapd may reclaim heavily within a smaller zone discarding an excessive number of pages. The user-visible behaviour is that kswapd is awake and reclaiming even though plenty of pages are free from a suitable zone. This patch alters the "balance" logic for high-order reclaim allowing kswapd to stop if any suitable zone becomes balanced to reduce the number of pages it reclaims from other zones. kswapd still tries to ensure that order-0 watermarks for all zones are met before sleeping. 
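Condensed, the new stop condition in balance_pgdat() (see the diff below) behaves like the following sketch; all_zones_ok and any_zone_ok are the per-pass flags from the diff, and the helper name is invented purely for illustration:

/* Sketch: when may kswapd stop reclaiming for this priority pass? */
static bool kswapd_pass_done(int order, bool all_zones_ok, bool any_zone_ok)
{
	if (!order)
		return all_zones_ok;		/* order-0: every zone must hit its high watermark */
	return all_zones_ok || any_zone_ok;	/* high-order: one zone <= classzone_idx is enough */
}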
Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 8 +++--- mm/vmscan.c | 68 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 66 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 48906629335c..dad3612a7e39 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -639,6 +639,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + enum zone_type classzone_idx; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -654,7 +655,7 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0fd486467b4b..c31460918506 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1944,13 +1944,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int @@ -2028,7 +2029,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background diff --git a/mm/vmscan.c b/mm/vmscan.c index 7037cc8c60b6..3584067800e1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2246,11 +2246,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int classzone_idx) { int all_zones_ok; + int any_zone_ok; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2273,7 +2276,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 
0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2282,6 +2284,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + any_zone_ok = 0; /* * Scan in the highmem->dma direction for the highest @@ -2400,10 +2403,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= classzone_idx) + any_zone_ok = 1; } } - if (all_zones_ok) + if (all_zones_ok || (order && any_zone_ok)) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2426,7 +2431,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Any zone below pgdats classzone_idx must meet the high + * watermark for a balanced node + */ + if (!(all_zones_ok || (order && any_zone_ok))) { cond_resched(); try_to_freeze(); @@ -2451,6 +2462,36 @@ out: goto loop_again; } + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + return sc.nr_reclaimed; } @@ -2514,6 +2555,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2544,21 +2586,27 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; } ret = try_to_freeze(); @@ -2571,7 +2619,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + balance_pgdat(pgdat, order, classzone_idx); } } return 0; @@ -2580,7 +2628,7 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. 
*/ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; @@ -2590,8 +2638,10 @@ void wakeup_kswapd(struct zone *zone, int order) if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) + if (pgdat->kswapd_max_order < order) { pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) -- cgit v1.2.3-71-gd317 From e9da73d67729b58bba256123e2b4651e0d8a01ac Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: compound_lock Add a new compound_lock() needed to serialize put_page against __split_huge_page_refcount(). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/page-flags.h | 12 +++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57e11b2a18ba..3b1754ad8785 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -14,6 +14,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -305,6 +306,39 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +static inline void compound_lock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_lock(PG_compound_lock, &page->flags); +#endif +} + +static inline void compound_unlock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_unlock(PG_compound_lock, &page->flags); +#endif +} + +static inline unsigned long compound_lock_irqsave(struct page *page) +{ + unsigned long uninitialized_var(flags); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + local_irq_save(flags); + compound_lock(page); +#endif + return flags; +} + +static inline void compound_unlock_irqrestore(struct page *page, + unsigned long flags) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + compound_unlock(page); + local_irq_restore(flags); +#endif +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4805fdec8d6e..5a743cc87238 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -107,6 +107,9 @@ enum pageflags { #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + PG_compound_lock, #endif __NR_PAGEFLAGS, @@ -397,6 +400,12 @@ static inline void __ClearPageTail(struct page *page) #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) +#else +#define __PG_COMPOUND_LOCK 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -406,7 +415,8 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ + __PG_COMPOUND_LOCK) /* * Flags checked when a page is prepped for return by the page allocator. 
-- cgit v1.2.3-71-gd317 From 9180706344487700b40da9eca5dedd3d11cb33b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: alter compound get_page/put_page Alter compound get_page/put_page to keep references on subpages too, in order to allow __split_huge_page_refcount to split an hugepage even while subpages have been pinned by one of the get_user_pages() variants. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 12 +++++++ arch/x86/mm/gup.c | 12 +++++++ include/linux/mm.h | 24 ++++++++++++-- mm/swap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 129 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index d7efdbf640c7..fec13200868f 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,6 +16,16 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } + if (PageTail(page)) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e6593799d..06f56fcf9a77 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -105,6 +105,16 @@ static inline void get_head_page_multiple(struct page *page, int nr) atomic_add(nr, &page->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -128,6 +138,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b1754ad8785..a2718e1ed585 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -353,9 +353,29 @@ static inline int page_count(struct page *page) static inline void get_page(struct page *page) { - page = compound_head(page); - VM_BUG_ON(atomic_read(&page->_count) == 0); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. Only if + * we're getting a tail page, the elevated page->_count is + * required only in the head page, so for tail pages the + * bugcheck only verifies that the page->_count isn't + * negative. + */ + VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); atomic_inc(&page->_count); + /* + * Getting a tail page will elevate both the head and tail + * page->_count(s). + */ + if (unlikely(PageTail(page))) { + /* + * This is safe only because + * __split_huge_page_refcount can't run under + * get_page(). 
+ */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + atomic_inc(&page->first_page->_count); + } } static inline struct page *virt_to_head_page(const void *x) diff --git a/mm/swap.c b/mm/swap.c index 3f4854205b16..33f5292fe132 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -56,17 +56,93 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; + + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} - dtor = get_compound_page_dtor(page); - (*dtor)(page); +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). + */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +151,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); -- cgit v1.2.3-71-gd317 From 14fd403f2146f740942d78af4e0ee59396ad8eab Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:37 -0800 Subject: thp: export maybe_mkwrite huge_memory.c needs it too when it fallbacks in copying hugepages into regular fragmented pages if hugepage allocation fails during COW. 
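The usual caller pattern (sketched from memory of the existing COW path in mm/memory.c, not from this diff; new_page, vma, mm, address and page_table are assumed from the surrounding fault handler) builds the new pte first and only then conditionally grants write permission:

	/* Sketch: install a freshly copied page, write-enabled only if the vma allows it. */
	pte_t entry = mk_pte(new_page, vma->vm_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	set_pte_at(mm, address, page_table, entry);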
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++++ mm/memory.c | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2718e1ed585..6bef67d74adf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -429,6 +429,19 @@ static inline void set_compound_order(struct page *page, unsigned long order) page[1].lru.prev = (void *)order; } +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of diff --git a/mm/memory.c b/mm/memory.c index 1bbe9a22429c..bdf19366b705 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2083,19 +2083,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* -- cgit v1.2.3-71-gd317 From 8ac1f8320a0073f28cf9e0491af4cd98f504f92a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:43 -0800 Subject: thp: pte alloc trans splitting pte alloc routines must wait for split_huge_page if the pmd is not present and not null (i.e. pmd_trans_splitting). The additional branches are optimized away at compile time by pmd_trans_splitting if the config option is off. However we must pass the vma down in order to know the anon_vma lock to wait for. 
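As a rough caller-side illustration (an assumed fault-path shape, not lifted from the diff), the vma now rides along so __pte_alloc() can wait for a splitting huge pmd instead of racing with it:

	/* Sketch: allocate and map a pte table below a pmd that may be mid-split. */
	pte_t *pte = pte_alloc_map(mm, vma, pmd, address);
	if (!pte)
		return VM_FAULT_OOM;
	/* ... handle the fault, then pte_unmap(pte); */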
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/ia64/mm/hugetlbpage.c | 2 +- arch/sh/mm/hugetlbpage.c | 2 +- arch/sparc/mm/generic_32.c | 2 +- arch/sparc/mm/generic_64.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/um/kernel/skas/mmu.c | 2 +- arch/x86/kernel/tboot.c | 2 +- include/linux/mm.h | 15 +++++++++------ mm/memory.c | 19 +++++++++++++------ mm/mremap.c | 8 +++++--- 11 files changed, 35 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 93292a18cf77..709244c66fa3 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); + new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 1841ee7e65f9..5ca674b74737 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, NULL, pmd, taddr); } return pte; } diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 9163db3e8d15..d7762349ea48 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } } diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c index 5edcac184eaf..e6067b75f11c 100644 --- a/arch/sparc/mm/generic_32.c +++ b/arch/sparc/mm/generic_32.c @@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c index 04f2bf4cd571..3cb00dfd4bd6 100644 --- a/arch/sparc/mm/generic_64.c +++ b/arch/sparc/mm/generic_64.c @@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5fdddf134caa..f4e97646ce23 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } return pte; } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 3d099f974785..1aee587e9c5d 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pmd) goto out_pmd; - pte = pte_alloc_map(mm, pmd, proc); + pte = pte_alloc_map(mm, 
NULL, pmd, proc); if (!pte) goto out_pte; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26141e2..998e972f3b1a 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bef67d74adf..14ddd98b063f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,7 +1131,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1200,16 +1201,18 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map(mm, vma, pmd, address) \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ + pmd, address))? \ + NULL: pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ + pmd, address))? \ NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); diff --git a/mm/memory.c b/mm/memory.c index bdf19366b705..567bca80ea53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? 
*/ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -3253,7 +3260,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); + pte = pte_alloc_map(mm, vma, pmd, address); if (!pte) return VM_FAULT_OOM; diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293a..b09eefaea0b8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -47,7 +47,8 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +63,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -147,7 +149,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; -- cgit v1.2.3-71-gd317 From 91a4ee2670e0ee2b0630a0eb24fb9a87ea3acd0a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: add pmd mmu_notifier helpers Add mmu notifier helpers to handle pmd huge operations. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 43dcfbdc39de..cbfab1e9957d 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -243,6 +243,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __pte; \ }) +#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ +({ \ + pmd_t __pmd; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ + __pmd; \ +}) + +#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ +({ \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + pmdp_splitting_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ +}) + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -254,6 +280,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address 
= __address; \ + __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address); \ + __young; \ +}) + #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ @@ -305,7 +342,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) } #define ptep_clear_flush_young_notify ptep_clear_flush_young +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush +#define pmdp_clear_flush_notify pmdp_clear_flush +#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3-71-gd317 From 4e6af67e970a2ac287739a4c526c857b5bda27ec Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: clear page compound split_huge_page must transform a compound page to a regular page and needs ClearPageCompound. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Reviewed-by: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5a743cc87238..4835cae71047 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -347,7 +347,7 @@ static inline void set_page_writeback(struct page *page) * tests can be used in performance sensitive paths. PageCompound is * generally not used in hot code paths. */ -__PAGEFLAG(Head, head) +__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) __PAGEFLAG(Tail, tail) static inline int PageCompound(struct page *page) @@ -355,6 +355,13 @@ static inline int PageCompound(struct page *page) return page->flags & ((1L << PG_head) | (1L << PG_tail)); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON(!PageHead(page)); + ClearPageHead(page); +} +#endif #else /* * Reduce page flag use as much as possible by overlapping @@ -392,6 +399,14 @@ static inline void __ClearPageTail(struct page *page) page->flags &= ~PG_head_tail_mask; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); + clear_bit(PG_compound, &page->flags); +} +#endif + #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_MMU -- cgit v1.2.3-71-gd317 From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:45 -0800 Subject: thp: add pmd_huge_pte to mm_struct This increase the size of the mm struct a bit but it is needed to preallocate one pte for each hugepage so that split_huge_page will not require a fail path. Guarantee of success is a fundamental property of split_huge_page to avoid decrasing swapping reliability and to avoid adding -ENOMEM fail paths that would otherwise force the hugepage-unaware VM code to learn rolling back in the middle of its pte mangling operations (if something we need it to learn handling pmd_trans_huge natively rather being capable of rollback). When split_huge_page runs a pte is needed to succeed the split, to map the newly splitted regular pages with a regular pte. This way all existing VM code remains backwards compatible by just adding a split_huge_page* one liner. 
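A condensed sketch of what this field enables, paraphrased from later patches in this series rather than taken from this diff: one pte page is preallocated per huge pmd and parked on mm->pmd_huge_pte, and split_huge_page later withdraws it, so the split never has to allocate:

	pgtable_t pgtable;

	/* at hugepage fault time: preallocate, failing the fault early if OOM */
	pgtable = pte_alloc_one(mm, haddr);
	if (unlikely(!pgtable))
		return VM_FAULT_OOM;
	/* ... stash it on mm->pmd_huge_pte under mm->page_table_lock ... */

	/* at split time: take it back, so the split itself cannot fail */
	pgtable = get_pmd_huge_pte(mm);
	pmd_populate(mm, &_pmd, pgtable);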
The memory waste of those preallocated ptes is negligible and so it is worth it. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 3 +++ kernel/fork.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb7288a782fd..26bc4e2cd275 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -309,6 +309,9 @@ struct mm_struct { #endif #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif /* How many tasks sharing this mm are OOM_DISABLE */ atomic_t oom_disable_count; diff --git a/kernel/fork.c b/kernel/fork.c index 1499607e4da2..f78f50ba6cb2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -529,6 +529,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -669,6 +672,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.2.3-71-gd317 From 47ad8475c000141eacb3ecda5e5ce4b43a9cd04d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:47 -0800 Subject: thp: clear_copy_huge_page Move the copy/clear_huge_page functions to common code to share between hugetlb.c and huge_memory.c. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++ mm/hugetlb.c | 70 +++-------------------------------------------------- mm/memory.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 14ddd98b063f..cc6ab1038f6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1589,5 +1589,14 @@ static inline int is_hwpoison_address(unsigned long addr) extern void dump_page(struct page *page); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +extern void clear_huge_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page); +extern void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85855240933d..7bf223d6677b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < 
sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_user_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_user_gigantic_page(dst, src, addr, vma); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } -} - static void copy_gigantic_page(struct page *dst, struct page *src) { int i; @@ -2454,7 +2389,8 @@ retry_avoidcopy: return VM_FAULT_OOM; } - copy_user_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2558,7 +2494,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { diff --git a/mm/memory.c b/mm/memory.c index 567bca80ea53..60e1c68d8218 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3645,3 +3645,74 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -- cgit v1.2.3-71-gd317 From 
936a5fe6e6148c0b3ea0d792b903847d9b9931a1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:48 -0800 Subject: thp: kvm mmu transparent hugepage support This should work for both hugetlbfs and transparent hugepages. [akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability] Signed-off-by: Andrea Arcangeli Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 91 ++++++++++++++++++++++++++++++++++++++-------- arch/x86/kvm/paging_tmpl.h | 9 ++++- include/linux/page-flags.h | 12 ++++++ virt/kvm/kvm_main.c | 32 +++++++++++++++- 4 files changed, 125 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cafbb499813..47b2c3288b6b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level, level, max_level; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return PT_PAGE_TABLE_LEVEL; + return true; + return false; +} + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + int host_level, level, max_level; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) return 1; } +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t *gfnp, pfn_t *pfnp, int *levelp) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *gfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + level == PT_PAGE_TABLE_LEVEL && + PageTransCompound(pfn_to_page(pfn)) && + !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + gfn &= ~mask; + *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + if (!get_page_unless_zero(pfn_to_page(pfn))) + BUG(); + *pfnp = pfn; + } + } +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); @@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, { int r; int level; + int force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable; - level = mapping_level(vcpu, gfn); - - /* - * This path builds a PAE pagetable - so we can map 2mb pages at - * maximum. Therefore check if the level is larger than that. 
- */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; + int force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - level = mapping_level(vcpu, gfn); - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 53210f1e94c2..6bccc24c4181 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; + int force_pt_level; unsigned long mmu_seq; bool map_writable; @@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } - if (walker.level >= PT_DIRECTORY_LEVEL) { + if (walker.level >= PT_DIRECTORY_LEVEL) + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + else + force_pt_level = 1; + if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } @@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); + if (!force_pt_level) + transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn, map_writable, prefault); (void)sptep; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4835cae71047..907f1605926b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page) #endif /* 
!PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int PageTransCompound(struct page *page) +{ + return PageCompound(page); +} +#else +static inline int PageTransCompound(struct page *page) +{ + return 0; +} +#endif + #ifdef CONFIG_MMU #define __PG_MLOCKED (1 << PG_mlocked) #else diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f686251f711..85ab7db0d366 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,8 +104,36 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *page = compound_head(pfn_to_page(pfn)); - return PageReserved(page); + struct page *head; + struct page *tail = pfn_to_page(pfn); + head = compound_head(tail); + if (head != tail) { + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail + * before overwriting first_page, so if + * PageTail is still there it means the head + * pointer isn't dangling. + */ + if (PageTail(tail)) { + /* + * the "head" is not a dangling + * pointer but the hugepage may have + * been splitted from under us (and we + * may not hold a reference count on + * the head page so it can be reused + * before we run PageReferenced), so + * we've to recheck PageTail before + * returning what we just read. + */ + int reserved = PageReserved(head); + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + } + return PageReserved(tail); } return true; -- cgit v1.2.3-71-gd317 From 32dba98e085f8b2b4345887df9abf5e0e93bfc12 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:49 -0800 Subject: thp: _GFP_NO_KSWAPD Transparent hugepage allocations must be allowed not to invoke kswapd or any other kind of indirect reclaim (especially when the defrag sysfs is control disabled). It's unacceptable to swap out anonymous pages (potentially anonymous transparent hugepages) in order to create new transparent hugepages. This is true for the MADV_HUGEPAGE areas too (swapping out a kvm virtual machine and so having it suffer an unbearable slowdown, so another one with guest physical memory marked MADV_HUGEPAGE can run 30% faster if it is running memory intensive workloads, makes no sense). If a transparent hugepage allocation fails the slowdown is minor and there is total fallback, so kswapd should never be asked to swapout memory to allow the high order allocation to succeed. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ++++- mm/page_alloc.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f54adfcbec9c..49d2356bb82d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif +#define ___GFP_NO_KSWAPD 0x400000u /* * GFP bitmasks.. @@ -81,13 +82,15 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). 
*/ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e62d5f9d40b..bbd0423f2820 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2027,7 +2027,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(preferred_zone)); /* -- cgit v1.2.3-71-gd317 From 71e3aac0724ffe8918992d76acfe3aad7d8724a5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:52 -0800 Subject: thp: transparent hugepage core

Lately I've been working to make KVM use hugepages transparently without the usual restrictions of hugetlbfs. Some of the restrictions I'd like to see removed:

1) hugepages have to be swappable, or the guest physical memory remains locked in RAM and can't be paged out to swap

2) if a hugepage allocation fails, regular pages should be allocated instead and mixed in the same vma without any failure and without userland noticing

3) if some task quits and more hugepages become available in the buddy, guest physical memory backed by regular pages should be relocated on hugepages automatically in regions under madvise(MADV_HUGEPAGE) (ideally event driven, by waking up the kernel daemon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes not null)

4) avoidance of reservation and maximization of use of hugepages whenever possible. Reservation (needed to avoid runtime fatal failures) may be ok for 1 machine with 1 database with 1 database cache with 1 database cache size known at boot time. It's definitely not feasible with a virtualization hypervisor usage like RHEV-H that runs an unknown number of virtual machines with an unknown size of each virtual machine, with an unknown amount of pagecache that could be potentially useful in the host for guests not using O_DIRECT (aka cache=off).

hugepages in the virtualization hypervisor (and also in the guest!) are much more important than in a regular host not using virtualization, because with NPT/EPT they decrease the tlb-miss cacheline accesses from 24 to 19 in case only the hypervisor uses transparent hugepages, and from 19 to 15 in case both the Linux hypervisor and the Linux guest use this patch (though the guest will limit the additional speedup to anonymous regions only for now...). Even more important is that the tlb miss handler is much slower on an NPT/EPT guest than for a regular shadow paging or no-virtualization scenario. So maximizing the amount of virtual memory cached by the TLB pays off significantly more with NPT/EPT than without (even if there would be no significant speedup in the tlb-miss runtime).

The first (and more tedious) part of this work requires allowing the VM to handle anonymous hugepages mixed with regular pages transparently on regular anonymous vmas. This is what this patch tries to achieve in the least intrusive possible way. We want hugepages and hugetlb to be used in a way so that all applications can benefit without changes (as usual we leverage the KVM virtualization design: by improving the Linux VM at large, KVM gets the performance boost too).
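For a sense of the userland side, a hedged sketch (not part of this patch; MADV_HUGEPAGE itself is added by a later patch in this series, so the constant is guarded here with the asm-generic value) of how an application opts a region in without any other change:

#include <sys/mman.h>
#include <stdlib.h>
#include <string.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* asm-generic value, added later in the series */
#endif

int main(void)
{
	size_t len = 256UL << 20;
	void *buf;

	if (posix_memalign(&buf, 2UL << 20, len))
		return 1;
	/* hint: back this range with transparent hugepages when possible */
	madvise(buf, len, MADV_HUGEPAGE);
	memset(buf, 0, len);
	return 0;
}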
The most important design choice is: always fall back to 4k allocation if the hugepage allocation fails! This is the _very_ opposite of some large pagecache patches that failed with -EIO back then if a 64k (or similar) allocation failed...

Second important decision (to reduce the impact of the feature on the existing pagetable handling code) is that at any time we can split a hugepage into 512 regular pages, and it has to be done with an operation that can't fail. This way the reliability of the swapping isn't decreased (no need to allocate memory when we are short on memory to swap) and it's trivial to plug a split_huge_page* one-liner where needed without polluting the VM (see the short sketch below). Over time we can teach mprotect, mremap and friends to handle pmd_trans_huge natively without calling split_huge_page*. The fact it can't fail isn't just for swap: if split_huge_page returned -ENOMEM (instead of the current void) we'd need to roll back the mprotect from the middle of it (ideally including undoing the split_vma), which would be a big change and in the very wrong direction (it'd likely be simpler not to call split_huge_page at all and to teach mprotect and friends to handle hugepages instead of rolling them back from the middle). In short, the very value of split_huge_page is that it can't fail.

The collapsing and madvise(MADV_HUGEPAGE) part will remain separated and incremental, and it'll just be a "harmless" addition later if this initial part is agreed upon. It also should be noted that, locking-wise, replacing regular pages with hugepages is going to be very easy compared to what I'm doing below in split_huge_page, as it will only happen when page_count(page) matches page_mapcount(page), if we can take the PG_lock and mmap_sem in write mode. collapse_huge_page will be a "best effort" that (unlike split_huge_page) can fail at the minimal sign of trouble and we can try again later. collapse_huge_page will be similar to how KSM works, and the madvise(MADV_HUGEPAGE) will work similarly to madvise(MADV_MERGEABLE).

The default I like is that transparent hugepages are used at page fault time. This can be changed with /sys/kernel/mm/transparent_hugepage/enabled. The control knob can be set to three values, "always", "madvise", "never", which mean respectively that hugepages are always used, only used inside madvise(MADV_HUGEPAGE) regions, or never used. /sys/kernel/mm/transparent_hugepage/defrag instead controls whether the hugepage allocation should defrag memory aggressively "always", only inside "madvise" regions, or "never".

The pmd_trans_splitting/pmd_trans_huge locking is very solid. The put_page (from get_user_page users that can't use mmu notifier like O_DIRECT) that runs against a __split_huge_page_refcount instead was a pain to serialize in a way that would always result in a coherent page count for both tail and head. I think my locking solution, with a compound_lock taken only after first_page is valid and the page is still a PageHead, should be safe, but it surely needs review from an SMP race point of view. In short there is no existing way to serialize the O_DIRECT final put_page against split_huge_page_refcount, so I had to invent a new one (O_DIRECT loses knowledge of the mapping status by the time gup_fast returns so...). And I didn't want to impact all gup/gup_fast users for now; maybe if we change the gup interface substantially we can avoid this locking, I admit I didn't think too much about it because changing the gup unpinning interface would be invasive.
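To make the "split_huge_page* one-liner" fallback mentioned above concrete, a hedged sketch of how a hugepage-unaware pte walker ends up using it (split_huge_page_pmd is the macro added in huge_mm.h below; with the feature compiled out it expands to a no-op):

	pmd = pmd_offset(pud, address);
	split_huge_page_pmd(mm, pmd);	/* no-op unless *pmd is a huge pmd */
	if (pmd_none_or_clear_bad(pmd))
		return;
	pte = pte_offset_map(pmd, address);
	/* ... the existing 4k pte code continues unchanged ... */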
If we ignored O_DIRECT we could stick to the existing compound refcounting code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM (and any other mmu notifier user) would call it without FOLL_GET (and if FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the current task mmu notifier list yet). But O_DIRECT is fundamental for decent performance of virtualized I/O on fast storage, so we can't avoid it; we have to solve the race of put_page against split_huge_page_refcount to achieve a complete hugepage feature for KVM.

Swap and OOM work fine (well, just like with regular pages ;). MMU notifier is handled transparently too, with the exception of the young bit on the pmd, which didn't have a range check, but I think KVM will be fine because the whole point of hugepages is that EPT/NPT will also use a huge pmd when they notice gup returns pages with PageCompound set, so they won't care about a range and there's just the pmd young bit to check in that case.

NOTE: in some cases, if the L2 cache is small, this may slow down and waste memory during COWs because 4M of memory are accessed in a single fault instead of 8k (the payoff is that after COW the program can run faster). So we might want to switch the copy_huge_page (and clear_huge_page too) to non-temporal stores. I also extensively researched ways to avoid this cache trashing with a full prefault logic that would cow in 8k/16k/32k/64k up to 1M (I can send those patches that fully implemented prefault), but I concluded they're not worth it: they add huge additional complexity and they remove all tlb benefits until the full hugepage has been faulted in, to save a little bit of memory and some cache during app startup, but they still don't improve substantially the cache-trashing during startup if the prefault happens in >4k chunks. One reason is that those 4k pte entries copied are still mapped on a perfectly cache-colored hugepage, so the trashing is the worst one can generate in those copies (cow of 4k page copies aren't so well colored, so they trash less, but again this results in software running faster after the page fault). Those prefault patches allowed things like a pte where post-cow pages were local 4k regular anon pages and the not-yet-cowed pte entries were pointing in the middle of some hugepage mapped read-only. If it doesn't pay off substantially with today's hardware it will pay off even less in the future with larger l2 caches, and the prefault logic would bloat the VM a lot.

On embedded systems, transparent_hugepage can be disabled during boot with sysfs or with the boot command-line parameter transparent_hugepage=0 (or transparent_hugepage=2 to restrict hugepages inside madvise regions); that will ensure not a single hugepage is allocated at boot time. It is simple enough to just disable transparent hugepage globally and let transparent hugepages be allocated selectively by applications in the MADV_HUGEPAGE region (both at page fault time, and if enabled, with the collapse_huge_page too through the kernel daemon).

This patch supports only hugepages mapped in the pmd; archs that have smaller hugepages will not fit in this patch alone. Also some archs like power have certain tlb limits that prevent mixing different page sizes in the same regions, so they will not fit in this framework that requires "graceful fallback" to basic PAGE_SIZE in case of physical memory fragmentation. hugetlbfs remains a perfect fit for those because its software limits happen to match the hardware limits.
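Tying the fault-time default described above back to code, a hedged paraphrase of the hand-off this patch adds to handle_mm_fault in mm/memory.c (that hunk is outside the include/linux excerpt, so treat the details as approximate; the helpers are the ones declared in huge_mm.h below):

	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		if (!vma->vm_ops)	/* anonymous vma: try a huge pmd first */
			return do_huge_pmd_anonymous_page(mm, vma, address,
							  pmd, flags);
	} else {
		pmd_t orig_pmd = *pmd;
		if (pmd_trans_huge(orig_pmd)) {
			if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd) &&
			    !pmd_trans_splitting(orig_pmd))
				return do_huge_pmd_wp_page(mm, vma, address,
							   pmd, orig_pmd);
			return 0;	/* huge pmd already maps the address */
		}
	}
	/* otherwise fall through to the regular 4k pte handling */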
hugetlbfs also remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped to be found not fragmented after a certain system uptime and that would be very expensive to defragment with relocation, so requiring reservation. hugetlbfs is the "reservation way", the point of transparent hugepages is not to have any reservation at all and maximizing the use of cache and hugepages at all times automatically. Some performance result: vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largep ages3 memset page fault 1566023 memset tlb miss 453854 memset second tlb miss 453321 random access tlb miss 41635 random access second tlb miss 41658 vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3 memset page fault 1566471 memset tlb miss 453375 memset second tlb miss 453320 random access tlb miss 41636 random access second tlb miss 41637 vmx andrea # ./largepages3 memset page fault 1566642 memset tlb miss 453417 memset second tlb miss 453313 random access tlb miss 41630 random access second tlb miss 41647 vmx andrea # ./largepages3 memset page fault 1566872 memset tlb miss 453418 memset second tlb miss 453315 random access tlb miss 41618 random access second tlb miss 41659 vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage vmx andrea # ./largepages3 memset page fault 2182476 memset tlb miss 460305 memset second tlb miss 460179 random access tlb miss 44483 random access second tlb miss 44186 vmx andrea # ./largepages3 memset page fault 2182791 memset tlb miss 460742 memset second tlb miss 459962 random access tlb miss 43981 random access second tlb miss 43988 ============ #include #include #include #include #define SIZE (3UL*1024*1024*1024) int main() { char *p = malloc(SIZE), *p2; struct timeval before, after; gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset page fault %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); return 0; } ============ Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 5 + include/linux/gfp.h | 3 + include/linux/huge_mm.h | 118 +++++ include/linux/mm.h | 4 + include/linux/mm_inline.h | 11 +- include/linux/page-flags.h | 21 + include/linux/rmap.h | 2 + include/linux/swap.h | 2 + mm/Makefile | 1 + mm/huge_memory.c | 901 ++++++++++++++++++++++++++++++++++++++ mm/internal.h | 4 + mm/memory.c | 84 +++- mm/rmap.c | 62 ++- mm/swap.c | 37 ++ 14 files changed, 1220 insertions(+), 35 deletions(-) create mode 100644 include/linux/huge_mm.h create mode 100644 mm/huge_memory.c 
(limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1fb61a74b2e1..b2df039a4119 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -286,6 +286,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 49d2356bb82d..d95082cc6f4a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -109,6 +109,9 @@ struct vm_area_struct; __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) +#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h new file mode 100644 index 000000000000..9301824c7491 --- /dev/null +++ b/include/linux/huge_mm.h @@ -0,0 +1,118 @@ +#ifndef _LINUX_HUGE_MM_H +#define _LINUX_HUGE_MM_H + +extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags); +extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pmd_t orig_pmd); +extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); +extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags); +extern int zap_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd); + +enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +#ifdef CONFIG_DEBUG_VM + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, +#endif +}; + +enum page_check_address_pmd_flag { + PAGE_CHECK_ADDRESS_PMD_FLAG, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, +}; +extern pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define HPAGE_PMD_SHIFT HPAGE_SHIFT +#define HPAGE_PMD_MASK HPAGE_MASK +#define HPAGE_PMD_SIZE HPAGE_SIZE + +#define transparent_hugepage_enabled(__vma) \ + (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) +#define transparent_hugepage_defrag(__vma) \ + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE)) +#ifdef CONFIG_DEBUG_VM +#define transparent_hugepage_debug_cow() \ + (transparent_hugepage_flags & \ + (1<root->lock); \ + /* \ + * spin_unlock_wait() is just a loop in C and so the \ + * CPU can reorder anything around it. 
\ + */ \ + smp_mb(); \ + BUG_ON(pmd_trans_splitting(*____pmd) || \ + pmd_trans_huge(*____pmd)); \ + } while (0) +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1< MAX_ORDER +#error "hugepages can't be allocated by the buddy allocator" +#endif +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUG(); 0; }) + +#define transparent_hugepage_enabled(__vma) 0 + +#define transparent_hugepage_flags 0UL +static inline int split_huge_page(struct page *page) +{ + return 0; +} +#define split_huge_page_pmd(__mm, __pmd) \ + do { } while (0) +#define wait_split_huge_page(__anon_vma, __pmd) \ + do { } while (0) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index cc6ab1038f6f..78adec4ba9f4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -111,6 +111,9 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#if BITS_PER_LONG > 32 +#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ +#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -243,6 +246,7 @@ struct inode; * files which need it (119 of them) */ #include +#include /* * Methods to modify the page usage count. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8835b877b8db..650f31eabdb1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -20,13 +20,20 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, + struct list_head *head) { - list_add(&page->lru, &zone->lru[l].list); + list_add(&page->lru, head); __inc_zone_state(zone, NR_LRU_BASE + l); mem_cgroup_add_lru_list(page, l); } +static inline void +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); +} + static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 907f1605926b..4ca1241ef94e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -410,11 +410,32 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * PageHuge() only returns true for hugetlbfs pages, but not for + * normal or transparent huge pages. + * + * PageTransHuge() returns true for both transparent huge and + * hugetlbfs pages, but not normal pages. PageTransHuge() can only be + * called only in the core VM paths where hugetlbfs pages can't exist. 
+ */ +static inline int PageTransHuge(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + return PageHead(page); +} + static inline int PageTransCompound(struct page *page) { return PageCompound(page); } + #else + +static inline int PageTransHuge(struct page *page) +{ + return 0; +} + static inline int PageTransCompound(struct page *page) { return 0; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb83c0da2071..e9fd04ca1e51 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ enum ttu_flags { }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) +bool is_vma_temporary_stack(struct vm_area_struct *vma); + int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); diff --git a/include/linux/swap.h b/include/linux/swap.h index eba53e71d2cc..4d559325d919 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); diff --git a/mm/Makefile b/mm/Makefile index 380772a9ccb8..2b1b575ae712 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 000000000000..0c1e8f939f7c --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,901 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +unsigned long transparent_hugepage_flags __read_mostly = + (1<page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (unlikely(!pages[i])) { + while (--i >= 0) + put_page(pages[i]); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SHIFT*i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_PMD_NR; i++) + put_page(pages[i]); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + else + new_page = NULL; + + if (unlikely(!new_page)) { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + __SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + 
put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + put_page(new_page); + else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)); + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge 
paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. + */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + lru_add_page_tail(zone, page, page_tail); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. 
+ */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. 
+ */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} diff --git a/mm/internal.h b/mm/internal.h index bd4f581f624a..69488205723d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { diff --git a/mm/memory.c b/mm/memory.c index 60e1c68d8218..c50a195041ec 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -726,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -802,6 +802,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -1004,6 
+1014,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1280,11 +1299,27 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -3179,9 +3214,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3260,9 +3295,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, vma, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } diff --git a/mm/rmap.c b/mm/rmap.c index a3197a8a295b..e41375a6b029 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -360,7 +360,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. 
*/ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -435,6 +435,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -489,35 +491,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - /* * Don't want to elevate referenced for mlocked page that gets this far, * in order that it progresses to try_to_unmap and is moved to the * unevictable list. */ if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ + *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; - goto out_unmap; - } - - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!VM_SequentialReadHint(vma))) - referenced++; + goto out; } /* Pretend the page is referenced if the task has the @@ -526,9 +510,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (pmd && !pmd_trans_splitting(*pmd) && + pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. 
+ */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + } + (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -1202,7 +1216,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); diff --git a/mm/swap.c b/mm/swap.c index e0eeef940886..c02f93611a84 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -479,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. -- cgit v1.2.3-71-gd317 From 0af4e98b6b095c74588af04872f83d333c958c32 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: madvise(MADV_HUGEPAGE) Add madvise MADV_HUGEPAGE to mark regions that are important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++++ mm/huge_memory.c | 16 ++++++++++++++++ mm/madvise.c | 8 ++++++++ 3 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9301824c7491..d9ab70d776e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -97,6 +97,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif +extern int hugepage_madvise(unsigned long *vm_flags); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -113,6 +114,11 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +static inline int hugepage_madvise(unsigned long *vm_flags) +{ + BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 763507932898..620891f4e54f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -896,6 +896,22 @@ out: return ret; } +int hugepage_madvise(unsigned long *vm_flags) +{ + /* + * Be somewhat over-protective like KSM for now! 
+ */ + if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + + *vm_flags |= VM_HUGEPAGE; + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db74..ecde40a401c1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,11 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + error = hugepage_madvise(&new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -282,6 +287,9 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: #endif return 1; -- cgit v1.2.3-71-gd317 From 500d65d471018d9a13b0d51b7e141ed2a3555c1d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: pmd_trans_huge migrate bugcheck No pmd_trans_huge should ever materialize in migration ptes areas, because we split the hugepage before migration ptes are instantiated. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 5 +++++ mm/migrate.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 78adec4ba9f4..2ec5138badab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index c1a80e00458d..12ee1ea237f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1305,6 +1305,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto out; } if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } spin_lock(&mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { @@ -1320,6 +1324,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); /* fall through */ } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; diff --git a/mm/migrate.c b/mm/migrate.c index 690d0de993af..1a531b760b3b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; if (!pmd_present(*pmd)) goto out; @@ -632,6 +634,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. 
*/ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; @@ -1063,7 +1068,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) -- cgit v1.2.3-71-gd317 From 79134171df238171daa4c024a42b77b401ccb00b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: transparent hugepage vmstat Add hugepage stat information to /proc/vmstat and /proc/meminfo. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 14 +++++++++++++- include/linux/mmzone.h | 1 + mm/huge_memory.c | 3 +++ mm/rmap.c | 20 ++++++++++++++++---- mm/vmstat.c | 1 + 5 files changed, 34 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239cfd97e..ed257d141568 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -100,6 +100,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "VmallocChunk: %8lu kB\n" #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" #endif , K(i.totalram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -150,6 +158,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) #endif ); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dad3612a7e39..02ecb0189b1d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,7 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + NR_ANON_TRANSPARENT_HUGEPAGES, NR_VM_ZONE_STAT_ITEMS }; /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a313403b3c5e..7101112a5429 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -751,6 +751,9 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/rmap.c b/mm/rmap.c index 92e14dcfe737..3825ae4bc32f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -882,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + 
__inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -911,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -964,7 +972,11 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_update_file_mapped(page, -1); diff --git a/mm/vmstat.c b/mm/vmstat.c index 751a65e00aac..0c3b5048773e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -880,6 +880,7 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", "nr_dirty_threshold", "nr_dirty_background_threshold", -- cgit v1.2.3-71-gd317 From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (this is indipendent of the defrag logic that will have to make new hugepages available) The fundamental reason why khugepaged is unavoidable, is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge-pmd in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow, it takes nearly zero cpu time, except when it copies data (in which case it means we definitely want to pay for that cpu time) so it seems a good tradeoff. In addition to the hugepages being released by other process releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselfs to be long lived mappings (khugepaged scan is slow so short lived mappings have low probability to run into khugepaged if compared to long lived mappings). 
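For context, here is a minimal userspace sketch of how a mapping becomes a collapse
candidate (illustrative only: the region size, the MADV_HUGEPAGE fallback define and
the error handling are assumptions, not part of this series). madvise(MADV_HUGEPAGE)
sets VM_HUGEPAGE on the vma, which is enough for khugepaged_scan_mm_slot() to scan it
even when the sysfs policy is "madvise" rather than "always":

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14		/* asm-generic uapi value; arch headers may differ */
#endif

#define REGION_SIZE (16UL << 20)	/* a multiple of the 2MB pmd size */

int main(void)
{
	/* khugepaged only considers anonymous, non-special vmas */
	void *p = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* mark the region as a hugepage collapse candidate */
	if (madvise(p, REGION_SIZE, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	/* fault it in with small pages; khugepaged may collapse them later */
	memset(p, 0, REGION_SIZE);

	for (;;)
		pause();	/* keep the mapping alive for the scanner */
}

The scan rate is controlled by the sysfs knobs added below
(/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan and
scan_sleep_millisecs); successful collapses show up in pages_collapsed and in the
AnonHugePages counter added to /proc/meminfo earlier in this series.
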
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 66 +++ include/linux/sched.h | 1 + kernel/fork.c | 5 + mm/huge_memory.c | 1073 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 1136 insertions(+), 10 deletions(-) create mode 100644 include/linux/khugepaged.h (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d9ab70d776e2..43a694ef8904 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,6 +25,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 000000000000..552f3184756c --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,66 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if (khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f23b5bb6f52e..d747f948b34e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/fork.c b/kernel/fork.c index f78f50ba6cb2..25e429152ddc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -328,6 +329,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); if (retval) goto out; @@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { diff --git a/mm/huge_memory.c 
b/mm/huge_memory.c index 7101112a5429..ae2bf08b1099 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,14 +12,111 @@ #include #include #include +#include +#include +#include #include #include #include "internal.h" +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ unsigned long transparent_hugepage_flags __read_mostly = - (1< 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + return ret; } static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); @@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = { static struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, - .name = "transparent_hugepage", +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char 
*buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static int __init hugepage_init(void) { -#ifdef CONFIG_SYSFS int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; - err = sysfs_create_group(mm_kobj, &hugepage_attr_group); - if (err) - printk(KERN_ERR "hugepage: register sysfs failed\n"); + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } #endif - return 0; + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + start_khugepaged(); + +out: + return err; } module_init(hugepage_init) @@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; page = alloc_hugepage(transparent_hugepage_defrag(vma)); if (unlikely(!page)) goto out; @@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags) return 0; } +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = 
kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. 
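+	 * (In this series that registration happens from
+	 * do_huge_pmd_anonymous_page(), which calls khugepaged_enter()
+	 * on the first huge page fault in the vma.)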
+ */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. 
+ */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(!*hpage); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. 
+ */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + pte_unmap(pte); + + if (unlikely(!isolated)) { + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + mem_cgroup_uncharge_page(new_page); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + + *hpage = NULL; + khugepaged_pages_collapsed++; +out: + up_write(&mm->mmap_sem); +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + up_read(&mm->mmap_sem); + collapse_huge_page(mm, address, hpage); + } +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. 
+ * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if (!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) { + progress++; + continue; + } + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { + khugepaged_scan.address = vma->vm_end; + progress++; + continue; + } + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) { + progress++; + continue; + } + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + if (khugepaged_scan.address > hend) { + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; + progress++; + continue; + } + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. 
+ */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} + +static void khugepaged_loop(void) +{ + struct page *hpage; + + while (likely(khugepaged_enabled())) { + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; + + khugepaged_do_scan(&hpage); + if (hpage) + put_page(hpage); + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_interruptible(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; -- cgit v1.2.3-71-gd317 From 5f24ce5fd34c3ca1b3d10d30da754732da64d5c0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:00 -0800 Subject: thp: remove PG_buddy PG_buddy can be converted to _mapcount == -2. So the PG_compound_lock can be added to page->flags without overflowing (because of the sparse section bits increasing) with CONFIG_X86_PAE=y and CONFIG_X86_PAT=y. 
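The encoding itself is tiny; condensed from the include/linux/mm.h hunk further down (comments added for illustration), a free buddy page is simply a page whose _mapcount holds the otherwise-unused value -2, with setting/clearing/testing still serialized by zone->lock as before:

static inline int PageBuddy(struct page *page)
{
	/* -1 means "no rmap mappings"; -2 is now the buddy marker */
	return atomic_read(&page->_mapcount) == -2;
}

static inline void __SetPageBuddy(struct page *page)
{
	VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
	atomic_set(&page->_mapcount, -2);
}

static inline void __ClearPageBuddy(struct page *page)
{
	VM_BUG_ON(!PageBuddy(page));
	atomic_set(&page->_mapcount, -1);
}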
This also has to move the memory hotplug code from _mapcount to lru.next to avoid any risk of clashes. We can't use lru.next for PG_buddy removal, but memory hotplug can use lru.next even more easily than the mapcount instead. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 14 ++++++++------ include/linux/memory_hotplug.h | 14 +++++++++----- include/linux/mm.h | 21 +++++++++++++++++++++ include/linux/page-flags.h | 7 +------ mm/memory_hotplug.c | 14 ++++++++------ mm/page_alloc.c | 7 +++---- mm/sparse.c | 4 ++-- 7 files changed, 52 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/page.c b/fs/proc/page.c index b06c674624e6..6d8e6a9e93ab 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a00c48..24376fe7ee68 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,12 +13,16 @@ struct mem_section; #ifdef CONFIG_MEMORY_HOTPLUG /* - * Types for free bootmem. - * The normal smallest mapcount is -1. Here is smaller value than it. + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. */ -#define SECTION_INFO (-1 - 1) -#define MIX_SECTION_INFO (-1 - 2) -#define NODE_INFO (-1 - 3) +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; /* * pgdat resizing functions diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ec5138badab..7ab7d2b60041 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,6 +397,27 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == -2; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON(atomic_read(&page->_mapcount) != -1); + atomic_set(&page->_mapcount, -2); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + atomic_set(&page->_mapcount, -1); +} + void put_page(struct page *page); void put_pages_list(struct list_head *pages); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4ca1241ef94e..0db8037e2725 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -48,9 +48,6 @@ * struct page (these bits with information) are always mapped into kernel * address space... 
* - * PG_buddy is set to indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! @@ -96,7 +93,6 @@ enum pageflags { PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ - PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU @@ -233,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -__PAGEFLAG(Buddy, buddy) PAGEFLAG(MappedToDisk, mappedtodisk) /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ @@ -461,7 +456,7 @@ static inline int PageTransCompound(struct page *page) #define PAGE_FLAGS_CHECK_AT_FREE \ (1 << PG_lru | 1 << PG_locked | \ 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ __PG_COMPOUND_LOCK) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2832c092509..e92f04749fcb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e7664b9f706c..9dfe49bceff4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -449,8 +449,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -483,7 +483,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. 
* At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -5574,7 +5574,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af379..93250207c5cf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); -- cgit v1.2.3-71-gd317 From f2d6bfe9ff0acec30b713614260e78b03d20e909 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:01 -0800 Subject: thp: add x86 32bit support Add support for transparent hugepages to x86 32bit. Share the same VM_ bitflag for VM_MAPPED_COPY. mm/nommu.c will never support transparent hugepages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 9 +++ arch/x86/include/asm/pgtable-3level.h | 23 +++++++ arch/x86/include/asm/pgtable.h | 117 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 109 ------------------------------- arch/x86/mm/pgtable.c | 4 +- include/linux/mm.h | 7 +- mm/Kconfig | 2 +- 7 files changed, 156 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 2334982b339e..98391db840c6 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) +{ + return __pmd(xchg((pmdval_t *)xp, 0)); +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range: diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b0165ea01..94b979d1b58d 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = 
xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3278038e9706..001a3831567a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -97,6 +97,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -145,6 +150,18 @@ static inline int pmd_large(pmd_t pte) (_PAGE_PSE | _PAGE_PRESENT); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); @@ -219,6 +236,55 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. 
@@ -527,6 +593,14 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) return res; } +static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + native_pmd_clear(pmdp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -616,6 +690,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index b2df039a4119..975f709e09ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -182,115 +182,6 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_splitting(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_SPLITTING; -} - -static inline int pmd_trans_huge(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_PSE; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) - -#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS -extern int pmdp_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp, - pmd_t entry, int dirty); - -#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); - - -#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMD_WRITE -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = native_pmdp_get_and_clear(pmdp); - pmd_update(mm, addr, pmdp); - return pmd; -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - clear_bit(_PAGE_BIT_RW, 
(unsigned long *)&pmdp->pmd); - pmd_update(mm, addr, pmdp); -} - -static inline int pmd_young(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_ACCESSED; -} - -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - -static inline pmd_t pmd_mkold(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mkdirty(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_DIRTY); -} - -static inline pmd_t pmd_mkhuge(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_PSE); -} - -static inline pmd_t pmd_mkyoung(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_mkwrite(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT); -} - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 65e92d58f942..500242d3c96d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -362,7 +362,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, - (unsigned long *) &pmdp->pmd); + (unsigned long *)pmdp); if (ret) pmd_update(vma->vm_mm, addr, pmdp); @@ -404,7 +404,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, int set; VM_BUG_ON(address & ~HPAGE_PMD_MASK); set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)&pmdp->pmd); + (unsigned long *)pmdp); if (set) { pmd_update(vma->vm_mm, address, pmdp); /* need tlb flush only to serialize against gup-fast */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ab7d2b60041..9c2695beab86 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#else +#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -111,9 +115,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ -#if BITS_PER_LONG > 32 -#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ -#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/mm/Kconfig b/mm/Kconfig index 3982be2d721a..d774f77538ce 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -304,7 +304,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" if EMBEDDED - depends on X86_64 && MMU + depends on X86 && MMU default y help Transparent Hugepages allows the kernel to use huge pages and 
-- cgit v1.2.3-71-gd317 From 0ca1634d4143c3579273ca53b993df19f5c98e92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:02 -0800 Subject: thp: mincore transparent hugepage support Handle transparent huge page pmd entries natively instead of splitting them into subpages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 25 +++++++++++++++++++++++++ mm/mincore.c | 8 +++++++- 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 43a694ef8904..25125fb6acf7 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd); +extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae2bf08b1099..37e89a32a0b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -923,6 +923,31 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mincore.c b/mm/mincore.c index 9959bb41570e..a4e6b9d75c76 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,7 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else -- cgit v1.2.3-71-gd317 From cd7548ab360c462118568eebb8c6da3bc303b02e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:04 -0800 Subject: thp: mprotect: transparent huge page support Natively handle huge pmds when changing page tables on behalf of mprotect(). I left out update_mmu_cache() because we do not need it on x86 anyway but more importantly the interface works on ptes, not pmds. 
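The pmd side of this is a single new helper; condensed from the mm/huge_memory.c hunk below (locking and TLB flush exactly as in the patch, comments added for illustration), it reprotects a whole huge pmd under page_table_lock, or waits out an in-flight split and reports failure so the caller falls back to the pte path:

int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		    unsigned long addr, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	int ret = 0;

	spin_lock(&mm->page_table_lock);
	if (likely(pmd_trans_huge(*pmd))) {
		if (unlikely(pmd_trans_splitting(*pmd))) {
			/* split in progress: wait, then let the pte loop handle it */
			spin_unlock(&mm->page_table_lock);
			wait_split_huge_page(vma->anon_vma, pmd);
		} else {
			pmd_t entry;

			entry = pmdp_get_and_clear(mm, addr, pmd);
			entry = pmd_modify(entry, newprot);
			set_pmd_at(mm, addr, pmd, entry);
			spin_unlock(&mm->page_table_lock);
			flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
			ret = 1;
		}
	} else
		spin_unlock(&mm->page_table_lock);

	return ret;
}

change_pmd_range() only takes this path when the range covers a full HPAGE_PMD_SIZE; otherwise it splits the huge page first and falls through to the existing pte loop, as the mm/mprotect.c hunk below shows.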
Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 27 +++++++++++++++++++++++++++ mm/mprotect.c | 8 +++++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 25125fb6acf7..c590b08c6fa6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -22,6 +22,8 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); +extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 37e89a32a0b1..7b55fe0e998b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -948,6 +948,33 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, return ret; } +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mprotect.c b/mm/mprotect.c index 9402080d0d4a..5a688a2756be 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -88,7 +88,13 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; change_pte_range(vma->vm_mm, pmd, addr, next, newprot, -- cgit v1.2.3-71-gd317 From 0bbbc0b33d141f78a0d9218a54a47f50621220d3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: add numa awareness to hugepage allocations It's mostly a matter of replacing alloc_pages with alloc_pages_vma after introducing alloc_pages_vma. khugepaged needs special handling as the allocation has to happen inside collapse_huge_page where the vma is known and an error has to be returned to the outer loop to sleep alloc_sleep_millisecs in case of failure. But it retains the more efficient logic of handling allocation failures in khugepaged in case of CONFIG_NUMA=n. 
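The allocation side boils down to one vma-aware wrapper; condensed from the include/linux/gfp.h and mm/huge_memory.c hunks below (comments added for illustration):

static inline gfp_t alloc_hugepage_gfpmask(int defrag)
{
	/* only let the allocator sleep/compact when defrag is enabled */
	return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
}

static inline struct page *alloc_hugepage_vma(int defrag,
					      struct vm_area_struct *vma,
					      unsigned long haddr)
{
	/* honours the vma's mempolicy instead of a node-blind alloc_pages() */
	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
			       HPAGE_PMD_ORDER, vma, haddr);
}

With CONFIG_NUMA, collapse_huge_page() calls this wrapper itself (the vma is known there) and hands ERR_PTR(-ENOMEM) back through *hpage so the outer loop can sleep alloc_sleep_millisecs; with CONFIG_NUMA=n the preallocated *hpage logic shown earlier is kept.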
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 7 +++-- mm/huge_memory.c | 87 +++++++++++++++++++++++++++++++++++++++++++++-------- mm/mempolicy.c | 13 +++++--- 3 files changed, 87 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d95082cc6f4a..a3b148a91874 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(gfp_t gfp_mask, +extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_pages_vma(gfp_mask, order, vma, addr) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0415a83afd66..f6559e7711bd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -620,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return ret; } +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 
0 : __GFP_WAIT), + return alloc_pages(alloc_hugepage_gfpmask(defrag), HPAGE_PMD_ORDER); } +#endif int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -639,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; - page = alloc_hugepage(transparent_hugepage_defrag(vma)); + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -862,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); else new_page = NULL; @@ -1661,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long hstart, hend; VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); +#else + VM_BUG_ON(*hpage); +#endif /* * Prevent all access to pagetables with the exception of @@ -1699,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; +#ifndef CONFIG_NUMA new_page = *hpage; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) +#else + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + *hpage = ERR_PTR(-ENOMEM); goto out; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out_put_page; anon_vma_lock(vma->anon_vma); @@ -1730,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out; + goto out_put_page; } /* @@ -1765,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm, mm->nr_ptes--; spin_unlock(&mm->page_table_lock); +#ifndef CONFIG_NUMA *hpage = NULL; +#endif khugepaged_pages_collapsed++; out: up_write(&mm->mmap_sem); + return; + +out_put_page: +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -2001,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage) while (progress < pages) { cond_resched(); +#ifndef CONFIG_NUMA if (!*hpage) { *hpage = alloc_hugepage(khugepaged_defrag()); if (unlikely(!*hpage)) break; } +#else + if (IS_ERR(*hpage)) + break; +#endif spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) @@ -2020,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage) } } +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA static struct page *khugepaged_alloc_hugepage(void) { struct page *hpage; do { hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); - } + if (!hpage) + khugepaged_alloc_sleep(); } while (unlikely(!hpage) && likely(khugepaged_enabled())); return hpage; } +#endif static void khugepaged_loop(void) { 
struct page *hpage; +#ifdef CONFIG_NUMA + hpage = NULL; +#endif while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); if (unlikely(!hpage)) break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA if (hpage) put_page(hpage); +#endif if (khugepaged_has_work()) { DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 83b7df309fc4..368fc9d23610 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3-71-gd317 From 94fcc585fb85ad7b059c70872489b50044d401f3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:08 -0800 Subject: thp: avoid breaking huge pmd invariants in case of vma_adjust failures An huge pmd can only be mapped if the corresponding 2M virtual range is fully contained in the vma. At times the VM calls split_vma twice, if the first split_vma succeeds and the second fail, the first split_vma remains in effect and it's not rolled back. For split_vma or vma_adjust to fail an allocation failure is needed so it's a very unlikely event (the out of memory killer would normally fire before any allocation failure is visible to kernel and userland and if an out of memory condition happens it's unlikely to happen exactly here). Nevertheless it's safer to ensure that no huge pmd can be left around if the vma is adjusted in a way that can't fit hugepages anymore at the new vm_start/vm_end address. 
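The fix is a hook in vma_adjust(); a condensed sketch of the new check from the mm/huge_memory.c hunk below, shown here only for the new start address (the new end address and, when adjust_next != 0, the next vma's new vm_start get the same treatment):

void __vma_adjust_trans_huge(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end,
			     long adjust_next)
{
	/*
	 * If the new start isn't hugepage aligned but the huge pmd
	 * covering it previously fit entirely inside the vma, split
	 * it now so no huge pmd is left straddling the new boundary.
	 */
	if (start & ~HPAGE_PMD_MASK &&
	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
		split_huge_page_address(vma->vm_mm, start);

	/* ... analogous checks for 'end' and the adjusted next vma ... */
}

The inline vma_adjust_trans_huge() wrapper in huge_mm.h bails out early for vmas that can never hold huge pmds (no anon_vma, or vm_ops/vm_file set), so the common case costs almost nothing.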
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 19 ++++++++++++ mm/huge_memory.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-- mm/mmap.c | 2 ++ 3 files changed, 99 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c590b08c6fa6..827595228734 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,6 +104,19 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #error "hugepages can't be allocated by the buddy allocator" #endif extern int hugepage_madvise(unsigned long *vm_flags); +extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next); +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + return; + __vma_adjust_trans_huge(vma, start, end, adjust_next); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -125,6 +138,12 @@ static inline int hugepage_madvise(unsigned long *vm_flags) BUG(); return 0; } +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 30c3cec82023..b6facc35e893 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1075,8 +1075,16 @@ pmd_t *page_check_address_pmd(struct page *page, goto out; if (pmd_page(*pmd) != page) goto out; - VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && - pmd_trans_splitting(*pmd)); + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. + */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); @@ -2196,3 +2204,71 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) put_page(page); BUG_ON(pmd_trans_huge(*pmd)); } + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. 
+ */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/mmap.c b/mm/mmap.c index 753f44d17047..73cc648873d6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -589,6 +589,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock. This is a fairly rare case by itself, but the anon_vma -- cgit v1.2.3-71-gd317 From 8ee53820edfd1f3b6554c593f337148dd3d7fc91 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: mmu_notifier_test_young For GRU and EPT, we need gup-fast to set referenced bit too (this is why it's correct to return 0 when shadow_access_mask is zero, it requires gup-fast to set the referenced bit). qemu-kvm access already sets the young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow paging EPT minor fault we relay on gup-fast to signal the page is in use... We also need to check the young bits on the secondary pagetables for NPT and not nested shadow mmu as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is full backwards compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page completely isn't so bad either probably but I thought it was worth it and this makes it reliable. 
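The new callback is a purely read-only query; condensed from the mm/mmu_notifier.c hunk below (comment added for illustration), the core helper just polls the registered notifiers without clearing anything or flushing TLBs:

int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address)
{
	struct mmu_notifier *mn;
	struct hlist_node *n;
	int young = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->test_young) {
			/* unlike clear_flush_young, this leaves the accessed bit set */
			young = mn->ops->test_young(mn, mm, address);
			if (young)
				break;
		}
	}
	rcu_read_unlock();

	return young;
}

khugepaged then treats a subpage as referenced if pte_young() is set, if PG_referenced is set (gup-fast now sets it), or if this hook reports activity in a secondary mmu, as the mm/huge_memory.c hunks below show.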
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/gup.c | 3 +++ include/linux/mmu_notifier.h | 26 ++++++++++++++++++++++++++ mm/huge_memory.c | 6 ++++-- mm/mmu_notifier.c | 20 ++++++++++++++++++++ virt/kvm/kvm_main.c | 17 +++++++++++++++++ 7 files changed, 105 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa75f21a9fba..ffd7f8d29187 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -822,6 +822,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 47b2c3288b6b..f02b8edc3d44 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -945,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return young; } +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * If there's no access bit in the secondary pte set by the + * hardware it's up to gup-fast/gup to set the access bit in + * the primary pte or in the page structure. + */ + if (!shadow_accessed_mask) + goto out; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + young = _spte & PT_ACCESSED_MASK; + if (young) { + young = 1; + break; + } + spte = rmap_next(kvm, rmapp, spte); + } +out: + return young; +} + #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -965,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 269aa53932e0..dbe34b931374 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) VM_BUG_ON(page != compound_head(page)); VM_BUG_ON(page_count(page) == 0); atomic_add(nr, &page->_count); + SetPageReferenced(page); } static inline void get_huge_page_tail(struct page *page) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cbfab1e9957d..cc2e7dfea9d7 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -61,6 +61,16 @@ struct mmu_notifier_ops { struct mm_struct *mm, unsigned long address); + /* + * test_young is called to check the young/accessed bitflag in + * the secondary pte. 
This is used to know if the page is + * frequently used without actually clearing the flag or tearing + * down the secondary mapping on the page. + */ + int (*test_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. @@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_young(mm, address); + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -313,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 915809b16edf..39d7df40c067 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1632,7 +1632,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON(PageLRU(page)); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (unlikely(!referenced)) @@ -1892,7 +1893,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* cannot use mapcount: can't collapse if there's a gup pin */ if (page_count(page) != 1) goto out_unmap; - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (referenced) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f2..8d032de4088e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 85ab7db0d366..4286d4766510 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -380,6 +380,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = 
srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -396,6 +412,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; -- cgit v1.2.3-71-gd317 From 5a03b051ed87e72b959f32a86054e1142ac4cf55 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:11 -0800 Subject: thp: use compaction in kswapd for GFP_ATOMIC order > 0 This takes advantage of memory compaction to properly generate pages of order > 0 if regular page reclaim fails and priority level becomes more severe and we don't reach the proper watermarks. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 11 ++++++++--- mm/compaction.c | 31 ++++++++++++++++++++++++++----- mm/vmscan.c | 30 ++++++++++++++++++++---------- 3 files changed, 54 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 72cba4034785..dfa2ed4c0d26 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,6 +11,9 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +#define COMPACT_MODE_DIRECT_RECLAIM 0 +#define COMPACT_MODE_KSWAPD 1 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -25,7 +28,8 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist, bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync); + gfp_t gfp_mask, bool sync, + int compact_mode); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -70,9 +74,10 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync) + gfp_t gfp_mask, bool sync, + int compact_mode) { - return 0; + return COMPACT_CONTINUE; } static inline void defer_compaction(struct zone *zone) diff --git a/mm/compaction.c b/mm/compaction.c index da25b5a2e476..60d52a7298d5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,8 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -382,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -395,12 +397,27 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + if 
(cc->compact_mode != COMPACT_MODE_KSWAPD) + watermark = low_wmark_pages(zone); + else + watermark = high_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; if (cc->order == -1) return COMPACT_CONTINUE; + /* + * Generating only one page of the right order is not enough + * for kswapd, we must continue until we're above the high + * watermark as a pool for high order GFP_ATOMIC allocations + * too. + */ + if (cc->compact_mode == COMPACT_MODE_KSWAPD) + return COMPACT_CONTINUE; + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -514,8 +531,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync) + int order, gfp_t gfp_mask, + bool sync, + int compact_mode) { struct compact_control cc = { .nr_freepages = 0, @@ -524,6 +542,7 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -569,7 +588,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + COMPACT_MODE_DIRECT_RECLAIM); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -600,6 +620,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/vmscan.c b/mm/vmscan.c index cfdef0bcc7ab..f5b762ae23a2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -2382,6 +2383,7 @@ loop_again: * cause too much scanning of the lower zones. 
*/ for (i = 0; i <= end_zone; i++) { + int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2411,9 +2413,26 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + + compaction = 0; + if (order && + zone_watermark_ok(zone, 0, + high_wmark_pages(zone), + end_zone, 0) && + !zone_watermark_ok(zone, order, + high_wmark_pages(zone), + end_zone, 0)) { + compact_zone_order(zone, + order, + sc.gfp_mask, false, + COMPACT_MODE_KSWAPD); + compaction = 1; + } + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (!compaction && nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2424,15 +2443,6 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - /* - * Compact the zone for higher orders to reduce - * latencies for higher-order allocations that - * would ordinarily call try_to_compact_pages() - */ - if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask, - false); - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit v1.2.3-71-gd317 From 2c888cfbc1b45508a44763d85ba2e8ac43faff5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: fix anon memory statistics with transparent hugepages Count each transparent hugepage as HPAGE_PMD_NR pages in the LRU statistics, so the Active(anon) and Inactive(anon) statistics in /proc/meminfo are correct. Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++++++++ include/linux/mm_inline.h | 8 +++++--- mm/huge_memory.c | 10 ++++++++++ mm/memcontrol.c | 2 +- mm/vmscan.c | 11 ++++++----- 5 files changed, 30 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 827595228734..9b48c24df260 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,11 +117,19 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, return; __vma_adjust_trans_huge(vma, start, end, adjust_next); } +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUG(); 0; }) +#define hpage_nr_pages(x) 1 + #define transparent_hugepage_enabled(__vma) 0 #define transparent_hugepage_flags 0UL diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 650f31eabdb1..8f7d24712dc1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? 
* @page: the page to test @@ -24,7 +26,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, struct list_head *head) { list_add(&page->lru, head); - __inc_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -38,7 +40,7 @@ static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -73,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 892d8a17a7e5..f4f6041176a4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1143,6 +1143,7 @@ static void __split_huge_page_refcount(struct page *page) int i; unsigned long head_index = page->index; struct zone *zone = page_zone(page); + int zonestat; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1207,6 +1208,15 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. + */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1bb59d4c9d0..f4ea3410fb4d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1091,7 +1091,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f5b762ae23a2..0882014d2ce5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1103,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1158,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1483,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || 
list_empty(list)) { spin_unlock_irq(&zone->lru_lock); -- cgit v1.2.3-71-gd317 From 37c2ac7872a9387542616f658d20ac25f5bdb32e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: compound_trans_order Read compound_trans_order safe. Noop for CONFIG_TRANSPARENT_HUGEPAGE=n. Signed-off-by: Andrea Arcangeli Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++++++ mm/memcontrol.c | 12 ++++++------ mm/memory-failure.c | 12 ++++++------ 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2695beab86..ce97a2bb0b19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -450,6 +450,20 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f4ea3410fb4d..741206ffdace 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,10 +1027,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; - int page_size = PAGE_SIZE; - - if (PageTransHuge(page)) - page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -2286,8 +2282,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, int ret; int page_size = PAGE_SIZE; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2558,8 +2556,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } count = page_size >> PAGE_SHIFT; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1b43d0ffff65..548fbd70f026 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. 
@@ -930,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -938,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -968,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -1166,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1304,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ -- cgit v1.2.3-71-gd317 From a664b2d8555c659127bf8fe049a58449d394a707 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:17 -0800 Subject: thp: madvise(MADV_NOHUGEPAGE) Add madvise MADV_NOHUGEPAGE to mark regions that are not important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. 
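A minimal userspace sketch of how the new advice value is meant to be used (illustration only, not part of this patch; the region size is arbitrary and the fallback #define assumes the numeric value MADV_NOHUGEPAGE is given in the asm-generic mman headers of this series):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE	15	/* assumed asm-generic value */
#endif

int main(void)
{
	size_t len = 16 * 1024 * 1024;		/* arbitrary anonymous region */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/*
	 * Opt this vma out of transparent hugepages. Per the patch, this
	 * fails with EINVAL on a non-anonymous vma or when THP is not
	 * built in; it never silently reports success in those cases.
	 */
	if (madvise(p, len, MADV_NOHUGEPAGE))
		fprintf(stderr, "madvise(MADV_NOHUGEPAGE): %s\n", strerror(errno));
	munmap(p, len);
	return 0;
}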
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 14 ++++++++------ include/linux/khugepaged.h | 7 ++++--- include/linux/mm.h | 1 + mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++----------- mm/madvise.c | 4 +++- 5 files changed, 46 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9b48c24df260..a8b7e42d19ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -52,10 +52,12 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_SIZE HPAGE_SIZE #define transparent_hugepage_enabled(__vma) \ - (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE))) && \ + !((__vma)->vm_flags & VM_NOHUGEPAGE)) #define transparent_hugepage_defrag(__vma) \ ((transparent_hugepage_flags & \ (1< MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags); +extern int hugepage_madvise(unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -141,7 +143,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags) +static inline int hugepage_madvise(unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 552f3184756c..6b394f0b5148 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -38,9 +38,10 @@ static inline void khugepaged_exit(struct mm_struct *mm) static inline int khugepaged_enter(struct vm_area_struct *vma) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) - if (khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) + if ((khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) && + !(vma->vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index ce97a2bb0b19..956a35532f47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -83,6 +83,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSUP 0x00000200 #else #define VM_GROWSUP 0x00000000 +#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ #endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4f6041176a4..fce667c0281d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1388,18 +1389,36 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags) +int hugepage_madvise(unsigned long *vm_flags, int advice) { - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) - return -EINVAL; - - *vm_flags |= VM_HUGEPAGE; + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! 
+ */ + if (*vm_flags & (VM_HUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + break; + } return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index ecde40a401c1..bbac126e03ed 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -72,7 +72,8 @@ static long madvise_behavior(struct vm_area_struct * vma, goto out; break; case MADV_HUGEPAGE: - error = hugepage_madvise(&new_flags); + case MADV_NOHUGEPAGE: + error = hugepage_madvise(&new_flags, behavior); if (error) goto out; break; @@ -290,6 +291,7 @@ madvise_behavior_valid(int behavior) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: #endif return 1; -- cgit v1.2.3-71-gd317 From 60ab3244ec85c44276c585a2a20d3750402e1cf4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:18 -0800 Subject: thp: khugepaged: make khugepaged aware about madvise MADV_HUGEPAGE and MADV_NOHUGEPAGE were fully effective only if run after mmap and before touching the memory. While this is enough for most usages, it's little effort to make madvise more dynamic at runtime on an existing mapping by making khugepaged aware about madvise. MADV_HUGEPAGE: register in khugepaged immediately without waiting a page fault (that may not ever happen if all pages are already mapped and the "enabled" knob was set to madvise during the initial page faults). MADV_NOHUGEPAGE: skip vmas marked VM_NOHUGEPAGE in khugepaged to stop collapsing pages where not needed. 
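The vma test that khugepaged applies after this patch, in both collapse_huge_page() and khugepaged_scan_mm_slot(), can be read as the predicate below; this is only a paraphrase of the checks in the hunks that follow, and the helper name is hypothetical (no such function is added by the patch):

/*
 * Hypothetical summary of the scan-side checks added below: a vma is
 * considered when THP is enabled for it, either globally ("always") or
 * per-vma via MADV_HUGEPAGE, and it has not been opted out with
 * MADV_NOHUGEPAGE.
 */
static inline bool khugepaged_vma_eligible(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_NOHUGEPAGE)
		return false;
	return khugepaged_always() || (vma->vm_flags & VM_HUGEPAGE);
}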
[akpm@linux-foundation.org: tweak comment] Signed-off-by: Andrea Arcangeli Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++-- mm/huge_memory.c | 23 +++++++++++++++++++---- mm/madvise.c | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8b7e42d19ec..bddfba1d7b85 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -105,7 +105,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags, int advice); +extern int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -143,7 +144,8 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags, int advice) +static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fce667c0281d..004c9c2aac78 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1389,7 +1389,8 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags, int advice) +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -1404,6 +1405,13 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; break; case MADV_NOHUGEPAGE: /* @@ -1417,6 +1425,11 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. 
+ */ break; } @@ -1784,7 +1797,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) goto out; /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ @@ -2007,8 +2021,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, break; } - if (!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) { + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { progress++; continue; } diff --git a/mm/madvise.c b/mm/madvise.c index bbac126e03ed..2221491ed503 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -73,7 +73,7 @@ static long madvise_behavior(struct vm_area_struct * vma, break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: - error = hugepage_madvise(&new_flags, behavior); + error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; -- cgit v1.2.3-71-gd317 From 22e5c47ee238abe636655c3862ed28d6eb084ad4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:20 -0800 Subject: thp: add compound_trans_head() helper Cleanup some code with common compound_trans_head helper. Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Johannes Weiner Cc: Marcelo Tosatti Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 18 ++++++++++++++++++ mm/ksm.c | 15 +++------------ virt/kvm/kvm_main.c | 38 ++++++++++++++------------------------ 3 files changed, 35 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bddfba1d7b85..8e6c8c42bc3c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -126,6 +126,23 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } +static inline struct page *compound_trans_head(struct page *page) +{ + if (PageTail(page)) { + struct page *head; + head = page->first_page; + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail before + * overwriting first_page, so if PageTail is still + * there it means the head pointer isn't dangling. + */ + if (PageTail(page)) + return head; + } + return page; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -144,6 +161,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +#define compound_trans_head(page) compound_head(page) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { diff --git a/mm/ksm.c b/mm/ksm.c index 4d5a681923bb..33781de0b6bf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -415,20 +415,11 @@ out: static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head; - head = compound_head(page); + struct page *head = compound_trans_head(page); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * head may actually be splitted and freed from under + * us but it's ok here. 
*/ - if (head != page) { - smp_rmb(); - if (!PageTransCompound(page)) - return NULL; - } if (PageAnon(head)) return head; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4286d4766510..f29abeb6a912 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,34 +104,24 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *head; + int reserved; struct page *tail = pfn_to_page(pfn); - head = compound_head(tail); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); if (head != tail) { - smp_rmb(); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. */ - if (PageTail(tail)) { - /* - * the "head" is not a dangling - * pointer but the hugepage may have - * been splitted from under us (and we - * may not hold a reference count on - * the head page so it can be reused - * before we run PageReferenced), so - * we've to recheck PageTail before - * returning what we just read. - */ - int reserved = PageReserved(head); - smp_rmb(); - if (PageTail(tail)) - return reserved; - } + smp_rmb(); + if (PageTail(tail)) + return reserved; } return PageReserved(tail); } -- cgit v1.2.3-71-gd317 From 29c1f677d424e8c5683a837fc4f03fc9f19201d7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:47:21 -0800 Subject: mm: migration: use rcu_dereference_protected when dereferencing the radix tree slot during file page migration migrate_pages() -> unmap_and_move() only calls rcu_read_lock() for anonymous pages, as introduced by git commit 989f89c57e6361e7d16fbd9572b5da7d313b073d ("fix rcu_read_lock() in page migraton"). The point of the RCU protection there is part of getting a stable reference to anon_vma and is only held for anon pages as file pages are locked which is sufficient protection against freeing. However, while a file page's mapping is being migrated, the radix tree is double checked to ensure it is the expected page. This uses radix_tree_deref_slot() -> rcu_dereference() without the RCU lock held triggering the following warning. [ 173.674290] =================================================== [ 173.676016] [ INFO: suspicious rcu_dereference_check() usage. ] [ 173.676016] --------------------------------------------------- [ 173.676016] include/linux/radix-tree.h:145 invoked rcu_dereference_check() without protection! [ 173.676016] [ 173.676016] other info that might help us debug this: [ 173.676016] [ 173.676016] [ 173.676016] rcu_scheduler_active = 1, debug_locks = 0 [ 173.676016] 1 lock held by hugeadm/2899: [ 173.676016] #0: (&(&inode->i_data.tree_lock)->rlock){..-.-.}, at: [] migrate_page_move_mapping+0x40/0x1ab [ 173.676016] [ 173.676016] stack backtrace: [ 173.676016] Pid: 2899, comm: hugeadm Not tainted 2.6.37-rc5-autobuild [ 173.676016] Call Trace: [ 173.676016] [] ? printk+0x14/0x1b [ 173.676016] [] lockdep_rcu_dereference+0x7d/0x86 [ 173.676016] [] migrate_page_move_mapping+0xca/0x1ab [ 173.676016] [] migrate_page+0x23/0x39 [ 173.676016] [] buffer_migrate_page+0x22/0x107 [ 173.676016] [] ? 
buffer_migrate_page+0x0/0x107 [ 173.676016] [] move_to_new_page+0x9a/0x1ae [ 173.676016] [] migrate_pages+0x1e7/0x2fa This patch introduces radix_tree_deref_slot_protected() which calls rcu_dereference_protected(). Users of it must pass in the mapping->tree_lock that is protecting this dereference. Holding the tree lock protects against parallel updaters of the radix tree meaning that rcu_dereference_protected is allowable. [akpm@linux-foundation.org: remove unneeded casts] Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Milton Miller Cc: Nick Piggin Cc: Wu Fengguang Cc: [2.6.37.early] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ mm/migrate.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ab2baa5c4884..23241c2fecce 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -145,6 +145,22 @@ static inline void *radix_tree_deref_slot(void **pslot) return rcu_dereference(*pslot); } +/** + * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held + * @pslot: pointer to slot, returned by radix_tree_lookup_slot + * Returns: item that was stored in that slot with any direct pointer flag + * removed. + * + * Similar to radix_tree_deref_slot but only used during migration when a pages + * mapping is being moved. The caller does not hold the RCU read lock but it + * must hold the tree lock to prevent parallel updates. + */ +static inline void *radix_tree_deref_slot_protected(void **pslot, + spinlock_t *treelock) +{ + return rcu_dereference_protected(*pslot, lockdep_is_held(treelock)); +} + /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot diff --git a/mm/migrate.c b/mm/migrate.c index 1a531b760b3b..89a6bc8cd307 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -320,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } -- cgit v1.2.3-71-gd317 From db16d5ec1f87f17511599bc77857dd1662b5a22f Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:35 -0800 Subject: memcg: add page_cgroup flags for dirty page tracking This patchset provides the ability for each cgroup to have independent dirty page limits. Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim) page cache used by a cgroup. So, in case of multiple cgroup writers, they will not be able to consume more than their designated share of dirty pages and will be forced to perform write-out if they cross that limit. The patches are based on a series proposed by Andrea Righi in Mar 2010. Overview: - Add page_cgroup flags to record when pages are dirty, in writeback, or nfs unstable. 
- Extend mem_cgroup to record the total number of pages in each of the interesting dirty states (dirty, writeback, unstable_nfs). - Add dirty parameters similar to the system-wide /proc/sys/vm/dirty_* limits to mem_cgroup. The mem_cgroup dirty parameters are accessible via cgroupfs control files. - Consider both system and per-memcg dirty limits in page writeback when deciding to queue background writeback or block for foreground writeback. Known shortcomings: - When a cgroup dirty limit is exceeded, then bdi writeback is employed to writeback dirty inodes. Bdi writeback considers inodes from any cgroup, not just inodes contributing dirty pages to the cgroup exceeding its limit. - When memory.use_hierarchy is set, then dirty limits are disabled. This is a implementation detail. An enhanced implementation is needed to check the chain of parents to ensure that no dirty limit is exceeded. Performance data: - A page fault microbenchmark workload was used to measure performance, which can be called in read or write mode: f = open(foo. $cpu) truncate(f, 4096) alarm(60) while (1) { p = mmap(f, 4096) if (write) *p = 1 else x = *p munmap(p) } - The workload was called for several points in the patch series in different modes: - s_read is a single threaded reader - s_write is a single threaded writer - p_read is a 16 thread reader, each operating on a different file - p_write is a 16 thread writer, each operating on a different file - Measurements were collected on a 16 core non-numa system using "perf stat --repeat 3". The -a option was used for parallel (p_*) runs. - All numbers are page fault rate (M/sec). Higher is better. - To compare the performance of a kernel without non-memcg compare the first and last rows, neither has memcg configured. The first row does not include any of these memcg patches. - To compare the performance of using memcg dirty limits, compare the baseline (2nd row titled "w/ memcg") with the the code and memcg enabled (2nd to last row titled "all patches"). root_cgroup child_cgroup s_read s_write p_read p_write s_read s_write p_read p_write mmotm w/o memcg 0.428 0.390 0.429 0.388 mmotm w/ memcg 0.411 0.378 0.391 0.362 0.412 0.377 0.385 0.363 all patches 0.384 0.360 0.370 0.348 0.381 0.363 0.368 0.347 all patches 0.431 0.402 0.427 0.395 w/o memcg This patch: Add additional flags to page_cgroup to track dirty pages within a mem_cgroup. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrea Righi Signed-off-by: Greg Thelen Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b02195dfc1b0..fdb5a92b5ac7 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -40,6 +40,9 @@ enum { PCG_USED, /* this object is in use. 
*/ PCG_ACCT_LRU, /* page has been accounted for */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + PCG_FILE_DIRTY, /* page is dirty */ + PCG_FILE_WRITEBACK, /* page is under writeback */ + PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ PCG_MIGRATION, /* under page migration */ }; @@ -59,6 +62,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } +#define TESTSETPCGFLAG(uname, lname) \ +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ + { return test_and_set_bit(PCG_##lname, &pc->flags); } + /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -78,6 +85,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) +SETPCGFLAG(FileDirty, FILE_DIRTY) +CLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTPCGFLAG(FileDirty, FILE_DIRTY) +TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTSETPCGFLAG(FileDirty, FILE_DIRTY) + +SETPCGFLAG(FileWriteback, FILE_WRITEBACK) +CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) +TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) + +SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) + SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) -- cgit v1.2.3-71-gd317 From 2a7106f2cb0768d00fe8c1eb42a754a7d8518f08 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:37 -0800 Subject: memcg: create extensible page stat update routines Replace usage of the mem_cgroup_update_file_mapped() memcg statistic update routine with two new routines: * mem_cgroup_inc_page_stat() * mem_cgroup_dec_page_stat() As before, only the file_mapped statistic is managed. However, these more general interfaces allow for new statistics to be more easily added. New statistics are added with memcg dirty page accounting. Signed-off-by: Greg Thelen Signed-off-by: Andrea Righi Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 16 +++++++--------- mm/rmap.c | 4 ++-- 3 files changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a0762aeaf..067115ce6b3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. 
*/ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -293,8 +313,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 741206ffdace..3d8a0c79dece 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1600,7 +1600,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1623,30 +1624,27 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) unlock_page_cgroup(pc); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. 
diff --git a/mm/rmap.c b/mm/rmap.c index c30f33854f97..f21f4a1d6a1c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -937,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -979,7 +979,7 @@ void page_remove_rmap(struct page *page) NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.2.3-71-gd317 From dbd4ea78f002df283c95d9774837041735fa1bf9 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 13 Jan 2011 15:47:38 -0800 Subject: memcg: add lock to synchronize page accounting and migration Introduce a new bit spin lock, PCG_MOVE_LOCK, to synchronize the page accounting and migration code. This reworks the locking scheme of _update_stat() and _move_account() by adding new lock bit PCG_MOVE_LOCK, which is always taken under IRQ disable. 1. If pages are being migrated from a memcg, then updates to that memcg page statistics are protected by grabbing PCG_MOVE_LOCK using move_lock_page_cgroup(). In an upcoming commit, memcg dirty page accounting will be updating memcg page accounting (specifically: num writeback pages) from IRQ context (softirq). Avoid a deadlocking nested spin lock attempt by disabling irq on the local processor when grabbing the PCG_MOVE_LOCK. 2. lock for update_page_stat is used only for avoiding race with move_account(). So, IRQ awareness of lock_page_cgroup() itself is not a problem. The problem is between mem_cgroup_update_page_stat() and mem_cgroup_move_account_page(). Trade-off: * Changing lock_page_cgroup() to always disable IRQ (or local_bh) has some impacts on performance and I think it's bad to disable IRQ when it's not necessary. * adding a new lock makes move_account() slower. Score is here. Performance Impact: moving a 8G anon process. Before: real 0m0.792s user 0m0.000s sys 0m0.780s After: real 0m0.854s user 0m0.000s sys 0m0.842s This score is bad but planned patches for optimization can reduce this impact. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Greg Thelen Reviewed-by: Minchan Kim Acked-by: Daisuke Nishimura Cc: Andrea Righi Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 9 +++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index fdb5a92b5ac7..5b0c971d7cae 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,15 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. 
following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_FILE_DIRTY, /* page is dirty */ PCG_FILE_WRITEBACK, /* page is under writeback */ PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ - PCG_MIGRATION, /* under page migration */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -117,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -130,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. + */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d8a0c79dece..d888956a2cfc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1606,6 +1606,7 @@ void mem_cgroup_update_page_stat(struct page *page, struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1617,7 +1618,7 @@ void mem_cgroup_update_page_stat(struct page *page, /* pc->mem_cgroup is unstable ? */ if (unlikely(mem_cgroup_stealed(mem))) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) @@ -1640,7 +1641,7 @@ void mem_cgroup_update_page_stat(struct page *page, out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } @@ -2211,9 +2212,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; + unsigned long flags; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + move_lock_page_cgroup(pc, &flags); __mem_cgroup_move_account(pc, from, to, uncharge); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); -- cgit v1.2.3-71-gd317 From 50de1dd967d4ba3b8a90ebe7a4f5feca24191317 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:43 -0800 Subject: memcg: fix memory migration of shmem swapcache In the current implementation mem_cgroup_end_migration() decides whether the page migration has succeeded or not by checking "oldpage->mapping". But if we are trying to migrate a shmem swapcache, the page->mapping of it is NULL from the beginning, so the check would be invalid. As a result, mem_cgroup_end_migration() assumes the migration has succeeded even if it's not, so "newpage" would be freed while it's not uncharged. This patch fixes it by passing mem_cgroup_end_migration() the result of the page migration.
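In short, an explicit flag from the caller replaces the oldpage->mapping heuristic. A condensed sketch of the selection mem_cgroup_end_migration() performs after this patch (taken from the mm/memcontrol.c hunk below; unmap_and_move() passes rc == 0 as migration_ok):

	/*
	 * Condensed from the hunk below: with the explicit flag, a shmem
	 * swapcache page whose ->mapping is always NULL can no longer make
	 * a failed migration look successful, which used to free newpage
	 * without uncharging it.
	 */
	if (!migration_ok) {
		used = oldpage;		/* move failed: old page stays charged */
		unused = newpage;	/* new page is uncharged and freed */
	} else {
		used = newpage;
		unused = oldpage;
	}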
Signed-off-by: Daisuke Nishimura Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Minchan Kim Reviewed-by: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- mm/memcontrol.c | 5 ++--- mm/migrate.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 067115ce6b3e..6a576f989437 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim. @@ -251,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6424ba0fce83..8ab841031436 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2896,7 +2896,7 @@ int mem_cgroup_prepare_migration(struct page *page, /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2905,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 5b7d1fd29621..46fe8cc13d67 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -768,7 +768,7 @@ skip_unmap: uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); -- cgit v1.2.3-71-gd317 From 9ce137eee4febaabca81143be07d4205d2bd52d4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 11 Jan 2011 14:07:12 -0500 Subject: nfsd: don't support msnfs export option We've long had these pointless #ifdef MSNFS's sprinkled throughout the code--pointless because MSNFS is always defined (and we give no config option to make that easy to change). So we could just remove the ifdef's and compile the resulting code unconditionally. But as long as we're there: why not just rip out this code entirely? The only purpose is to implement the "msnfs" export option which turns on Windows-like behavior in some cases, and: - the export option isn't documented anywhere; - the userland utilities (which would need to be able to parse "msnfs" in an export file) don't support it; - I don't know how to maintain this, as I don't know what the proper behavior is; and - google shows no evidence that anyone has ever used this. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/export.c | 4 ---- fs/nfsd/vfs.c | 41 ++--------------------------------------- include/linux/nfsd/export.h | 2 +- 3 files changed, 3 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c0fcb7ab7f6d..8b31e5f8795d 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * NFS exporting and validation. * @@ -1444,9 +1443,6 @@ static struct flags { { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, -#ifdef MSNFS - { NFSEXP_MSNFS, {"msnfs", ""}}, -#endif { 0, {"", ""}} }; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b991125ce4a5..0a01e2fc5dda 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * File operations used by nfsd. Some of these have been ripped from * other parts of the kernel because they weren't exported, others @@ -875,15 +874,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static inline int svc_msnfs(struct svc_fh *ffhp) -{ -#ifdef MSNFS - return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS); -#else - return 0; -#endif -} - static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) @@ -896,9 +886,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; inode = file->f_path.dentry->d_inode; - if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) - goto out; - if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { .len = 0, @@ -923,7 +910,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, fsnotify_access(file); } else err = nfserrno(host_err); -out: return err; } @@ -988,14 +974,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; -#ifdef MSNFS - err = nfserr_perm; - - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) - goto out; -#endif - dentry = file->f_path.dentry; inode = dentry->d_inode; exp = fhp->fh_export; @@ -1046,7 +1024,6 @@ out_nfserr: err = 0; else err = nfserrno(host_err); -out: return err; } @@ -1751,13 +1728,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ndentry == trap) goto out_dput_new; - if (svc_msnfs(ffhp) && - ((atomic_read(&odentry->d_count) > 1) - || (atomic_read(&ndentry->d_count) > 1))) { - host_err = -EPERM; - goto out_dput_new; - } - host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; @@ -1836,17 +1806,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (host_err) goto out_nfserr; - if (type != S_IFDIR) { /* It's UNLINK */ -#ifdef MSNFS - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (atomic_read(&rdentry->d_count) > 1)) { - host_err = -EPERM; - } else -#endif + if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); - } else { /* It's RMDIR */ + else host_err = vfs_rmdir(dirp, rdentry); - } dput(rdentry); diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 8ae78a61eea4..bd316159278c 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -35,7 +35,7 @@ #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 
#define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ -#define NFSEXP_MSNFS 0x1000 /* do silly things that MS clients expect */ +#define NFSEXP_MSNFS 0x1000 /* do silly things that MS clients expect; no longer supported */ #define NFSEXP_FSID 0x2000 #define NFSEXP_CROSSMOUNT 0x4000 #define NFSEXP_NOACL 0x8000 /* reserved for possible ACL related use */ -- cgit v1.2.3-71-gd317 From 2c6755988afc003a0332406a134fb6a1626f9b28 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 14 Jan 2011 02:36:43 +0000 Subject: fs: hlist UP debug fixup Po-Yu Chuang noticed that hlist_bl_set_first could crash on a UP system when LIST_BL_LOCKMASK is 0, because LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); always evaulates to true. Fix the expression, and also avoid a dependency between bit spinlock implementation and list bl code (list code shouldn't know anything except that bit 0 is set when adding and removing elements). Eventually if a good use case comes up, we might use this list to store 1 or more arbitrary bits of data, so it really shouldn't be tied to locking either, but for now they are helpful for debugging. Signed-off-by: Nick Piggin --- include/linux/list_bl.h | 5 +++-- include/linux/rculist_bl.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 9ee97e7f2be4..b2adbb4b2f73 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -16,7 +16,7 @@ * some fast and compact auxiliary data. */ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#if defined(CONFIG_SMP) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL @@ -62,7 +62,8 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); - LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != + LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index b872b493724d..cf1244fbf3b6 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -11,7 +11,8 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); - LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != + LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -- cgit v1.2.3-71-gd317 From 412dc11d3fd01f96fdf4a8cbfbc5584a17dab7c8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Nov 2010 18:01:41 +0000 Subject: mfd: Add WM8326 support The WM8326 is a high performance variant of the WM832x series with no software visible differences. 
Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm831x-core.c | 7 +++++++ drivers/mfd/wm831x-i2c.c | 1 + drivers/mfd/wm831x-spi.c | 18 ++++++++++++++++++ include/linux/mfd/wm831x/core.h | 1 + 4 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c index d1b2d534b991..3fe9a58fe6c7 100644 --- a/drivers/mfd/wm831x-core.c +++ b/drivers/mfd/wm831x-core.c @@ -1541,6 +1541,12 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq) dev_info(wm831x->dev, "WM8325 revision %c\n", 'A' + rev); break; + case WM8326: + parent = WM8326; + wm831x->num_gpio = 12; + dev_info(wm831x->dev, "WM8326 revision %c\n", 'A' + rev); + break; + default: dev_err(wm831x->dev, "Unknown WM831x device %04x\n", ret); ret = -EINVAL; @@ -1612,6 +1618,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq) case WM8320: case WM8321: case WM8325: + case WM8326: ret = mfd_add_devices(wm831x->dev, -1, wm8320_devs, ARRAY_SIZE(wm8320_devs), NULL, wm831x->irq_base); diff --git a/drivers/mfd/wm831x-i2c.c b/drivers/mfd/wm831x-i2c.c index 156b19859e81..38be5201c783 100644 --- a/drivers/mfd/wm831x-i2c.c +++ b/drivers/mfd/wm831x-i2c.c @@ -108,6 +108,7 @@ static const struct i2c_device_id wm831x_i2c_id[] = { { "wm8320", WM8320 }, { "wm8321", WM8321 }, { "wm8325", WM8325 }, + { "wm8326", WM8326 }, { } }; MODULE_DEVICE_TABLE(i2c, wm831x_i2c_id); diff --git a/drivers/mfd/wm831x-spi.c b/drivers/mfd/wm831x-spi.c index 2789b151b0f9..0a8f772be88c 100644 --- a/drivers/mfd/wm831x-spi.c +++ b/drivers/mfd/wm831x-spi.c @@ -81,6 +81,8 @@ static int __devinit wm831x_spi_probe(struct spi_device *spi) type = WM8321; else if (strcmp(spi->modalias, "wm8325") == 0) type = WM8325; + else if (strcmp(spi->modalias, "wm8326") == 0) + type = WM8326; else { dev_err(&spi->dev, "Unknown device type\n"); return -EINVAL; @@ -184,6 +186,17 @@ static struct spi_driver wm8325_spi_driver = { .suspend = wm831x_spi_suspend, }; +static struct spi_driver wm8326_spi_driver = { + .driver = { + .name = "wm8326", + .bus = &spi_bus_type, + .owner = THIS_MODULE, + }, + .probe = wm831x_spi_probe, + .remove = __devexit_p(wm831x_spi_remove), + .suspend = wm831x_spi_suspend, +}; + static int __init wm831x_spi_init(void) { int ret; @@ -212,12 +225,17 @@ static int __init wm831x_spi_init(void) if (ret != 0) pr_err("Failed to register WM8325 SPI driver: %d\n", ret); + ret = spi_register_driver(&wm8326_spi_driver); + if (ret != 0) + pr_err("Failed to register WM8326 SPI driver: %d\n", ret); + return 0; } subsys_initcall(wm831x_spi_init); static void __exit wm831x_spi_exit(void) { + spi_unregister_driver(&wm8326_spi_driver); spi_unregister_driver(&wm8325_spi_driver); spi_unregister_driver(&wm8321_spi_driver); spi_unregister_driver(&wm8320_spi_driver); diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h index a1239c48b41a..903280d21866 100644 --- a/include/linux/mfd/wm831x/core.h +++ b/include/linux/mfd/wm831x/core.h @@ -245,6 +245,7 @@ enum wm831x_parent { WM8320 = 0x8320, WM8321 = 0x8321, WM8325 = 0x8325, + WM8326 = 0x8326, }; struct wm831x { -- cgit v1.2.3-71-gd317 From 4c90aa94f6b3e33f57faaf19ef9819195dff61d3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 26 Nov 2010 17:19:34 +0000 Subject: mfd: Provide pm_runtime_no_callbacks flag in cell data Allow MFD cells to have pm_runtime_no_callbacks() called on them during registration. 
This causes the runtime PM framework to ignore them, allowing use of runtime PM to suspend the device as a whole even if not all drivers for the MFD can usefully implement runtime PM. For example, RTCs are likely to run continuously regardless of the power state of the system. Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/mfd-core.c | 4 ++++ include/linux/mfd/core.h | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c index ec99f681e773..d83ad0f141af 100644 --- a/drivers/mfd/mfd-core.c +++ b/drivers/mfd/mfd-core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include static int mfd_add_device(struct device *parent, int id, @@ -82,6 +83,9 @@ static int mfd_add_device(struct device *parent, int id, if (ret) goto fail_res; + if (cell->pm_runtime_no_callbacks) + pm_runtime_no_callbacks(&pdev->dev); + kfree(res); return 0; diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index 5582ab3d3e48..835996e167e1 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -47,6 +47,12 @@ struct mfd_cell { /* don't check for resource conflicts */ bool ignore_resource_conflicts; + + /* + * Disable runtime PM callbacks for this subdevice - see + * pm_runtime_no_callbacks(). + */ + bool pm_runtime_no_callbacks; }; extern int mfd_add_devices(struct device *parent, int id, -- cgit v1.2.3-71-gd317 From e098aded79f24e2024139e82f778ff9db6dc142a Mon Sep 17 00:00:00 2001 From: Mattias Wallin Date: Thu, 2 Dec 2010 15:40:31 +0100 Subject: mfd: ab8500-core ioresources irq for subdrivers added This patch adds the ioresources used by subdrivers to retrieve their interrupt. Signed-off-by: Mattias Wallin Signed-off-by: Samuel Ortiz --- drivers/mfd/ab8500-core.c | 206 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/mfd/ab8500.h | 4 +- 2 files changed, 201 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c index e464e94c4bc0..7f01a3a1d10f 100644 --- a/drivers/mfd/ab8500-core.c +++ b/drivers/mfd/ab8500-core.c @@ -397,12 +397,188 @@ static struct resource ab8500_poweronkey_db_resources[] = { }, }; +static struct resource ab8500_bm_resources[] = { + { + .name = "MAIN_EXT_CH_NOT_OK", + .start = AB8500_INT_MAIN_EXT_CH_NOT_OK, + .end = AB8500_INT_MAIN_EXT_CH_NOT_OK, + .flags = IORESOURCE_IRQ, + }, + { + .name = "BATT_OVV", + .start = AB8500_INT_BATT_OVV, + .end = AB8500_INT_BATT_OVV, + .flags = IORESOURCE_IRQ, + }, + { + .name = "MAIN_CH_UNPLUG_DET", + .start = AB8500_INT_MAIN_CH_UNPLUG_DET, + .end = AB8500_INT_MAIN_CH_UNPLUG_DET, + .flags = IORESOURCE_IRQ, + }, + { + .name = "MAIN_CHARGE_PLUG_DET", + .start = AB8500_INT_MAIN_CH_PLUG_DET, + .end = AB8500_INT_MAIN_CH_PLUG_DET, + .flags = IORESOURCE_IRQ, + }, + { + .name = "VBUS_DET_F", + .start = AB8500_INT_VBUS_DET_F, + .end = AB8500_INT_VBUS_DET_F, + .flags = IORESOURCE_IRQ, + }, + { + .name = "VBUS_DET_R", + .start = AB8500_INT_VBUS_DET_R, + .end = AB8500_INT_VBUS_DET_R, + .flags = IORESOURCE_IRQ, + }, + { + .name = "BAT_CTRL_INDB", + .start = AB8500_INT_BAT_CTRL_INDB, + .end = AB8500_INT_BAT_CTRL_INDB, + .flags = IORESOURCE_IRQ, + }, + { + .name = "CH_WD_EXP", + .start = AB8500_INT_CH_WD_EXP, + .end = AB8500_INT_CH_WD_EXP, + .flags = IORESOURCE_IRQ, + }, + { + .name = "VBUS_OVV", + .start = AB8500_INT_VBUS_OVV, + .end = AB8500_INT_VBUS_OVV, + .flags = IORESOURCE_IRQ, + }, + { + .name = "NCONV_ACCU", + .start = AB8500_INT_CCN_CONV_ACC, + .end = 
AB8500_INT_CCN_CONV_ACC, + .flags = IORESOURCE_IRQ, + }, + { + .name = "LOW_BAT_F", + .start = AB8500_INT_LOW_BAT_F, + .end = AB8500_INT_LOW_BAT_F, + .flags = IORESOURCE_IRQ, + }, + { + .name = "LOW_BAT_R", + .start = AB8500_INT_LOW_BAT_R, + .end = AB8500_INT_LOW_BAT_R, + .flags = IORESOURCE_IRQ, + }, + { + .name = "BTEMP_LOW", + .start = AB8500_INT_BTEMP_LOW, + .end = AB8500_INT_BTEMP_LOW, + .flags = IORESOURCE_IRQ, + }, + { + .name = "BTEMP_HIGH", + .start = AB8500_INT_BTEMP_HIGH, + .end = AB8500_INT_BTEMP_HIGH, + .flags = IORESOURCE_IRQ, + }, + { + .name = "USB_CHARGER_NOT_OKR", + .start = AB8500_INT_USB_CHARGER_NOT_OK, + .end = AB8500_INT_USB_CHARGER_NOT_OK, + .flags = IORESOURCE_IRQ, + }, + { + .name = "USB_CHARGE_DET_DONE", + .start = AB8500_INT_USB_CHG_DET_DONE, + .end = AB8500_INT_USB_CHG_DET_DONE, + .flags = IORESOURCE_IRQ, + }, + { + .name = "USB_CH_TH_PROT_R", + .start = AB8500_INT_USB_CH_TH_PROT_R, + .end = AB8500_INT_USB_CH_TH_PROT_R, + .flags = IORESOURCE_IRQ, + }, + { + .name = "MAIN_CH_TH_PROT_R", + .start = AB8500_INT_MAIN_CH_TH_PROT_R, + .end = AB8500_INT_MAIN_CH_TH_PROT_R, + .flags = IORESOURCE_IRQ, + }, + { + .name = "USB_CHARGER_NOT_OKF", + .start = AB8500_INT_USB_CHARGER_NOT_OKF, + .end = AB8500_INT_USB_CHARGER_NOT_OKF, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct resource ab8500_debug_resources[] = { + { + .name = "IRQ_FIRST", + .start = AB8500_INT_MAIN_EXT_CH_NOT_OK, + .end = AB8500_INT_MAIN_EXT_CH_NOT_OK, + .flags = IORESOURCE_IRQ, + }, + { + .name = "IRQ_LAST", + .start = AB8500_INT_USB_CHARGER_NOT_OKF, + .end = AB8500_INT_USB_CHARGER_NOT_OKF, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct resource ab8500_usb_resources[] = { + { + .name = "ID_WAKEUP_R", + .start = AB8500_INT_ID_WAKEUP_R, + .end = AB8500_INT_ID_WAKEUP_R, + .flags = IORESOURCE_IRQ, + }, + { + .name = "ID_WAKEUP_F", + .start = AB8500_INT_ID_WAKEUP_F, + .end = AB8500_INT_ID_WAKEUP_F, + .flags = IORESOURCE_IRQ, + }, + { + .name = "VBUS_DET_F", + .start = AB8500_INT_VBUS_DET_F, + .end = AB8500_INT_VBUS_DET_F, + .flags = IORESOURCE_IRQ, + }, + { + .name = "VBUS_DET_R", + .start = AB8500_INT_VBUS_DET_R, + .end = AB8500_INT_VBUS_DET_R, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct resource ab8500_temp_resources[] = { + { + .name = "AB8500_TEMP_WARM", + .start = AB8500_INT_TEMP_WARM, + .end = AB8500_INT_TEMP_WARM, + .flags = IORESOURCE_IRQ, + }, +}; + static struct mfd_cell ab8500_devs[] = { #ifdef CONFIG_DEBUG_FS { .name = "ab8500-debug", + .num_resources = ARRAY_SIZE(ab8500_debug_resources), + .resources = ab8500_debug_resources, }, #endif + { + .name = "ab8500-sysctrl", + }, + { + .name = "ab8500-regulator", + }, { .name = "ab8500-gpadc", .num_resources = ARRAY_SIZE(ab8500_gpadc_resources), @@ -413,6 +589,22 @@ static struct mfd_cell ab8500_devs[] = { .num_resources = ARRAY_SIZE(ab8500_rtc_resources), .resources = ab8500_rtc_resources, }, + { + .name = "ab8500-bm", + .num_resources = ARRAY_SIZE(ab8500_bm_resources), + .resources = ab8500_bm_resources, + }, + { .name = "ab8500-codec", }, + { + .name = "ab8500-usb", + .num_resources = ARRAY_SIZE(ab8500_usb_resources), + .resources = ab8500_usb_resources, + }, + { + .name = "ab8500-poweron-key", + .num_resources = ARRAY_SIZE(ab8500_poweronkey_db_resources), + .resources = ab8500_poweronkey_db_resources, + }, { .name = "ab8500-pwm", .id = 1, @@ -425,14 +617,14 @@ static struct mfd_cell ab8500_devs[] = { .name = "ab8500-pwm", .id = 3, }, - { .name = "ab8500-charger", }, - { .name = "ab8500-audio", }, - { .name = "ab8500-usb", }, - { .name 
= "ab8500-regulator", }, + { .name = "ab8500-leds", }, { - .name = "ab8500-poweron-key", - .num_resources = ARRAY_SIZE(ab8500_poweronkey_db_resources), - .resources = ab8500_poweronkey_db_resources, + .name = "ab8500-denc", + }, + { + .name = "ab8500-temp", + .num_resources = ARRAY_SIZE(ab8500_temp_resources), + .resources = ab8500_temp_resources, }, }; diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h index 85cf2c28fac6..1ad169655235 100644 --- a/include/linux/mfd/ab8500.h +++ b/include/linux/mfd/ab8500.h @@ -91,8 +91,8 @@ #define AB8500_INT_ID_DET_R4F 93 #define AB8500_INT_USB_CHG_DET_DONE 94 #define AB8500_INT_USB_CH_TH_PROT_F 96 -#define AB8500_INT_USB_CH_TH_PROP_R 97 -#define AB8500_INT_MAIN_CH_TH_PROP_F 98 +#define AB8500_INT_USB_CH_TH_PROT_R 97 +#define AB8500_INT_MAIN_CH_TH_PROT_F 98 #define AB8500_INT_MAIN_CH_TH_PROT_R 99 #define AB8500_INT_USB_CHARGER_NOT_OKF 103 -- cgit v1.2.3-71-gd317 From cdd137c9c86c201ddb7f42ec978d2da45e7b7a17 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Thu, 23 Dec 2010 17:53:36 +0900 Subject: mfd: MAX8998/LP3974 hibernation support This patch makes the driver to save and restore register values for hibernation. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Signed-off-by: Samuel Ortiz --- drivers/mfd/max8998-irq.c | 7 +++ drivers/mfd/max8998.c | 108 ++++++++++++++++++++++++++++++++++++ include/linux/mfd/max8998-private.h | 2 + include/linux/mfd/max8998.h | 1 + 4 files changed, 118 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/max8998-irq.c b/drivers/mfd/max8998-irq.c index c6b61fcc5808..3903e1fbb334 100644 --- a/drivers/mfd/max8998-irq.c +++ b/drivers/mfd/max8998-irq.c @@ -183,6 +183,13 @@ static irqreturn_t max8998_irq_thread(int irq, void *data) return IRQ_HANDLED; } +int max8998_irq_resume(struct max8998_dev *max8998) +{ + if (max8998->irq && max8998->irq_base) + max8998_irq_thread(max8998->irq_base, max8998); + return 0; +} + int max8998_irq_init(struct max8998_dev *max8998) { int i; diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index bb9977bebe78..5ce00ad5241a 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -135,6 +137,7 @@ static int max8998_i2c_probe(struct i2c_client *i2c, if (pdata) { max8998->ono = pdata->ono; max8998->irq_base = pdata->irq_base; + max8998->wakeup = pdata->wakeup; } mutex_init(&max8998->iolock); @@ -146,6 +149,8 @@ static int max8998_i2c_probe(struct i2c_client *i2c, ret = mfd_add_devices(max8998->dev, -1, max8998_devs, ARRAY_SIZE(max8998_devs), NULL, 0); + pm_runtime_set_active(max8998->dev); + if (ret < 0) goto err; @@ -178,10 +183,113 @@ static const struct i2c_device_id max8998_i2c_id[] = { }; MODULE_DEVICE_TABLE(i2c, max8998_i2c_id); +static int max8998_suspend(struct device *dev) +{ + struct i2c_client *i2c = container_of(dev, struct i2c_client, dev); + struct max8998_dev *max8998 = i2c_get_clientdata(i2c); + + if (max8998->wakeup) + set_irq_wake(max8998->irq, 1); + return 0; +} + +static int max8998_resume(struct device *dev) +{ + struct i2c_client *i2c = container_of(dev, struct i2c_client, dev); + struct max8998_dev *max8998 = i2c_get_clientdata(i2c); + + if (max8998->wakeup) + set_irq_wake(max8998->irq, 0); + /* + * In LP3974, if IRQ registers are not "read & clear" + * when it's set during sleep, the interrupt becomes + * disabled. 
+ */ + return max8998_irq_resume(i2c_get_clientdata(i2c)); +} + +struct max8998_reg_dump { + u8 addr; + u8 val; +}; +#define SAVE_ITEM(x) { .addr = (x), .val = 0x0, } +struct max8998_reg_dump max8998_dump[] = { + SAVE_ITEM(MAX8998_REG_IRQM1), + SAVE_ITEM(MAX8998_REG_IRQM2), + SAVE_ITEM(MAX8998_REG_IRQM3), + SAVE_ITEM(MAX8998_REG_IRQM4), + SAVE_ITEM(MAX8998_REG_STATUSM1), + SAVE_ITEM(MAX8998_REG_STATUSM2), + SAVE_ITEM(MAX8998_REG_CHGR1), + SAVE_ITEM(MAX8998_REG_CHGR2), + SAVE_ITEM(MAX8998_REG_LDO_ACTIVE_DISCHARGE1), + SAVE_ITEM(MAX8998_REG_LDO_ACTIVE_DISCHARGE1), + SAVE_ITEM(MAX8998_REG_BUCK_ACTIVE_DISCHARGE3), + SAVE_ITEM(MAX8998_REG_ONOFF1), + SAVE_ITEM(MAX8998_REG_ONOFF2), + SAVE_ITEM(MAX8998_REG_ONOFF3), + SAVE_ITEM(MAX8998_REG_ONOFF4), + SAVE_ITEM(MAX8998_REG_BUCK1_VOLTAGE1), + SAVE_ITEM(MAX8998_REG_BUCK1_VOLTAGE2), + SAVE_ITEM(MAX8998_REG_BUCK1_VOLTAGE3), + SAVE_ITEM(MAX8998_REG_BUCK1_VOLTAGE4), + SAVE_ITEM(MAX8998_REG_BUCK2_VOLTAGE1), + SAVE_ITEM(MAX8998_REG_BUCK2_VOLTAGE2), + SAVE_ITEM(MAX8998_REG_LDO2_LDO3), + SAVE_ITEM(MAX8998_REG_LDO4), + SAVE_ITEM(MAX8998_REG_LDO5), + SAVE_ITEM(MAX8998_REG_LDO6), + SAVE_ITEM(MAX8998_REG_LDO7), + SAVE_ITEM(MAX8998_REG_LDO8_LDO9), + SAVE_ITEM(MAX8998_REG_LDO10_LDO11), + SAVE_ITEM(MAX8998_REG_LDO12), + SAVE_ITEM(MAX8998_REG_LDO13), + SAVE_ITEM(MAX8998_REG_LDO14), + SAVE_ITEM(MAX8998_REG_LDO15), + SAVE_ITEM(MAX8998_REG_LDO16), + SAVE_ITEM(MAX8998_REG_LDO17), + SAVE_ITEM(MAX8998_REG_BKCHR), + SAVE_ITEM(MAX8998_REG_LBCNFG1), + SAVE_ITEM(MAX8998_REG_LBCNFG2), +}; +/* Save registers before hibernation */ +static int max8998_freeze(struct device *dev) +{ + struct i2c_client *i2c = container_of(dev, struct i2c_client, dev); + int i; + + for (i = 0; i < ARRAY_SIZE(max8998_dump); i++) + max8998_read_reg(i2c, max8998_dump[i].addr, + &max8998_dump[i].val); + + return 0; +} + +/* Restore registers after hibernation */ +static int max8998_restore(struct device *dev) +{ + struct i2c_client *i2c = container_of(dev, struct i2c_client, dev); + int i; + + for (i = 0; i < ARRAY_SIZE(max8998_dump); i++) + max8998_write_reg(i2c, max8998_dump[i].addr, + max8998_dump[i].val); + + return 0; +} + +const struct dev_pm_ops max8998_pm = { + .suspend = max8998_suspend, + .resume = max8998_resume, + .freeze = max8998_freeze, + .restore = max8998_restore, +}; + static struct i2c_driver max8998_i2c_driver = { .driver = { .name = "max8998", .owner = THIS_MODULE, + .pm = &max8998_pm, }, .probe = max8998_i2c_probe, .remove = max8998_i2c_remove, diff --git a/include/linux/mfd/max8998-private.h b/include/linux/mfd/max8998-private.h index 7363dea6bbcd..effa5d3b96ae 100644 --- a/include/linux/mfd/max8998-private.h +++ b/include/linux/mfd/max8998-private.h @@ -159,10 +159,12 @@ struct max8998_dev { u8 irq_masks_cur[MAX8998_NUM_IRQ_REGS]; u8 irq_masks_cache[MAX8998_NUM_IRQ_REGS]; int type; + bool wakeup; }; int max8998_irq_init(struct max8998_dev *max8998); void max8998_irq_exit(struct max8998_dev *max8998); +int max8998_irq_resume(struct max8998_dev *max8998); extern int max8998_read_reg(struct i2c_client *i2c, u8 reg, u8 *dest); extern int max8998_bulk_read(struct i2c_client *i2c, u8 reg, int count, diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index f8c9f884aff2..686a744e63fb 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -88,6 +88,7 @@ struct max8998_platform_data { int buck1_set1; int buck1_set2; int buck2_set3; + bool wakeup; }; #endif /* __LINUX_MFD_MAX8998_H */ -- cgit v1.2.3-71-gd317 From 
337ce5d1c5759644cea6c47220ce7e84f0398362 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Tue, 4 Jan 2011 14:17:39 +0900 Subject: mfd: Support LP3974 RTC The first releases of LP3974 have a large delay in RTC registers, which requires 2 seconds of delay after writing to a rtc register (recommended by National Semiconductor's engineers) before reading it. If "rtc_delay" field of the platform data is true, the rtc driver assumes that such delays are required. Although we have not seen LP3974s without requiring such delays, we assume that such LP3974s will be released soon (or they have done so already) and they are supported by "lp3974" without setting "rtc_delay" at the platform data. This patch adds delays with msleep when writing values to RTC registers if the platform data has rtc_delay set. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Reviewed-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/max8998.c | 26 +++++++++++++++++++--- drivers/regulator/max8998.c | 7 ++++++ drivers/rtc/rtc-max8998.c | 54 ++++++++++++++++++++++++++++++++++++++++----- include/linux/mfd/max8998.h | 4 ++++ 4 files changed, 83 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index 5ce00ad5241a..bbfe86732602 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -42,6 +42,14 @@ static struct mfd_cell max8998_devs[] = { }, }; +static struct mfd_cell lp3974_devs[] = { + { + .name = "lp3974-pmic", + }, { + .name = "lp3974-rtc", + }, +}; + int max8998_read_reg(struct i2c_client *i2c, u8 reg, u8 *dest) { struct max8998_dev *max8998 = i2c_get_clientdata(i2c); @@ -146,11 +154,23 @@ static int max8998_i2c_probe(struct i2c_client *i2c, max8998_irq_init(max8998); - ret = mfd_add_devices(max8998->dev, -1, - max8998_devs, ARRAY_SIZE(max8998_devs), - NULL, 0); pm_runtime_set_active(max8998->dev); + switch (id->driver_data) { + case TYPE_LP3974: + ret = mfd_add_devices(max8998->dev, -1, + lp3974_devs, ARRAY_SIZE(lp3974_devs), + NULL, 0); + break; + case TYPE_MAX8998: + ret = mfd_add_devices(max8998->dev, -1, + max8998_devs, ARRAY_SIZE(max8998_devs), + NULL, 0); + break; + default: + ret = -EINVAL; + } + if (ret < 0) goto err; diff --git a/drivers/regulator/max8998.c b/drivers/regulator/max8998.c index 7568df6122ab..af52ebfc4927 100644 --- a/drivers/regulator/max8998.c +++ b/drivers/regulator/max8998.c @@ -835,6 +835,12 @@ static int __devexit max8998_pmic_remove(struct platform_device *pdev) return 0; } +static const struct platform_device_id max8998_pmic_id[] = { + { "max8998-pmic", TYPE_MAX8998 }, + { "lp3974-pmic", TYPE_LP3974 }, + { } +}; + static struct platform_driver max8998_pmic_driver = { .driver = { .name = "max8998-pmic", @@ -842,6 +848,7 @@ static struct platform_driver max8998_pmic_driver = { }, .probe = max8998_pmic_probe, .remove = __devexit_p(max8998_pmic_remove), + .id_table = max8998_pmic_id, }; static int __init max8998_pmic_init(void) diff --git a/drivers/rtc/rtc-max8998.c b/drivers/rtc/rtc-max8998.c index f22dee35f330..3f7bc6b9fefa 100644 --- a/drivers/rtc/rtc-max8998.c +++ b/drivers/rtc/rtc-max8998.c @@ -20,6 +20,7 @@ #include #include #include +#include #define MAX8998_RTC_SEC 0x00 #define MAX8998_RTC_MIN 0x01 @@ -73,6 +74,7 @@ struct max8998_rtc_info { struct i2c_client *rtc; struct rtc_device *rtc_dev; int irq; + bool lp3974_bug_workaround; }; static void max8998_data_to_tm(u8 *data, struct rtc_time *tm) @@ -124,10 +126,16 @@ static int max8998_rtc_set_time(struct device *dev, struct rtc_time *tm) { 
struct max8998_rtc_info *info = dev_get_drvdata(dev); u8 data[8]; + int ret; max8998_tm_to_data(tm, data); - return max8998_bulk_write(info->rtc, MAX8998_RTC_SEC, 8, data); + ret = max8998_bulk_write(info->rtc, MAX8998_RTC_SEC, 8, data); + + if (info->lp3974_bug_workaround) + msleep(2000); + + return ret; } static int max8998_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) @@ -163,12 +171,29 @@ static int max8998_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) static int max8998_rtc_stop_alarm(struct max8998_rtc_info *info) { - return max8998_write_reg(info->rtc, MAX8998_ALARM0_CONF, 0); + int ret = max8998_write_reg(info->rtc, MAX8998_ALARM0_CONF, 0); + + if (info->lp3974_bug_workaround) + msleep(2000); + + return ret; } static int max8998_rtc_start_alarm(struct max8998_rtc_info *info) { - return max8998_write_reg(info->rtc, MAX8998_ALARM0_CONF, 0x77); + int ret; + u8 alarm0_conf = 0x77; + + /* LP3974 with delay bug chips has rtc alarm bugs with "MONTH" field */ + if (info->lp3974_bug_workaround) + alarm0_conf = 0x57; + + ret = max8998_write_reg(info->rtc, MAX8998_ALARM0_CONF, alarm0_conf); + + if (info->lp3974_bug_workaround) + msleep(2000); + + return ret; } static int max8998_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) @@ -187,10 +212,13 @@ static int max8998_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) if (ret < 0) return ret; + if (info->lp3974_bug_workaround) + msleep(2000); + if (alrm->enabled) - return max8998_rtc_start_alarm(info); + ret = max8998_rtc_start_alarm(info); - return 0; + return ret; } static int max8998_rtc_alarm_irq_enable(struct device *dev, @@ -224,6 +252,7 @@ static const struct rtc_class_ops max8998_rtc_ops = { static int __devinit max8998_rtc_probe(struct platform_device *pdev) { struct max8998_dev *max8998 = dev_get_drvdata(pdev->dev.parent); + struct max8998_platform_data *pdata = dev_get_platdata(max8998->dev); struct max8998_rtc_info *info; int ret; @@ -249,10 +278,18 @@ static int __devinit max8998_rtc_probe(struct platform_device *pdev) ret = request_threaded_irq(info->irq, NULL, max8998_rtc_alarm_irq, 0, "rtc-alarm0", info); + if (ret < 0) dev_err(&pdev->dev, "Failed to request alarm IRQ: %d: %d\n", info->irq, ret); + dev_info(&pdev->dev, "RTC CHIP NAME: %s\n", pdev->id_entry->name); + if (pdata->rtc_delay) { + info->lp3974_bug_workaround = true; + dev_warn(&pdev->dev, "LP3974 with RTC REGERR option." 
+ " RTC updates will be extremely slow.\n"); + } + return 0; out_rtc: @@ -273,6 +310,12 @@ static int __devexit max8998_rtc_remove(struct platform_device *pdev) return 0; } +static const struct platform_device_id max8998_rtc_id[] = { + { "max8998-rtc", TYPE_MAX8998 }, + { "lp3974-rtc", TYPE_LP3974 }, + { } +}; + static struct platform_driver max8998_rtc_driver = { .driver = { .name = "max8998-rtc", @@ -280,6 +323,7 @@ static struct platform_driver max8998_rtc_driver = { }, .probe = max8998_rtc_probe, .remove = __devexit_p(max8998_rtc_remove), + .id_table = max8998_rtc_id, }; static int __init max8998_rtc_init(void) diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index 686a744e63fb..c22b9a417c35 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -76,6 +76,9 @@ struct max8998_regulator_data { * @buck1_set1: BUCK1 gpio pin 1 to set output voltage * @buck1_set2: BUCK1 gpio pin 2 to set output voltage * @buck2_set3: BUCK2 gpio pin to set output voltage + * @wakeup: Allow to wake up from suspend + * @rtc_delay: LP3974 RTC chip bug that requires delay after a register + * write before reading it. */ struct max8998_platform_data { struct max8998_regulator_data *regulators; @@ -89,6 +92,7 @@ struct max8998_platform_data { int buck1_set2; int buck2_set3; bool wakeup; + bool rtc_delay; }; #endif /* __LINUX_MFD_MAX8998_H */ -- cgit v1.2.3-71-gd317 From 735a3d9efdc5aeebe201008e6655b235c7f02aeb Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Tue, 11 Jan 2011 12:20:05 +0100 Subject: regulator: Support MAX8998/LP3974 DVS-GPIO The previous driver did not support BUCK1-DVS3, BUCK1-DVS4, and BUCK2-DVS2 modes. This patch adds such modes and an option to block setting buck1/2 voltages out of the preset values. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Acked-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/regulator/max8998.c | 87 ++++++++++++++++++++++++++++++++++----------- include/linux/mfd/max8998.h | 26 ++++++++++---- 2 files changed, 87 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/max8998.c b/drivers/regulator/max8998.c index af52ebfc4927..0ec49ca527a8 100644 --- a/drivers/regulator/max8998.c +++ b/drivers/regulator/max8998.c @@ -424,6 +424,9 @@ static int max8998_set_voltage_buck(struct regulator_dev *rdev, } } + if (pdata->buck_voltage_lock) + return -EINVAL; + /* no predefine regulator found */ max8998->buck1_idx = (buck1_last_val % 2) + 2; dev_dbg(max8998->dev, "max8998->buck1_idx:%d\n", @@ -451,18 +454,26 @@ buck1_exit: "BUCK2, i:%d buck2_vol1:%d, buck2_vol2:%d\n" , i, max8998->buck2_vol[0], max8998->buck2_vol[1]); if (gpio_is_valid(pdata->buck2_set3)) { - if (max8998->buck2_vol[0] == i) { - max8998->buck1_idx = 0; - buck2_gpio_set(pdata->buck2_set3, 0); - } else { - max8998->buck1_idx = 1; - ret = max8998_get_voltage_register(rdev, ®, - &shift, - &mask); - ret = max8998_write_reg(i2c, reg, i); - max8998->buck2_vol[1] = i; - buck2_gpio_set(pdata->buck2_set3, 1); + + /* check if requested voltage */ + /* value is already defined */ + for (j = 0; j < ARRAY_SIZE(max8998->buck2_vol); j++) { + if (max8998->buck2_vol[j] == i) { + max8998->buck2_idx = j; + buck2_gpio_set(pdata->buck2_set3, j); + goto buck2_exit; + } } + + if (pdata->buck_voltage_lock) + return -EINVAL; + + max8998_get_voltage_register(rdev, + ®, &shift, &mask); + ret = max8998_write_reg(i2c, reg, i); + max8998->buck2_vol[max8998->buck2_idx] = i; + buck2_gpio_set(pdata->buck2_set3, max8998->buck2_idx); +buck2_exit: 
dev_dbg(max8998->dev, "%s: SET3:%d\n", i2c->name, gpio_get_value(pdata->buck2_set3)); } else { @@ -707,6 +718,9 @@ static __devinit int max8998_pmic_probe(struct platform_device *pdev) platform_set_drvdata(pdev, max8998); i2c = max8998->iodev->i2c; + max8998->buck1_idx = pdata->buck1_default_idx; + max8998->buck2_idx = pdata->buck2_default_idx; + /* NOTE: */ /* For unused GPIO NOT marked as -1 (thereof equal to 0) WARN_ON */ /* will be displayed */ @@ -739,23 +753,46 @@ static __devinit int max8998_pmic_probe(struct platform_device *pdev) i = 0; while (buck12_voltage_map_desc.min + buck12_voltage_map_desc.step*i - != (pdata->buck1_max_voltage1 / 1000)) + < (pdata->buck1_voltage1 / 1000)) i++; - printk(KERN_ERR "i:%d, buck1_idx:%d\n", i, max8998->buck1_idx); max8998->buck1_vol[0] = i; ret = max8998_write_reg(i2c, MAX8998_REG_BUCK1_VOLTAGE1, i); + if (ret) + return ret; /* Set predefined value for BUCK1 register 2 */ i = 0; while (buck12_voltage_map_desc.min + buck12_voltage_map_desc.step*i - != (pdata->buck1_max_voltage2 / 1000)) + < (pdata->buck1_voltage2 / 1000)) i++; max8998->buck1_vol[1] = i; - printk(KERN_ERR "i:%d, buck1_idx:%d\n", i, max8998->buck1_idx); - ret = max8998_write_reg(i2c, MAX8998_REG_BUCK1_VOLTAGE2, i) - + ret; + ret = max8998_write_reg(i2c, MAX8998_REG_BUCK1_VOLTAGE2, i); + if (ret) + return ret; + + /* Set predefined value for BUCK1 register 3 */ + i = 0; + while (buck12_voltage_map_desc.min + + buck12_voltage_map_desc.step*i + < (pdata->buck1_voltage3 / 1000)) + i++; + + max8998->buck1_vol[2] = i; + ret = max8998_write_reg(i2c, MAX8998_REG_BUCK1_VOLTAGE3, i); + if (ret) + return ret; + + /* Set predefined value for BUCK1 register 4 */ + i = 0; + while (buck12_voltage_map_desc.min + + buck12_voltage_map_desc.step*i + < (pdata->buck1_voltage4 / 1000)) + i++; + + max8998->buck1_vol[3] = i; + ret = max8998_write_reg(i2c, MAX8998_REG_BUCK1_VOLTAGE4, i); if (ret) return ret; @@ -772,18 +809,28 @@ static __devinit int max8998_pmic_probe(struct platform_device *pdev) gpio_direction_output(pdata->buck2_set3, max8998->buck2_idx & 0x1); - /* BUCK2 - set preset default voltage value to buck2_vol[0] */ + /* BUCK2 register 1 */ i = 0; while (buck12_voltage_map_desc.min + buck12_voltage_map_desc.step*i - != (pdata->buck2_max_voltage / 1000)) + < (pdata->buck2_voltage1 / 1000)) i++; - printk(KERN_ERR "i:%d, buck2_idx:%d\n", i, max8998->buck2_idx); max8998->buck2_vol[0] = i; ret = max8998_write_reg(i2c, MAX8998_REG_BUCK2_VOLTAGE1, i); if (ret) return ret; + /* BUCK2 register 2 */ + i = 0; + while (buck12_voltage_map_desc.min + + buck12_voltage_map_desc.step*i + < (pdata->buck2_voltage2 / 1000)) + i++; + printk(KERN_ERR "i2:%d, buck2_idx:%d\n", i, max8998->buck2_idx); + max8998->buck2_vol[1] = i; + ret = max8998_write_reg(i2c, MAX8998_REG_BUCK2_VOLTAGE2, i); + if (ret) + return ret; } for (i = 0; i < pdata->num_regulators; i++) { diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index c22b9a417c35..61daa167b576 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -70,12 +70,20 @@ struct max8998_regulator_data { * @num_regulators: number of regultors used * @irq_base: base IRQ number for max8998, required for IRQs * @ono: power onoff IRQ number for max8998 - * @buck1_max_voltage1: BUCK1 maximum alowed voltage register 1 - * @buck1_max_voltage2: BUCK1 maximum alowed voltage register 2 - * @buck2_max_voltage: BUCK2 maximum alowed voltage + * @buck_voltage_lock: Do NOT change the values of the following six + * registers set by buck?_voltage?. 
The voltage of BUCK1/2 cannot + * be other than the preset values. + * @buck1_voltage1: BUCK1 DVS mode 1 voltage register + * @buck1_voltage2: BUCK1 DVS mode 2 voltage register + * @buck1_voltage3: BUCK1 DVS mode 3 voltage register + * @buck1_voltage4: BUCK1 DVS mode 4 voltage register + * @buck2_voltage1: BUCK2 DVS mode 1 voltage register + * @buck2_voltage2: BUCK2 DVS mode 2 voltage register * @buck1_set1: BUCK1 gpio pin 1 to set output voltage * @buck1_set2: BUCK1 gpio pin 2 to set output voltage + * @buck1_default_idx: Default for BUCK1 gpio pin 1, 2 * @buck2_set3: BUCK2 gpio pin to set output voltage + * @buck2_default_idx: Default for BUCK2 gpio pin. * @wakeup: Allow to wake up from suspend * @rtc_delay: LP3974 RTC chip bug that requires delay after a register * write before reading it. @@ -85,12 +93,18 @@ struct max8998_platform_data { int num_regulators; int irq_base; int ono; - int buck1_max_voltage1; - int buck1_max_voltage2; - int buck2_max_voltage; + bool buck_voltage_lock; + int buck1_voltage1; + int buck1_voltage2; + int buck1_voltage3; + int buck1_voltage4; + int buck2_voltage1; + int buck2_voltage2; int buck1_set1; int buck1_set2; + int buck1_default_idx; int buck2_set3; + int buck2_default_idx; bool wakeup; bool rtc_delay; }; -- cgit v1.2.3-71-gd317 From 92d50a4132977b932ed830fa58c05deeb5c524f0 Mon Sep 17 00:00:00 2001 From: Mattias Wallin Date: Tue, 7 Dec 2010 11:20:47 +0100 Subject: mfd: ab8500-core chip version cut 2.0 support This patch adds support for chip version 2.0 or cut 2.0. One new interrupt latch register - latch 12 - is introduced. Signed-off-by: Mattias Wallin Acked-by: Linus Walleij Signed-off-by: Samuel Ortiz --- drivers/mfd/ab8500-core.c | 36 +++++++++++++++++++++---------- include/linux/mfd/ab8500.h | 53 ++++++++++++++++++++++++++-------------------- 2 files changed, 55 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c index 86cc8874e6de..b6887014d687 100644 --- a/drivers/mfd/ab8500-core.c +++ b/drivers/mfd/ab8500-core.c @@ -52,6 +52,7 @@ #define AB8500_IT_LATCH8_REG 0x27 #define AB8500_IT_LATCH9_REG 0x28 #define AB8500_IT_LATCH10_REG 0x29 +#define AB8500_IT_LATCH12_REG 0x2B #define AB8500_IT_LATCH19_REG 0x32 #define AB8500_IT_LATCH20_REG 0x33 #define AB8500_IT_LATCH21_REG 0x34 @@ -98,7 +99,7 @@ * offset 0. 
*/ static const int ab8500_irq_regoffset[AB8500_NUM_IRQ_REGS] = { - 0, 1, 2, 3, 4, 6, 7, 8, 9, 18, 19, 20, 21, + 0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 18, 19, 20, 21, }; static int ab8500_get_chip_id(struct device *dev) @@ -252,6 +253,10 @@ static void ab8500_irq_sync_unlock(struct irq_data *data) if (new == old) continue; + /* Interrupt register 12 does'nt exist prior to version 0x20 */ + if (ab8500_irq_regoffset[i] == 11 && ab8500->chip_id < 0x20) + continue; + ab8500->oldmask[i] = new; reg = AB8500_IT_MASK1_REG + ab8500_irq_regoffset[i]; @@ -301,6 +306,10 @@ static irqreturn_t ab8500_irq(int irq, void *dev) int status; u8 value; + /* Interrupt register 12 does'nt exist prior to version 0x20 */ + if (regoffset == 11 && ab8500->chip_id < 0x20) + continue; + status = get_register_interruptible(ab8500, AB8500_INTERRUPT, AB8500_IT_LATCH1_REG + regoffset, &value); if (status < 0 || value == 0) @@ -554,6 +563,12 @@ static struct resource ab8500_usb_resources[] = { .end = AB8500_INT_VBUS_DET_R, .flags = IORESOURCE_IRQ, }, + { + .name = "USB_LINK_STATUS", + .start = AB8500_INT_USB_LINK_STATUS, + .end = AB8500_INT_USB_LINK_STATUS, + .flags = IORESOURCE_IRQ, + }, }; static struct resource ab8500_temp_resources[] = { @@ -670,8 +685,9 @@ int __devinit ab8500_init(struct ab8500 *ab8500) * 0x0 - Early Drop * 0x10 - Cut 1.0 * 0x11 - Cut 1.1 + * 0x20 - Cut 2.0 */ - if (value == 0x0 || value == 0x10 || value == 0x11) { + if (value == 0x0 || value == 0x10 || value == 0x11 || value == 0x20) { ab8500->revision = value; dev_info(ab8500->dev, "detected chip, revision: %#x\n", value); } else { @@ -684,18 +700,16 @@ int __devinit ab8500_init(struct ab8500 *ab8500) plat->init(ab8500); /* Clear and mask all interrupts */ - for (i = 0; i < 10; i++) { - get_register_interruptible(ab8500, AB8500_INTERRUPT, - AB8500_IT_LATCH1_REG + i, &value); - set_register_interruptible(ab8500, AB8500_INTERRUPT, - AB8500_IT_MASK1_REG + i, 0xff); - } + for (i = 0; i < AB8500_NUM_IRQ_REGS; i++) { + /* Interrupt register 12 does'nt exist prior to version 0x20 */ + if (ab8500_irq_regoffset[i] == 11 && ab8500->chip_id < 0x20) + continue; - for (i = 18; i < 24; i++) { get_register_interruptible(ab8500, AB8500_INTERRUPT, - AB8500_IT_LATCH1_REG + i, &value); + AB8500_IT_LATCH1_REG + ab8500_irq_regoffset[i], + &value); set_register_interruptible(ab8500, AB8500_INTERRUPT, - AB8500_IT_MASK1_REG + i, 0xff); + AB8500_IT_MASK1_REG + ab8500_irq_regoffset[i], 0xff); } ret = abx500_register_ops(ab8500->dev, &ab8500_ops); diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h index 1ad169655235..37f56b7c4c15 100644 --- a/include/linux/mfd/ab8500.h +++ b/include/linux/mfd/ab8500.h @@ -74,30 +74,37 @@ #define AB8500_INT_ACC_DETECT_21DB_F 37 #define AB8500_INT_ACC_DETECT_21DB_R 38 #define AB8500_INT_GP_SW_ADC_CONV_END 39 -#define AB8500_INT_BTEMP_LOW 72 -#define AB8500_INT_BTEMP_LOW_MEDIUM 73 -#define AB8500_INT_BTEMP_MEDIUM_HIGH 74 -#define AB8500_INT_BTEMP_HIGH 75 -#define AB8500_INT_USB_CHARGER_NOT_OK 81 -#define AB8500_INT_ID_WAKEUP_R 82 -#define AB8500_INT_ID_DET_R1R 84 -#define AB8500_INT_ID_DET_R2R 85 -#define AB8500_INT_ID_DET_R3R 86 -#define AB8500_INT_ID_DET_R4R 87 -#define AB8500_INT_ID_WAKEUP_F 88 -#define AB8500_INT_ID_DET_R1F 90 -#define AB8500_INT_ID_DET_R2F 91 -#define AB8500_INT_ID_DET_R3F 92 -#define AB8500_INT_ID_DET_R4F 93 -#define AB8500_INT_USB_CHG_DET_DONE 94 -#define AB8500_INT_USB_CH_TH_PROT_F 96 -#define AB8500_INT_USB_CH_TH_PROT_R 97 -#define AB8500_INT_MAIN_CH_TH_PROT_F 98 -#define AB8500_INT_MAIN_CH_TH_PROT_R 99 
-#define AB8500_INT_USB_CHARGER_NOT_OKF 103 +#define AB8500_INT_ADP_SOURCE_ERROR 72 +#define AB8500_INT_ADP_SINK_ERROR 73 +#define AB8500_INT_ADP_PROBE_PLUG 74 +#define AB8500_INT_ADP_PROBE_UNPLUG 75 +#define AB8500_INT_ADP_SENSE_OFF 76 +#define AB8500_INT_USB_PHY_POWER_ERR 78 +#define AB8500_INT_USB_LINK_STATUS 79 +#define AB8500_INT_BTEMP_LOW 80 +#define AB8500_INT_BTEMP_LOW_MEDIUM 81 +#define AB8500_INT_BTEMP_MEDIUM_HIGH 82 +#define AB8500_INT_BTEMP_HIGH 83 +#define AB8500_INT_USB_CHARGER_NOT_OK 89 +#define AB8500_INT_ID_WAKEUP_R 90 +#define AB8500_INT_ID_DET_R1R 92 +#define AB8500_INT_ID_DET_R2R 93 +#define AB8500_INT_ID_DET_R3R 94 +#define AB8500_INT_ID_DET_R4R 95 +#define AB8500_INT_ID_WAKEUP_F 96 +#define AB8500_INT_ID_DET_R1F 98 +#define AB8500_INT_ID_DET_R2F 99 +#define AB8500_INT_ID_DET_R3F 100 +#define AB8500_INT_ID_DET_R4F 101 +#define AB8500_INT_USB_CHG_DET_DONE 102 +#define AB8500_INT_USB_CH_TH_PROT_F 104 +#define AB8500_INT_USB_CH_TH_PROT_R 105 +#define AB8500_INT_MAIN_CH_TH_PROT_F 106 +#define AB8500_INT_MAIN_CH_TH_PROT_R 107 +#define AB8500_INT_USB_CHARGER_NOT_OKF 111 -#define AB8500_NR_IRQS 104 -#define AB8500_NUM_IRQ_REGS 13 +#define AB8500_NR_IRQS 112 +#define AB8500_NUM_IRQ_REGS 14 /** * struct ab8500 - ab8500 internal structure -- cgit v1.2.3-71-gd317 From 32385c7cf60a78375b63afc4f02001df84dfd1a0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 14 Jan 2011 13:12:45 +0000 Subject: kernel: fix hlist_bl again __d_rehash is dereferencing an almost-NULL pointer on my ARM926. CONFIG_SMP=n and CONFIG_DEBUG_SPINLOCK=y. The faulting instruction is: strne r3, [r2, #4] and as can be seen from the register dump below, r2 is 0x00000001, hence the faulting 0x00000005 address. __d_rehash is essentially: spin_lock_bucket(b); entry->d_flags &= ~DCACHE_UNHASHED; hlist_bl_add_head_rcu(&entry->d_hash, &b->head); spin_unlock_bucket(b); which is: bit_spin_lock(0, (unsigned long *)&b->head.first); entry->d_flags &= ~DCACHE_UNHASHED; hlist_bl_add_head_rcu(&entry->d_hash, &b->head); __bit_spin_unlock(0, (unsigned long *)&b->head.first); bit_spin_lock(0, ptr) sets bit 0 of *ptr, in this case b->head.first if CONFIG_SMP or CONFIG_DEBUG_SPINLOCK is set: #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) while (unlikely(test_and_set_bit_lock(bitnum, addr))) { while (test_bit(bitnum, addr)) { preempt_enable(); cpu_relax(); preempt_disable(); } } #endif So, b->head.first starts off NULL, and becomes a non-NULL (address 1). hlist_bl_add_head_rcu() does this: static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; It is the store to first->pprev which is faulting. hlist_bl_first(): static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } but: #if defined(CONFIG_SMP) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif So, we have one piece of code which sets bit 0 of addresses, and another bit of code which doesn't clear it before dereferencing the pointer if !CONFIG_SMP && CONFIG_DEBUG_SPINLOCK. With the patch below, I can again sucessfully boot the kernel on my Versatile PB/926 platform. 
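As a minimal sketch of the scheme described above (plain user-space C with illustrative names; head, node and LOCKMASK stand in for hlist_bl_head, hlist_bl_node and LIST_BL_LOCKMASK, and this is not the kernel implementation): the head pointer doubles as the lock word, so the constant used when setting the lock bit and the constant used when stripping it before a dereference must be the same. If they diverge, as with !CONFIG_SMP plus CONFIG_DEBUG_SPINLOCK before this fix, the caller is handed a pointer with bit 0 still set.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LOCKMASK 1UL	/* one constant for both the lock path and the strip path */

struct node { struct node *next; };
struct head { struct node *first; };

/* bit_spin_lock(0, ...) equivalent: mark the head locked by setting bit 0 */
static void lock_head(struct head *h)
{
	h->first = (struct node *)((uintptr_t)h->first | LOCKMASK);
}

/* hlist_bl_first() equivalent: strip the lock bit before dereferencing */
static struct node *first_node(struct head *h)
{
	return (struct node *)((uintptr_t)h->first & ~(uintptr_t)LOCKMASK);
}

int main(void)
{
	struct head h = { NULL };
	struct node n = { NULL };

	lock_head(&h);			/* empty but locked: raw pointer value is 0x1 */
	assert(first_node(&h) == NULL);	/* a mismatched strip mask would return 0x1 here */

	h.first = (struct node *)((uintptr_t)&n | LOCKMASK);	/* node added while locked */
	assert(first_node(&h) == &n);
	puts("lock bit stripped correctly");
	return 0;
}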
Signed-off-by: Russell King --- include/linux/list_bl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index b2adbb4b2f73..5bad17d1acde 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -16,7 +16,7 @@ * some fast and compact auxiliary data. */ -#if defined(CONFIG_SMP) +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL -- cgit v1.2.3-71-gd317 From 359ab9f5b154cbd807a11e22792235f0f36b0cd5 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Fri, 14 Jan 2011 14:46:11 +0900 Subject: power_supply: Add MAX17042 Fuel Gauge Driver The MAX17042 is a fuel gauge with an I2C interface for lithium-ion betteries. Unlike its predecessor MAX17040, MAX17042 uses 16bit registers. Besides, MAX17042 has much more features than MAX17040; e.g., a thermistor, current and current accumulation measurement, battery internal resistance estimate, average values of measurement, and others. This patch implements a driver for MAX17042. In this initial release, we have implemented the most basic features of a fuel gauge: measure the battery capacity and voltage. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Signed-off-by: Anton Vorontsov --- drivers/power/Kconfig | 10 ++ drivers/power/Makefile | 1 + drivers/power/max17042_battery.c | 239 +++++++++++++++++++++++++++++++++ include/linux/power/max17042_battery.h | 30 +++++ 4 files changed, 280 insertions(+) create mode 100644 drivers/power/max17042_battery.c create mode 100644 include/linux/power/max17042_battery.h (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 32165295cb1b..61bf5d724139 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -136,6 +136,16 @@ config BATTERY_MAX17040 in handheld and portable equipment. The MAX17040 is configured to operate with a single lithium cell +config BATTERY_MAX17042 + tristate "Maxim MAX17042/8997/8966 Fuel Gauge" + depends on I2C + help + MAX17042 is fuel-gauge systems for lithium-ion (Li+) batteries + in handheld and portable equipment. The MAX17042 is configured + to operate with a single lithium cell. MAX8997 and MAX8966 are + multi-function devices that include fuel gauages that are compatible + with MAX17042. + config BATTERY_Z2 tristate "Z2 battery driver" depends on I2C && MACH_ZIPIT2 diff --git a/drivers/power/Makefile b/drivers/power/Makefile index 545459f9f356..8385bfae8728 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_BATTERY_BQ20Z75) += bq20z75.o obj-$(CONFIG_BATTERY_BQ27x00) += bq27x00_battery.o obj-$(CONFIG_BATTERY_DA9030) += da9030_battery.o obj-$(CONFIG_BATTERY_MAX17040) += max17040_battery.o +obj-$(CONFIG_BATTERY_MAX17042) += max17042_battery.o obj-$(CONFIG_BATTERY_Z2) += z2_battery.o obj-$(CONFIG_BATTERY_S3C_ADC) += s3c_adc_battery.o obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o diff --git a/drivers/power/max17042_battery.c b/drivers/power/max17042_battery.c new file mode 100644 index 000000000000..c5c8805156cb --- /dev/null +++ b/drivers/power/max17042_battery.c @@ -0,0 +1,239 @@ +/* + * Fuel gauge driver for Maxim 17042 / 8966 / 8997 + * Note that Maxim 8966 and 8997 are mfd and this is its subdevice. 
+ * + * Copyright (C) 2011 Samsung Electronics + * MyungJoo Ham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This driver is based on max17040_battery.c + */ + +#include +#include +#include +#include +#include +#include + +enum max17042_register { + MAX17042_STATUS = 0x00, + MAX17042_VALRT_Th = 0x01, + MAX17042_TALRT_Th = 0x02, + MAX17042_SALRT_Th = 0x03, + MAX17042_AtRate = 0x04, + MAX17042_RepCap = 0x05, + MAX17042_RepSOC = 0x06, + MAX17042_Age = 0x07, + MAX17042_TEMP = 0x08, + MAX17042_VCELL = 0x09, + MAX17042_Current = 0x0A, + MAX17042_AvgCurrent = 0x0B, + MAX17042_Qresidual = 0x0C, + MAX17042_SOC = 0x0D, + MAX17042_AvSOC = 0x0E, + MAX17042_RemCap = 0x0F, + MAX17402_FullCAP = 0x10, + MAX17042_TTE = 0x11, + MAX17042_V_empty = 0x12, + + MAX17042_RSLOW = 0x14, + + MAX17042_AvgTA = 0x16, + MAX17042_Cycles = 0x17, + MAX17042_DesignCap = 0x18, + MAX17042_AvgVCELL = 0x19, + MAX17042_MinMaxTemp = 0x1A, + MAX17042_MinMaxVolt = 0x1B, + MAX17042_MinMaxCurr = 0x1C, + MAX17042_CONFIG = 0x1D, + MAX17042_ICHGTerm = 0x1E, + MAX17042_AvCap = 0x1F, + MAX17042_ManName = 0x20, + MAX17042_DevName = 0x21, + MAX17042_DevChem = 0x22, + + MAX17042_TempNom = 0x24, + MAX17042_TempCold = 0x25, + MAX17042_TempHot = 0x26, + MAX17042_AIN = 0x27, + MAX17042_LearnCFG = 0x28, + MAX17042_SHFTCFG = 0x29, + MAX17042_RelaxCFG = 0x2A, + MAX17042_MiscCFG = 0x2B, + MAX17042_TGAIN = 0x2C, + MAx17042_TOFF = 0x2D, + MAX17042_CGAIN = 0x2E, + MAX17042_COFF = 0x2F, + + MAX17042_Q_empty = 0x33, + MAX17042_T_empty = 0x34, + + MAX17042_RCOMP0 = 0x38, + MAX17042_TempCo = 0x39, + MAX17042_Rx = 0x3A, + MAX17042_T_empty0 = 0x3B, + MAX17042_TaskPeriod = 0x3C, + MAX17042_FSTAT = 0x3D, + + MAX17042_SHDNTIMER = 0x3F, + + MAX17042_VFRemCap = 0x4A, + + MAX17042_QH = 0x4D, + MAX17042_QL = 0x4E, +}; + +struct max17042_chip { + struct i2c_client *client; + struct power_supply battery; + struct max17042_platform_data *pdata; +}; + +static int max17042_write_reg(struct i2c_client *client, u8 reg, u16 value) +{ + int ret = i2c_smbus_write_word_data(client, reg, value); + + if (ret < 0) + dev_err(&client->dev, "%s: err %d\n", __func__, ret); + + return ret; +} + +static int max17042_read_reg(struct i2c_client *client, u8 reg) +{ + int ret = i2c_smbus_read_word_data(client, reg); + + if (ret < 0) + dev_err(&client->dev, "%s: err %d\n", __func__, ret); + + return ret; +} + +static enum power_supply_property max17042_battery_props[] = { + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_AVG, + POWER_SUPPLY_PROP_CAPACITY, +}; + +static int max17042_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct max17042_chip *chip = container_of(psy, + struct max17042_chip, battery); + + switch (psp) { + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + val->intval = max17042_read_reg(chip->client, + MAX17042_VCELL) * 83; /* 
1000 / 12 = 83 */ + break; + case POWER_SUPPLY_PROP_VOLTAGE_AVG: + val->intval = max17042_read_reg(chip->client, + MAX17042_AvgVCELL) * 83; + break; + case POWER_SUPPLY_PROP_CAPACITY: + val->intval = max17042_read_reg(chip->client, + MAX17042_SOC) / 256; + break; + default: + return -EINVAL; + } + return 0; +} + +static int __devinit max17042_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); + struct max17042_chip *chip; + int ret; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_WORD_DATA)) + return -EIO; + + chip = kzalloc(sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + chip->client = client; + chip->pdata = client->dev.platform_data; + + i2c_set_clientdata(client, chip); + + chip->battery.name = "max17042_battery"; + chip->battery.type = POWER_SUPPLY_TYPE_BATTERY; + chip->battery.get_property = max17042_get_property; + chip->battery.properties = max17042_battery_props; + chip->battery.num_properties = ARRAY_SIZE(max17042_battery_props); + + ret = power_supply_register(&client->dev, &chip->battery); + if (ret) { + dev_err(&client->dev, "failed: power supply register\n"); + i2c_set_clientdata(client, NULL); + kfree(chip); + return ret; + } + + if (!chip->pdata->enable_current_sense) { + max17042_write_reg(client, MAX17042_CGAIN, 0x0000); + max17042_write_reg(client, MAX17042_MiscCFG, 0x0003); + max17042_write_reg(client, MAX17042_LearnCFG, 0x0007); + } + + return 0; +} + +static int __devexit max17042_remove(struct i2c_client *client) +{ + struct max17042_chip *chip = i2c_get_clientdata(client); + + power_supply_unregister(&chip->battery); + i2c_set_clientdata(client, NULL); + kfree(chip); + return 0; +} + +static const struct i2c_device_id max17042_id[] = { + { "max17042", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, max17042_id); + +static struct i2c_driver max17042_i2c_driver = { + .driver = { + .name = "max17042", + }, + .probe = max17042_probe, + .remove = __devexit_p(max17042_remove), + .id_table = max17042_id, +}; + +static int __init max17042_init(void) +{ + return i2c_add_driver(&max17042_i2c_driver); +} +module_init(max17042_init); + +static void __exit max17042_exit(void) +{ + i2c_del_driver(&max17042_i2c_driver); +} +module_exit(max17042_exit); + +MODULE_AUTHOR("MyungJoo Ham "); +MODULE_DESCRIPTION("MAX17042 Fuel Gauge"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h new file mode 100644 index 000000000000..7995deb8bfc1 --- /dev/null +++ b/include/linux/power/max17042_battery.h @@ -0,0 +1,30 @@ +/* + * Fuel gauge driver for Maxim 17042 / 8966 / 8997 + * Note that Maxim 8966 and 8997 are mfd and this is its subdevice. + * + * Copyright (C) 2011 Samsung Electronics + * MyungJoo Ham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __MAX17042_BATTERY_H_ +#define __MAX17042_BATTERY_H_ + +struct max17042_platform_data { + bool enable_current_sense; +}; + +#endif /* __MAX17042_BATTERY_H_ */ -- cgit v1.2.3-71-gd317 From 836cb711ad7960e52625b24195d6e70b79ab0816 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 14 Jan 2011 14:56:53 +0900 Subject: Revert update for dirty_ratio for memcg. The flags added by commit db16d5ec1f87f17511599bc77857dd1662b5a22f has no user now. We believe we'll use it soon but considering patch reviewing, the change itself should be folded into incoming set of "dirty ratio for memcg" patches. So, it's better to drop this change from current mainline tree. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Greg Thelen Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 5b0c971d7cae..6d6cb7a57bb3 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -42,9 +42,6 @@ enum { /* flags for mem_cgroup and file and I/O status */ PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ - PCG_FILE_DIRTY, /* page is dirty */ - PCG_FILE_WRITEBACK, /* page is under writeback */ - PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ /* No lock in page_cgroup */ PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; @@ -65,10 +62,6 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } -#define TESTSETPCGFLAG(uname, lname) \ -static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ - { return test_and_set_bit(PCG_##lname, &pc->flags); } - /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -88,22 +81,6 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) -SETPCGFLAG(FileDirty, FILE_DIRTY) -CLEARPCGFLAG(FileDirty, FILE_DIRTY) -TESTPCGFLAG(FileDirty, FILE_DIRTY) -TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) -TESTSETPCGFLAG(FileDirty, FILE_DIRTY) - -SETPCGFLAG(FileWriteback, FILE_WRITEBACK) -CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) -TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) - -SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) - SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) -- cgit v1.2.3-71-gd317 From 323b7fe8f8f6d5ac6214382cf30e8b3a80b265c9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 14 Jan 2011 09:34:29 +0100 Subject: include/gpio.h: remove remaining __must_check-annotiations Commit 5f829e405ec4e96f711165a4a7b55c271d4363e2 (gpiolib: add missing functions to generic fallback) also introduced two. 
Signed-off-by: Wolfram Sang Cc: Greg KH Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gpio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 4b47ed96f131..32720baf70f1 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -35,13 +35,13 @@ static inline int gpio_request(unsigned gpio, const char *label) return -ENOSYS; } -static inline int __must_check gpio_request_one(unsigned gpio, +static inline int gpio_request_one(unsigned gpio, unsigned long flags, const char *label) { return -ENOSYS; } -static inline int __must_check gpio_request_array(struct gpio *array, size_t num) +static inline int gpio_request_array(struct gpio *array, size_t num) { return -ENOSYS; } -- cgit v1.2.3-71-gd317 From 415e12b2379239973feab91850b0dce985c6058a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jan 2011 00:55:09 +0100 Subject: PCI/ACPI: Request _OSC control once for each root bridge (v3) Move the evaluation of acpi_pci_osc_control_set() (to request control of PCI Express native features) into acpi_pci_root_add() to avoid calling it many times for the same root complex with the same arguments. Additionally, check if all of the requisite _OSC support bits are set before calling acpi_pci_osc_control_set() for a given root complex. References: https://bugzilla.kernel.org/show_bug.cgi?id=20232 Reported-by: Ozan Caglayan Tested-by: Ozan Caglayan Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/acpi/apei/hest.c | 22 +++++++++------------- drivers/acpi/pci_root.c | 35 +++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 8 -------- drivers/pci/pcie/aer/aerdrv.c | 1 + drivers/pci/pcie/aer/aerdrv.h | 3 --- drivers/pci/pcie/portdrv.h | 3 --- drivers/pci/pcie/portdrv_acpi.c | 23 ++++------------------- include/acpi/apei.h | 6 ++++++ include/linux/pci-acpi.h | 6 ++++++ include/linux/pci.h | 11 +++++++++++ 10 files changed, 72 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index daa7bc63f1d4..4ee58e72b730 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -195,24 +195,24 @@ static int __init setup_hest_disable(char *str) __setup("hest_disable", setup_hest_disable); -static int __init hest_init(void) +void __init acpi_hest_init(void) { acpi_status status; int rc = -ENODEV; unsigned int ghes_count = 0; if (acpi_disabled) - goto err; + return; if (hest_disable) { - pr_info(HEST_PFX "HEST tabling parsing is disabled.\n"); - goto err; + pr_info(HEST_PFX "Table parsing disabled.\n"); + return; } status = acpi_get_table(ACPI_SIG_HEST, 0, (struct acpi_table_header **)&hest_tab); if (status == AE_NOT_FOUND) { - pr_info(HEST_PFX "Table is not found!\n"); + pr_info(HEST_PFX "Table not found.\n"); goto err; } else if (ACPI_FAILURE(status)) { const char *msg = acpi_format_exception(status); @@ -226,15 +226,11 @@ static int __init hest_init(void) goto err; rc = hest_ghes_dev_register(ghes_count); - if (rc) - goto err; - - pr_info(HEST_PFX "HEST table parsing is initialized.\n"); + if (!rc) { + pr_info(HEST_PFX "Table parsing has been initialized.\n"); + return; + } - return 0; err: hest_disable = 1; - return rc; } - -subsys_initcall(hest_init); diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 96668ad09622..d9766797cd98 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -36,6 +36,7 @@ #include #include #include +#include #define 
PREFIX "ACPI: " @@ -47,6 +48,11 @@ static int acpi_pci_root_add(struct acpi_device *device); static int acpi_pci_root_remove(struct acpi_device *device, int type); static int acpi_pci_root_start(struct acpi_device *device); +#define ACPI_PCIE_REQ_SUPPORT (OSC_EXT_PCI_CONFIG_SUPPORT \ + | OSC_ACTIVE_STATE_PWR_SUPPORT \ + | OSC_CLOCK_PWR_CAPABILITY_SUPPORT \ + | OSC_MSI_SUPPORT) + static const struct acpi_device_id root_device_ids[] = { {"PNP0A03", 0}, {"", 0}, @@ -566,6 +572,33 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) if (flags != base_flags) acpi_pci_osc_support(root, flags); + if (!pcie_ports_disabled + && (flags & ACPI_PCIE_REQ_SUPPORT) == ACPI_PCIE_REQ_SUPPORT) { + flags = OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL + | OSC_PCI_EXPRESS_NATIVE_HP_CONTROL + | OSC_PCI_EXPRESS_PME_CONTROL; + + if (pci_aer_available()) { + if (aer_acpi_firmware_first()) + dev_dbg(root->bus->bridge, + "PCIe errors handled by BIOS.\n"); + else + flags |= OSC_PCI_EXPRESS_AER_CONTROL; + } + + dev_info(root->bus->bridge, + "Requesting ACPI _OSC control (0x%02x)\n", flags); + + status = acpi_pci_osc_control_set(device->handle, &flags, + OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL); + if (ACPI_SUCCESS(status)) + dev_info(root->bus->bridge, + "ACPI _OSC control (0x%02x) granted\n", flags); + else + dev_dbg(root->bus->bridge, + "ACPI _OSC request failed (code %d)\n", status); + } + pci_acpi_add_bus_pm_notifier(device, root->bus); if (device->wakeup.flags.run_wake) device_set_run_wake(root->bus->bridge, true); @@ -603,6 +636,8 @@ static int __init acpi_pci_root_init(void) if (acpi_pci_disabled) return 0; + acpi_hest_init(); + pci_acpi_crs_quirks(); if (acpi_bus_register_driver(&acpi_pci_root_driver) < 0) return -ENODEV; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 7d33f6673868..16ae9659346b 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -140,14 +140,6 @@ static inline void pci_no_msi(void) { } static inline void pci_msi_init_pci_dev(struct pci_dev *dev) { } #endif -#ifdef CONFIG_PCIEAER -void pci_no_aer(void); -bool pci_aer_available(void); -#else -static inline void pci_no_aer(void) { } -static inline bool pci_aer_available(void) { return false; } -#endif - static inline int pci_no_d1d2(struct pci_dev *dev) { unsigned int parent_dstates = 0; diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index 2b2b6508efde..58ad7917553c 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h index 9656e3060412..80c11d131499 100644 --- a/drivers/pci/pcie/aer/aerdrv.h +++ b/drivers/pci/pcie/aer/aerdrv.h @@ -132,7 +132,6 @@ static inline int aer_osc_setup(struct pcie_device *pciedev) #ifdef CONFIG_ACPI_APEI extern int pcie_aer_get_firmware_first(struct pci_dev *pci_dev); -extern bool aer_acpi_firmware_first(void); #else static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) { @@ -140,8 +139,6 @@ static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) return pci_dev->__aer_firmware_first; return 0; } - -static inline bool aer_acpi_firmware_first(void) { return false; } #endif static inline void pcie_aer_force_firmware_first(struct pci_dev *pci_dev, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 8fcc03598b29..bd00a01aef14 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -20,9 +20,6 @@ #define get_descriptor_id(type, 
service) (((type - 4) << 4) | service) -extern bool pcie_ports_disabled; -extern bool pcie_ports_auto; - extern struct bus_type pcie_port_bus_type; extern int pcie_port_device_register(struct pci_dev *dev); #ifdef CONFIG_PM diff --git a/drivers/pci/pcie/portdrv_acpi.c b/drivers/pci/pcie/portdrv_acpi.c index 5982b6a63b89..a86b56e5f2f2 100644 --- a/drivers/pci/pcie/portdrv_acpi.c +++ b/drivers/pci/pcie/portdrv_acpi.c @@ -33,7 +33,7 @@ */ int pcie_port_acpi_setup(struct pci_dev *port, int *srv_mask) { - acpi_status status; + struct acpi_pci_root *root; acpi_handle handle; u32 flags; @@ -44,26 +44,11 @@ int pcie_port_acpi_setup(struct pci_dev *port, int *srv_mask) if (!handle) return -EINVAL; - flags = OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL - | OSC_PCI_EXPRESS_NATIVE_HP_CONTROL - | OSC_PCI_EXPRESS_PME_CONTROL; - - if (pci_aer_available()) { - if (aer_acpi_firmware_first()) - dev_dbg(&port->dev, "PCIe errors handled by BIOS.\n"); - else - flags |= OSC_PCI_EXPRESS_AER_CONTROL; - } - - status = acpi_pci_osc_control_set(handle, &flags, - OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL); - if (ACPI_FAILURE(status)) { - dev_dbg(&port->dev, "ACPI _OSC request failed (code %d)\n", - status); + root = acpi_pci_find_root(handle); + if (!root) return -ENODEV; - } - dev_info(&port->dev, "ACPI _OSC control granted for 0x%02x\n", flags); + flags = root->osc_control_set; *srv_mask = PCIE_PORT_SERVICE_VC; if (flags & OSC_PCI_EXPRESS_NATIVE_HP_CONTROL) diff --git a/include/acpi/apei.h b/include/acpi/apei.h index b3365025ff8d..c4dbb132d902 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -19,6 +19,12 @@ extern int hest_disable; extern int erst_disable; +#ifdef CONFIG_ACPI_APEI +void __init acpi_hest_init(void); +#else +static inline void acpi_hest_init(void) { return; } +#endif + typedef int (*apei_hest_func_t)(struct acpi_hest_header *hest_hdr, void *data); int apei_hest_parse(apei_hest_func_t func, void *data); diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index c8b6473c5f42..479d9bb88e11 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -40,4 +40,10 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { return NULL; } #endif +#ifdef CONFIG_ACPI_APEI +extern bool aer_acpi_firmware_first(void); +#else +static inline bool aer_acpi_firmware_first(void) { return false; } +#endif + #endif /* _PCI_ACPI_H_ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 63cbadce337e..12dd86a82a15 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -994,6 +994,9 @@ extern void pci_restore_msi_state(struct pci_dev *dev); extern int pci_msi_enabled(void); #endif +extern bool pcie_ports_disabled; +extern bool pcie_ports_auto; + #ifndef CONFIG_PCIEASPM static inline int pcie_aspm_enabled(void) { @@ -1003,6 +1006,14 @@ static inline int pcie_aspm_enabled(void) extern int pcie_aspm_enabled(void); #endif +#ifdef CONFIG_PCIEAER +void pci_no_aer(void); +bool pci_aer_available(void); +#else +static inline void pci_no_aer(void) { } +static inline bool pci_aer_available(void) { return false; } +#endif + #ifndef CONFIG_PCIE_ECRC static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { -- cgit v1.2.3-71-gd317 From b6e335aeeb114dccb07eaa09e8b62ff9510cf745 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 29 Dec 2010 13:21:23 +0100 Subject: PCI/PM: Use pm_wakeup_event() directly for reporting wakeup events After recent changes related to wakeup events pm_wakeup_event() automatically checks if the given device is configured to signal wakeup, so pci_wakeup_event() may be a static inline function calling pm_wakeup_event() directly. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 16 ---------------- drivers/pci/pci.h | 6 ++++++ include/linux/pci.h | 1 - 3 files changed, 6 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6762dcae90ab..bf7ad2c09955 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1297,22 +1297,6 @@ bool pci_check_pme_status(struct pci_dev *dev) return ret; } -/* - * Time to wait before the system can be put into a sleep state after reporting - * a wakeup event signaled by a PCI device. - */ -#define PCI_WAKEUP_COOLDOWN 100 - -/** - * pci_wakeup_event - Report a wakeup event related to a given PCI device. - * @dev: Device to report the wakeup event for. - */ -void pci_wakeup_event(struct pci_dev *dev) -{ - if (device_may_wakeup(&dev->dev)) - pm_wakeup_event(&dev->dev, PCI_WAKEUP_COOLDOWN); -} - /** * pci_pme_wakeup - Wake up a PCI device if its PME Status bit is set. * @dev: Device to handle. diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 16ae9659346b..f69d6e0fda75 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -74,6 +74,12 @@ extern void pci_pm_init(struct pci_dev *dev); extern void platform_pci_wakeup_init(struct pci_dev *dev); extern void pci_allocate_cap_save_buffers(struct pci_dev *dev); +static inline void pci_wakeup_event(struct pci_dev *dev) +{ + /* Wait 100 ms before the system can be put into a sleep state. */ + pm_wakeup_event(&dev->dev, 100); +} + static inline bool pci_is_bridge(struct pci_dev *pci_dev) { return !!(pci_dev->subordinate); diff --git a/include/linux/pci.h b/include/linux/pci.h index 12dd86a82a15..9e67dcd054c4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -820,7 +820,6 @@ int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); -void pci_wakeup_event(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, -- cgit v1.2.3-71-gd317 From 49731baa41df404c2c3f44555869ab387363af43 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Jan 2011 18:43:57 +0100 Subject: block: restore multiple bd_link_disk_holder() support Commit e09b457b (block: simplify holder symlink handling) incorrectly assumed that there is only one link at maximum. dm may use multiple links and expects block layer to track reference count for each link, which is different from and unrelated to the exclusive device holder identified by @holder when the device is opened. Remove the single holder assumption and automatic removal of the link and revive the per-link reference count tracking. The code essentially behaves the same as before commit e09b457b sans the unnecessary kobject reference count dancing. While at it, note that this facility should not be used by anyone else than the current ones. Sysfs symlinks shouldn't be abused like this and the whole thing doesn't belong in the block layer at all. 
Signed-off-by: Tejun Heo Reported-by: Milan Broz Cc: Jun'ichi Nomura Cc: Neil Brown Cc: linux-raid@vger.kernel.org Cc: Kay Sievers Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 1 + drivers/md/md.c | 1 + fs/block_dev.c | 93 +++++++++++++++++++++++++++++++++++++++++---------- include/linux/fs.h | 8 ++++- 4 files changed, 84 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index dffa0ac7c4f0..38e4eb1bb965 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -350,6 +350,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) if (!d->dm_dev.bdev) return; + bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); d->dm_dev.bdev = NULL; } diff --git a/drivers/md/md.c b/drivers/md/md.c index cf8594c5ea21..b76cfc89e1b5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1912,6 +1912,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; diff --git a/fs/block_dev.c b/fs/block_dev.c index fe3f59c14a02..333a7bb4cb9c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,6 +432,9 @@ static void init_once(void *foo) mutex_init(&bdev->bd_mutex); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_disks); +#endif inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -779,6 +782,23 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, } #ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + static int add_symlink(struct kobject *from, struct kobject *to) { return sysfs_create_link(from, to, kobject_name(to)); @@ -794,6 +814,8 @@ static void del_symlink(struct kobject *from, struct kobject *to) * @bdev: the claimed slave bdev * @disk: the holding disk * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * * This functions creates the following sysfs symlinks. 
* * - from "slaves" directory of the holder @disk to the claimed @bdev @@ -817,47 +839,83 @@ static void del_symlink(struct kobject *from, struct kobject *to) */ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { + struct bd_holder_disk *holder; int ret = 0; mutex_lock(&bdev->bd_mutex); - WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk); + WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) goto out_unlock; - ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - if (ret) + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; goto out_unlock; + } - ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); - if (ret) { - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; goto out_unlock; } - bdev->bd_holder_disk = disk; + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: + kfree(holder); out_unlock: mutex_unlock(&bdev->bd_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -static void bd_unlink_disk_holder(struct block_device *bdev) +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. + */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { - struct gendisk *disk = bdev->bd_holder_disk; + struct bd_holder_disk *holder; - bdev->bd_holder_disk = NULL; - if (!disk) - return; + mutex_lock(&bdev->bd_mutex); - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + holder = bd_find_holder_disk(bdev, disk); + + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, + &disk_to_dev(disk)->kobj); + list_del_init(&holder->list); + kfree(holder); + } + + mutex_unlock(&bdev->bd_mutex); } -#else -static inline void bd_unlink_disk_holder(struct block_device *bdev) -{ } +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif /** @@ -1380,7 +1438,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * unblock evpoll if it was a write holder. 
*/ if (bdev_free) { - bd_unlink_disk_holder(bdev); if (bdev->bd_write_holder) { disk_unblock_events(bdev->bd_disk); bdev->bd_write_holder = false; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3984f2358d1f..fb2190349cdf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -666,7 +666,7 @@ struct block_device { int bd_holders; bool bd_write_holder; #ifdef CONFIG_SYSFS - struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ + struct list_head bd_holder_disks; #endif struct block_device * bd_contains; unsigned bd_block_size; @@ -2058,12 +2058,18 @@ extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +extern void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } +static inline void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ +} #endif #endif -- cgit v1.2.3-71-gd317 From ab0724ffee24409a7f81afb539b2ac29090bff3e Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Sat, 15 Jan 2011 00:07:22 +0100 Subject: PCI / ACPI: Fix build issue in pci_root.c for !CONFIG_PCIEPORTBUS The compilation of drivers/acpi/pci_root.c fails if CONFIG_PCIEPORTBUS is unset. Fix the problem. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- include/linux/pci.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e67dcd054c4..559d02897075 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -993,8 +993,13 @@ extern void pci_restore_msi_state(struct pci_dev *dev); extern int pci_msi_enabled(void); #endif +#ifdef CONFIG_PCIEPORTBUS extern bool pcie_ports_disabled; extern bool pcie_ports_auto; +#else +#define pcie_ports_disabled true +#define pcie_ports_auto false +#endif #ifndef CONFIG_PCIEASPM static inline int pcie_aspm_enabled(void) -- cgit v1.2.3-71-gd317 From 98d530fe246b65fbd3cdeeeca319a80c46cb4793 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Sat, 1 Jan 2011 23:00:23 +0000 Subject: Fix dmaengine_submit() return type desc->tx_submit's return type is dma_cookie_t, not int. Therefore, dmaengine_submit() should match this return type as it's just wrapping this detail. 
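As an illustration of the benefit (not part of the patch; desc is assumed to be an already-prepared descriptor), callers can now keep the cookie in its natural type:

        dma_cookie_t cookie = dmaengine_submit(desc);

        if (dma_submit_error(cookie))   /* a negative cookie signals a failed submission */
                return -EIO;
        /* the cookie can later be handed to the completion-status helpers unchanged */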
Signed-off-by: Russell King Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 9d8688b92d8b..830935b7c49c 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -532,7 +532,7 @@ static inline int dmaengine_resume(struct dma_chan *chan) return dmaengine_device_control(chan, DMA_RESUME, 0); } -static inline int dmaengine_submit(struct dma_async_tx_descriptor *desc) +static inline dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc) { return desc->tx_submit(desc); } -- cgit v1.2.3-71-gd317 From 96a608a4bfd8468c21881b3f92024923886eb015 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 14 Jan 2011 17:51:11 -0800 Subject: ARM: PL08x: fix a warning drivers/dma/amba-pl08x.c: In function 'pl08x_start_txd': drivers/dma/amba-pl08x.c:205: warning: dereferencing 'void *' pointer We never dereference llis_va aside from assigning it to a struct pl08x_lli pointer or calculating the address of array element 0. Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 933b4ed12be5..5b87b6aac3f8 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -118,7 +118,7 @@ struct pl08x_txd { dma_addr_t dst_addr; size_t len; dma_addr_t llis_bus; - void *llis_va; + struct pl08x_lli *llis_va; bool active; /* Default cctl value for LLIs */ u32 cctl; -- cgit v1.2.3-71-gd317 From 9875cf806403fae66b2410a3c2cc820d97731e04 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:21 +0000 Subject: Add a dentry op to handle automounting rather than abusing follow_link() Add a dentry op (d_automount) to handle automounting directories rather than abusing the follow_link() inode operation. The operation is keyed off a new dentry flag (DCACHE_NEED_AUTOMOUNT). This also makes it easier to add an AT_ flag to suppress terminal segment automount during pathwalk and removes the need for the kludge code in the pathwalk algorithm to handle directories with follow_link() semantics. The ->d_automount() dentry operation: struct vfsmount *(*d_automount)(struct path *mountpoint); takes a pointer to the directory to be mounted upon, which is expected to provide sufficient data to determine what should be mounted. If successful, it should return the vfsmount struct it creates (which it should also have added to the namespace using do_add_mount() or similar). If there's a collision with another automount attempt, NULL should be returned. If the directory specified by the parameter should be used directly rather than being mounted upon, -EISDIR should be returned. In any other case, an error code should be returned. The ->d_automount() operation is called with no locks held and may sleep. At this point the pathwalk algorithm will be in ref-walk mode. Within fs/namei.c itself, a new pathwalk subroutine (follow_automount()) is added to handle mountpoints. It will return -EREMOTE if the automount flag was set, but no d_automount() op was supplied, -ELOOP if we've encountered too many symlinks or mountpoints, -EISDIR if the walk point should be used without mounting and 0 if successful. The path will be updated to point to the mounted filesystem if a successful automount took place. 
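A minimal filesystem-side sketch following the contract described above (the filesystem name and its submount helper are hypothetical; the real in-tree conversions appear in later patches of this series):

static struct vfsmount *examplefs_d_automount(struct path *path)
{
        struct vfsmount *mnt;

        /* examplefs_do_submount() is an assumed fs-specific helper that
         * creates the vfsmount and adds it to the namespace (for instance
         * via do_add_mount()). */
        mnt = examplefs_do_submount(path->mnt, path->dentry);
        if (IS_ERR(mnt))
                return mnt;     /* error, or ERR_PTR(-EISDIR) to use the dir as-is */
        if (!mnt)
                return NULL;    /* somebody else completed the automount first */
        return mnt;
}

static const struct dentry_operations examplefs_dentry_ops = {
        .d_automount    = examplefs_d_automount,
};

The operation is then reached automatically once the inode is marked S_AUTOMOUNT (or the dentry is otherwise flagged DCACHE_NEED_AUTOMOUNT), as described below.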
__follow_mount() is replaced by follow_managed() which is more generic (especially with the patch that adds ->d_manage()). This handles transits from directories during pathwalk, including automounting and skipping over mountpoints (and holding processes with the next patch). __follow_mount_rcu() will jump out of RCU-walk mode if it encounters an automount point with nothing mounted on it. follow_dotdot*() does not handle automounts as you don't want to trigger them whilst following "..". I've also extracted the mount/don't-mount logic from autofs4 and included it here. It makes the mount go ahead anyway if someone calls open() or creat(), tries to traverse the directory, tries to chdir/chroot/etc. into the directory, or sticks a '/' on the end of the pathname. If they do a stat(), however, they'll only trigger the automount if they didn't also say O_NOFOLLOW. I've also added an inode flag (S_AUTOMOUNT) so that filesystems can mark their inodes as automount points. This flag is automatically propagated to the dentry as DCACHE_NEED_AUTOMOUNT by __d_instantiate(). This saves NFS and could save AFS a private flag bit apiece, but is not strictly necessary. It would be preferable to do the propagation in d_set_d_op(), but that doesn't normally have access to the inode. [AV: fixed breakage in case if __follow_mount_rcu() fails and nameidata_drop_rcu() succeeds in RCU case of do_lookup(); we need to fall through to non-RCU case after that, rather than just returning with ungrabbed *path] Signed-off-by: David Howells Was-Acked-by: Ian Kent Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 + Documentation/filesystems/vfs.txt | 14 +++ fs/dcache.c | 5 +- fs/namei.c | 226 ++++++++++++++++++++++++++++---------- include/linux/dcache.h | 7 +- include/linux/fs.h | 2 + 6 files changed, 198 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 977d8919cc69..5f0c52a07386 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -19,6 +19,7 @@ prototypes: void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); + struct vfsmount *(*d_automount)(struct path *path); locking rules: rename_lock ->d_lock may block rcu-walk @@ -29,6 +30,7 @@ d_delete: no yes no no d_release: no no yes no d_iput: no no yes no d_dname: no no no no +d_automount: no no yes no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index cae6d27c9f5b..726a4f6fa3c9 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -864,6 +864,7 @@ struct dentry_operations { void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -930,6 +931,19 @@ struct dentry_operations { at the end of the buffer, and returns a pointer to the first char. dynamic_dname() helper function is provided to take care of this. + d_automount: called when an automount dentry is to be traversed (optional). + This should create a new VFS mount record, mount it on the directory + and return the record to the caller. 
The caller is supplied with a + path parameter giving the automount directory to describe the automount + target and the parent VFS mount record to provide inheritable mount + parameters. NULL should be returned if someone else managed to make + the automount first. If the automount failed, then an error code + should be returned. + + This function is only used if DCACHE_NEED_AUTOMOUNT is set on the + dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the + inode being added. + Example : static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) diff --git a/fs/dcache.c b/fs/dcache.c index 0c6d5c549d84..51f7bb6463af 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1380,8 +1380,11 @@ EXPORT_SYMBOL(d_set_d_op); static void __d_instantiate(struct dentry *dentry, struct inode *inode) { spin_lock(&dentry->d_lock); - if (inode) + if (inode) { + if (unlikely(IS_AUTOMOUNT(inode))) + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; list_add(&dentry->d_alias, &inode->i_dentry); + } dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/namei.c b/fs/namei.c index 529e917ad2fc..16109da68bbf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -896,51 +896,120 @@ int follow_up(struct path *path) } /* - * serialization is taken care of in namespace.c + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. */ -static void __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); - if (!mounted) - return; - path->mnt = mounted; - path->dentry = mounted->mnt_root; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); - *inode = path->dentry->d_inode; + struct vfsmount *mnt; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We want to mount if someone is trying to open/create a file of any + * type under the mountpoint, wants to traverse through the mountpoint + * or wants to open the mounted directory. + * + * We don't want to mount if someone's just doing a stat and they've + * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and + * appended a '/' to the name. + */ + if (!(flags & LOOKUP_FOLLOW) && + !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE))) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. 
+ */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + return -EREMOTE; + return PTR_ERR(mnt); } -} + if (!mnt) /* mount collision */ + return 0; -static int __follow_mount(struct path *path) -{ - int res = 0; - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - if (res) - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - res = 1; + if (mnt->mnt_sb == path->mnt->mnt_sb && + mnt->mnt_root == path->dentry) { + mntput(mnt); + return -ELOOP; } - return res; + + dput(path->dentry); + if (*need_mntput) + mntput(path->mnt); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + *need_mntput = true; + return 0; } -static void follow_mount(struct path *path) +/* + * Handle a dentry that is managed in some way. + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. + * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); + unsigned managed; + bool need_mntput = false; + int ret; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before we managed to get the + * vfsmount_lock */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + continue; + } + + /* We didn't change the current path point */ + break; } + return 0; } int follow_down(struct path *path) @@ -958,13 +1027,37 @@ int follow_down(struct path *path) return 0; } +/* + * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we + * meet an automount point and we're not walking to "..". True is returned to + * continue, false to abort. 
+ */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode, bool reverse_transit) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + break; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *inode = path->dentry->d_inode; + } + + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + return reverse_transit; + return true; +} + static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; set_root_rcu(nd); - while(1) { + while (1) { if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; @@ -987,12 +1080,28 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); inode = nd->path.dentry->d_inode; } - __follow_mount_rcu(nd, &nd->path, &inode); + __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; return 0; } +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + } +} + static void follow_dotdot(struct nameidata *nd) { set_root(nd); @@ -1057,12 +1166,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; struct inode *dir; + int err; + /* * See if the low-level filesystem might want * to use its own hash.. */ if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - int err = parent->d_op->d_hash(parent, nd->inode, name); + err = parent->d_op->d_hash(parent, nd->inode, name); if (err < 0) return err; } @@ -1092,20 +1203,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, done2: path->mnt = mnt; path->dentry = dentry; - __follow_mount_rcu(nd, path, inode); - } else { - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; + if (likely(__follow_mount_rcu(nd, path, inode, false))) + return 0; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + /* fallthru */ + } + dentry = __d_lookup(parent, name); + if (!dentry) + goto need_lookup; found: - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; + if (dentry->d_flags & DCACHE_OP_REVALIDATE) + goto need_revalidate; done: - path->mnt = mnt; - path->dentry = dentry; - __follow_mount(path); - *inode = path->dentry->d_inode; - } + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) + return err; + *inode = path->dentry->d_inode; return 0; need_lookup: @@ -2203,11 +2319,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (open_flag & O_EXCL) goto exit_dput; - if (__follow_mount(path)) { - error = -ELOOP; - if (open_flag & O_NOFOLLOW) - goto exit_dput; - } + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; error = -ENOENT; if (!path->dentry->d_inode) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 59fcd24b1468..ee6c26d142c3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -167,6 +167,7 @@ struct dentry_operations { void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount 
*(*d_automount)(struct path *); } ____cacheline_aligned; /* @@ -205,13 +206,17 @@ struct dentry_operations { #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 -#define DCACHE_MOUNTED 0x0400 /* is a mountpoint */ #define DCACHE_OP_HASH 0x1000 #define DCACHE_OP_COMPARE 0x2000 #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 +#define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ +#define DCACHE_MANAGED_DENTRY \ + (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT) + extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) diff --git a/include/linux/fs.h b/include/linux/fs.h index 3984f2358d1f..4c00ed53be8f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -242,6 +242,7 @@ struct inodes_stat_t { #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE 512 /* Inode is fs-internal */ #define S_IMA 1024 /* Inode has an associated IMA struct */ +#define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -277,6 +278,7 @@ struct inodes_stat_t { #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) +#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. */ -- cgit v1.2.3-71-gd317 From cc53ce53c86924bfe98a12ea20b7465038a08792 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:26 +0000 Subject: Add a dentry op to allow processes to be held during pathwalk transit Add a dentry op (d_manage) to permit a filesystem to hold a process and make it sleep when it tries to transit away from one of that filesystem's directories during a pathwalk. The operation is keyed off a new dentry flag (DCACHE_MANAGE_TRANSIT). The filesystem is allowed to be selective about which processes it holds and which it permits to continue on or prohibits from transiting from each flagged directory. This will allow autofs to hold up client processes whilst letting its userspace daemon through to maintain the directory or the stuff behind it or mounted upon it. The ->d_manage() dentry operation: int (*d_manage)(struct path *path, bool mounting_here); takes a pointer to the directory about to be transited away from and a flag indicating whether the transit is undertaken by do_add_mount() or do_move_mount() skipping through a pile of filesystems mounted on a mountpoint. It should return 0 if successful and to let the process continue on its way; -EISDIR to prohibit the caller from skipping to overmounted filesystems or automounting, and to use this directory; or some other error code to return to the user. ->d_manage() is called with namespace_sem writelocked if mounting_here is true and no other locks held, so it may sleep. However, if mounting_here is true, it may not initiate or wait for a mount or unmount upon the parameter directory, even if the act is actually performed by userspace. Within fs/namei.c, follow_managed() is extended to check with d_manage() first on each managed directory, before transiting away from it or attempting to automount upon it. follow_down() is renamed follow_down_one() and should only be used where the filesystem deliberately intends to avoid management steps (e.g. autofs). 
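For illustration, a hypothetical ->d_manage() following the return-value convention just described (the helper names are invented and not part of this patch; see the sketch below before the caller-side changes are discussed):

static int examplefs_d_manage(struct dentry *dentry, bool mounting_here)
{
        if (mounting_here) {
                /* namespace_sem is held by the caller; a filesystem might
                 * only let its own control daemon mount here.
                 * examplefs_caller_is_daemon() is an assumed helper. */
                return examplefs_caller_is_daemon() ? 0 : -EACCES;
        }

        /* examplefs_expire_wait() is an assumed helper that may sleep until
         * the daemon has finished working behind this directory. */
        if (examplefs_dir_is_expiring(dentry))
                return examplefs_expire_wait(dentry);

        return 0;       /* let the walker transit normally */
}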
A new follow_down() is added that incorporates the loop done by all other callers of follow_down() (do_add/move_mount(), autofs and NFSD; whilst AFS, NFS and CIFS do use it, their use is removed by converting them to use d_automount()). The new follow_down() calls d_manage() as appropriate. It also takes an extra parameter to indicate if it is being called from mount code (with namespace_sem writelocked) which it passes to d_manage(). follow_down() ignores automount points so that it can be used to mount on them. __follow_mount_rcu() is made to abort rcu-walk mode if it hits a directory with DCACHE_MANAGE_TRANSIT set on the basis that we're probably going to have to sleep. It would be possible to enter d_manage() in rcu-walk mode too, and have that determine whether to abort or not itself. That would allow the autofs daemon to continue on in rcu-walk mode. Note that DCACHE_MANAGE_TRANSIT on a directory should be cleared when it isn't required as every tranist from that directory will cause d_manage() to be invoked. It can always be set again when necessary. ========================== WHAT THIS MEANS FOR AUTOFS ========================== Autofs currently uses the lookup() inode op and the d_revalidate() dentry op to trigger the automounting of indirect mounts, and both of these can be called with i_mutex held. autofs knows that the i_mutex will be held by the caller in lookup(), and so can drop it before invoking the daemon - but this isn't so for d_revalidate(), since the lock is only held on _some_ of the code paths that call it. This means that autofs can't risk dropping i_mutex from its d_revalidate() function before it calls the daemon. The bug could manifest itself as, for example, a process that's trying to validate an automount dentry that gets made to wait because that dentry is expired and needs cleaning up: mkdir S ffffffff8014e05a 0 32580 24956 Call Trace: [] :autofs4:autofs4_wait+0x674/0x897 [] avc_has_perm+0x46/0x58 [] autoremove_wake_function+0x0/0x2e [] :autofs4:autofs4_expire_wait+0x41/0x6b [] :autofs4:autofs4_revalidate+0x91/0x149 [] __lookup_hash+0xa0/0x12f [] lookup_create+0x46/0x80 [] sys_mkdirat+0x56/0xe4 versus the automount daemon which wants to remove that dentry, but can't because the normal process is holding the i_mutex lock: automount D ffffffff8014e05a 0 32581 1 32561 Call Trace: [] __mutex_lock_slowpath+0x60/0x9b [] do_path_lookup+0x2ca/0x2f1 [] .text.lock.mutex+0xf/0x14 [] do_rmdir+0x77/0xde [] tracesys+0x71/0xe0 [] tracesys+0xd5/0xe0 which means that the system is deadlocked. This patch allows autofs to hold up normal processes whilst the daemon goes ahead and does things to the dentry tree behind the automouter point without risking a deadlock as almost no locks are held in d_manage() and none in d_automount(). 
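As a sketch of the flag handling mentioned in the note above (not taken from this patch), a filesystem arms and disarms the transit gate by toggling the dentry flag under d_lock:

        /* arm: every transit through this dentry now invokes d_manage() */
        spin_lock(&dentry->d_lock);
        dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
        spin_unlock(&dentry->d_lock);

        /* later, once gating is no longer needed, clear it again */
        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
        spin_unlock(&dentry->d_lock);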
Signed-off-by: David Howells Was-Acked-by: Ian Kent Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/vfs.txt | 21 +++++++++++- drivers/staging/autofs/dirhash.c | 5 ++- fs/afs/mntpt.c | 5 +-- fs/autofs4/autofs_i.h | 13 ------- fs/autofs4/dev-ioctl.c | 2 +- fs/autofs4/expire.c | 2 +- fs/autofs4/root.c | 11 +++--- fs/cifs/cifs_dfs_ref.c | 5 +-- fs/namei.c | 72 +++++++++++++++++++++++++++++++++++++-- fs/namespace.c | 14 ++++---- fs/nfs/namespace.c | 5 +-- fs/nfsd/vfs.c | 5 +-- include/linux/dcache.h | 11 ++++-- include/linux/namei.h | 3 +- 15 files changed, 126 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 5f0c52a07386..cbf98b989b11 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -20,6 +20,7 @@ prototypes: void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); + int (*d_manage)(struct dentry *, bool); locking rules: rename_lock ->d_lock may block rcu-walk @@ -31,6 +32,7 @@ d_release: no no yes no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no +d_manage: no no yes no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 726a4f6fa3c9..4682586b147a 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -865,6 +865,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(struct dentry *, bool); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -938,12 +939,30 @@ struct dentry_operations { target and the parent VFS mount record to provide inheritable mount parameters. NULL should be returned if someone else managed to make the automount first. If the automount failed, then an error code - should be returned. + should be returned. If -EISDIR is returned, then the directory will + be treated as an ordinary directory and returned to pathwalk to + continue walking. This function is only used if DCACHE_NEED_AUTOMOUNT is set on the dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the inode being added. + d_manage: called to allow the filesystem to manage the transition from a + dentry (optional). This allows autofs, for example, to hold up clients + waiting to explore behind a 'mountpoint' whilst letting the daemon go + past and construct the subtree there. 0 should be returned to let the + calling process continue. -EISDIR can be returned to tell pathwalk to + use this directory as an ordinary directory and to ignore anything + mounted on it and not to check the automount flag. Any other error + code will abort pathwalk completely. + + If the 'mounting_here' parameter is true, then namespace_sem is being + held by the caller and the function should not initiate any mounts or + unmounts that it will then wait for. + + This function is only used if DCACHE_MANAGE_TRANSIT is set on the + dentry being transited from. 
+ Example : static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) diff --git a/drivers/staging/autofs/dirhash.c b/drivers/staging/autofs/dirhash.c index d3f42c8325f7..a08bd7355035 100644 --- a/drivers/staging/autofs/dirhash.c +++ b/drivers/staging/autofs/dirhash.c @@ -88,14 +88,13 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, } path.mnt = mnt; path_get(&path); - if (!follow_down(&path)) { + if (!follow_down_one(&path)) { path_put(&path); DPRINTK(("autofs: not expirable\ (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + follow_down(&path, false); // TODO: need to check error umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index e83c0336e7b5..f3e891d57a2c 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -273,10 +273,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) break; case -EBUSY: /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; + err = follow_down(&nd->path, false); default: mntput(newmnt); break; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 0fffe1c24cec..eb67953452bb 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -229,19 +229,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct path *path) -{ - int res = 0; - - while (d_mountpoint(path->dentry)) { - int followed = follow_down(path); - if (!followed) - break; - res = 1; - } - return res; -} - static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index eff9a419469a..1442da4860e5 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); - if (follow_down(&path)) + if (follow_down_one(&path)) magic = path.mnt->mnt_sb->s_magic; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index cc1d01365905..6a930b90d389 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -56,7 +56,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) path_get(&path); - if (!follow_down(&path)) + if (!follow_down_one(&path)) goto done; if (is_autofs4_dentry(path.dentry)) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 651e4ef563b1..20225636a4e9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -234,7 +234,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) nd->flags); /* * For an expire of a covered direct or offset mount we need - * to break out of follow_down() at the autofs mount trigger + * to break out of follow_down_one() at the autofs mount trigger * (d_mounted--), so we can see the expiring flag, and manage * the blocking and following here until the expire is completed. */ @@ -243,7 +243,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); /* Follow down to our covering mount. 
*/ - if (!follow_down(&nd->path)) + if (!follow_down_one(&nd->path)) goto done; goto follow; } @@ -292,11 +292,10 @@ follow: * multi-mount with no root offset so we don't need * to follow it. */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path)) { - status = -ENOENT; + if (d_managed(dentry)) { + status = follow_down(&nd->path, false); + if (status < 0) goto out_error; - } } done: diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index c68a056f27fd..83479cf63f96 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -273,10 +273,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, break; case -EBUSY: /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; + err = follow_down(&nd->path, false); default: mntput(newmnt); break; diff --git a/fs/namei.c b/fs/namei.c index 16109da68bbf..9d3033dc22e9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -960,6 +960,7 @@ static int follow_automount(struct path *path, unsigned flags, /* * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) * - Flagged as mountpoint * - Flagged as automount point * @@ -979,6 +980,16 @@ static int follow_managed(struct path *path, unsigned flags) while (managed = ACCESS_ONCE(path->dentry->d_flags), managed &= DCACHE_MANAGED_DENTRY, unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + /* Transit to a mounted filesystem. */ if (managed & DCACHE_MOUNTED) { struct vfsmount *mounted = lookup_mnt(path); @@ -1012,7 +1023,7 @@ static int follow_managed(struct path *path, unsigned flags) return 0; } -int follow_down(struct path *path) +int follow_down_one(struct path *path) { struct vfsmount *mounted; @@ -1029,14 +1040,19 @@ int follow_down(struct path *path) /* * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we - * meet an automount point and we're not walking to "..". True is returned to + * meet a managed dentry and we're not walking to "..". True is returned to * continue, false to abort. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, bool reverse_transit) { + unsigned abort_mask = + reverse_transit ? 0 : DCACHE_MANAGE_TRANSIT; + while (d_mountpoint(path->dentry)) { struct vfsmount *mounted; + if (path->dentry->d_flags & abort_mask) + return true; mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; @@ -1086,6 +1102,57 @@ static int follow_dotdot_rcu(struct nameidata *nd) return 0; } +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + * + * Care must be taken as namespace_sem may be held (indicated by mounting_here + * being true). + */ +int follow_down(struct path *path, bool mounting_here) +{ + unsigned managed; + int ret; + + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. 
This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, mounting_here); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} + /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() */ @@ -3530,6 +3597,7 @@ const struct inode_operations page_symlink_inode_operations = { }; EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(follow_down_one); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ diff --git a/fs/namespace.c b/fs/namespace.c index 3ddfd9046c44..d94ccd6ddafd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1844,9 +1844,10 @@ static int do_move_mount(struct path *path, char *old_name) return err; down_write(&namespace_sem); - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto out; + err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; @@ -1940,9 +1941,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, down_write(&namespace_sem); /* Something was mounted here while we slept */ - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto unlock; + err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 74aaf3963c10..bfcb933e5755 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -176,10 +176,7 @@ out_err: path_put(&nd->path); goto out; out_follow: - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; + err = follow_down(&nd->path, false); goto out; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 230b79fbf005..0f79e33a65d7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -88,8 +88,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + err = follow_down(&path, false); + if (err < 0) + goto out; exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ee6c26d142c3..1a87760d6532 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -168,6 +168,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(struct dentry *, bool); } ____cacheline_aligned; /* @@ -214,8 +215,9 @@ struct dentry_operations { #define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT 0x40000 /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ - (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT) + (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) extern seqlock_t rename_lock; @@ -404,7 +406,12 @@ 
static inline void dont_mount(struct dentry *dentry) extern void dput(struct dentry *); -static inline int d_mountpoint(struct dentry *dentry) +static inline bool d_managed(struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_MANAGED_DENTRY; +} + +static inline bool d_mountpoint(struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 18d06add0a40..8ef2c789c2a8 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,7 +79,8 @@ extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -extern int follow_down(struct path *); +extern int follow_down_one(struct path *); +extern int follow_down(struct path *, bool); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); -- cgit v1.2.3-71-gd317 From 6f45b65672c8017d5e210e338bb5858a938ef445 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:31 +0000 Subject: Add an AT_NO_AUTOMOUNT flag to suppress terminal automount Add an AT_NO_AUTOMOUNT flag to suppress terminal automounting of automount point directories. This can be used by fstatat() users to permit the gathering of attributes on an automount point and also prevent mass-automounting of a directory of automount points by ls. Signed-off-by: David Howells Acked-by: Ian Kent Signed-off-by: Al Viro --- fs/namei.c | 6 ++++++ fs/stat.c | 4 +++- include/linux/fcntl.h | 1 + include/linux/namei.h | 2 ++ 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 9d3033dc22e9..dc50bfb2f5d6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -908,6 +908,12 @@ static int follow_automount(struct path *path, unsigned flags, if (!path->dentry->d_op || !path->dentry->d_op->d_automount) return -EREMOTE; + /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT + * and this is the terminal part of the path. + */ + if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE)) + return -EISDIR; /* we actually want to stop here */ + /* We want to mount if someone is trying to open/create a file of any * type under the mountpoint, wants to traverse through the mountpoint * or wants to open the mounted directory. diff --git a/fs/stat.c b/fs/stat.c index 12e90e213900..d5c61cf2b703 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flag & AT_NO_AUTOMOUNT) + lookup_flags |= LOOKUP_NO_AUTOMOUNT; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index afc00af3229b..a562fa5fb4e3 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -45,6 +45,7 @@ #define AT_REMOVEDIR 0x200 /* Remove directory instead of unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. 
*/ +#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ #ifdef __KERNEL__ diff --git a/include/linux/namei.h b/include/linux/namei.h index 8ef2c789c2a8..f276d4fa01fc 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -45,6 +45,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - ending slashes ok even for nonexistent files * - internal "there are more path components" flag * - dentry cache is untrusted; force a real lookup + * - suppress terminal automount */ #define LOOKUP_FOLLOW 0x0001 #define LOOKUP_DIRECTORY 0x0002 @@ -53,6 +54,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_PARENT 0x0010 #define LOOKUP_REVAL 0x0020 #define LOOKUP_RCU 0x0040 +#define LOOKUP_NO_AUTOMOUNT 0x0080 /* * Intent data */ -- cgit v1.2.3-71-gd317 From 36d43a43761b004ad1879ac21471d8fc5f3157ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:42 +0000 Subject: NFS: Use d_automount() rather than abusing follow_link() Make NFS use the new d_automount() dentry operation rather than abusing follow_link() on directories. Signed-off-by: David Howells Acked-by: Trond Myklebust Acked-by: Ian Kent Signed-off-by: Al Viro --- fs/nfs/dir.c | 4 ++- fs/nfs/inode.c | 4 +-- fs/nfs/internal.h | 1 + fs/nfs/namespace.c | 84 ++++++++++++++++++++++++-------------------------- include/linux/nfs_fs.h | 1 - 5 files changed, 46 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index df8c03a02161..2c3eb33b904d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -970,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) { struct nfs_server *server = NFS_SERVER(inode); - if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) + if (IS_AUTOMOUNT(inode)) return 0; if (nd != NULL) { /* VFS wants an on-the-wire revalidation */ @@ -1173,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) @@ -1246,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs_open_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ce00b704452c..d8512423ba72 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; - set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); + inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; @@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update the fsid? 
*/ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && - !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) + !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index bfa3a34af801..4644f04b4b46 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -252,6 +252,7 @@ extern char *nfs_path(const char *base, const struct dentry *droot, const struct dentry *dentry, char *buffer, ssize_t buflen); +extern struct vfsmount *nfs_d_automount(struct path *path); /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index bfcb933e5755..f3fbb1bf3f18 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -97,9 +97,8 @@ Elong: } /* - * nfs_follow_mountpoint - handle crossing a mountpoint on the server - * @dentry - dentry of mountpoint - * @nd - nameidata info + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint * * When we encounter a mountpoint on the server, we want to set up * a mountpoint on the client too, to prevent inode numbers from @@ -109,84 +108,81 @@ Elong: * situation, and that different filesystems may want to use * different security flavours. */ -static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); struct dentry *parent; struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; int err; - dprintk("--> nfs_follow_mountpoint()\n"); + dprintk("--> nfs_d_automount()\n"); - err = -ESTALE; - if (IS_ROOT(dentry)) - goto out_err; + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; - err = -ENOMEM; + mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr(); if (fh == NULL || fattr == NULL) - goto out_err; + goto out; dprintk("%s: enter\n", __func__); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - /* Look it up again */ - parent = dget_parent(nd->path.dentry); + /* Look it up again to get its attributes */ + parent = dget_parent(path->dentry); err = server->nfs_client->rpc_ops->lookup(parent->d_inode, - &nd->path.dentry->d_name, + &path->dentry->d_name, fh, fattr); dput(parent); - if (err != 0) - goto out_err; + if (err != 0) { + mnt = ERR_PTR(err); + goto out; + } if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) - mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); + mnt = nfs_do_refmount(path->mnt, path->dentry); else - mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, - fattr); - err = PTR_ERR(mnt); + mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); if (IS_ERR(mnt)) - goto out_err; + goto out; mntget(mnt); - err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, &nfs_automount_list); - if (err < 0) { + switch (err) { + case 0: + dprintk("%s: done, success\n", __func__); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + break; + case -EBUSY: + /* someone else made a mount here whilst we were busy */ mntput(mnt); - if (err == -EBUSY) - goto out_follow; - goto out_err; + dprintk("%s: done, collision\n", __func__); + mnt = NULL; + break; + default: + mntput(mnt); + dprintk("%s: done, error %d\n", __func__, err); + mnt = ERR_PTR(err); + 
break; } - path_put(&nd->path); - nd->path.mnt = mnt; - nd->path.dentry = dget(mnt->mnt_root); - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); - dprintk("%s: done, returned %d\n", __func__, err); - - dprintk("<-- nfs_follow_mountpoint() = %d\n", err); - return ERR_PTR(err); -out_err: - path_put(&nd->path); - goto out; -out_follow: - err = follow_down(&nd->path, false); - goto out; +out_nofree: + dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); + return mnt; } const struct inode_operations nfs_mountpoint_inode_operations = { - .follow_link = nfs_follow_mountpoint, .getattr = nfs_getattr, }; const struct inode_operations nfs_referral_inode_operations = { - .follow_link = nfs_follow_mountpoint, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 0779bb8f95be..6023efa9f5d9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -215,7 +215,6 @@ struct nfs_inode { #define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ -#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ -- cgit v1.2.3-71-gd317 From 1972580bb4edea3ed6fe273b2ca72f44f10f8c86 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 14 Jan 2011 18:46:40 +0000 Subject: autofs4: Bump version Increase the autofs module sub-version so we can tell what kernel implementation is being used from user space debug logging. Signed-off-by: Ian Kent Signed-off-by: David Howells Signed-off-by: Al Viro --- include/linux/auto_fs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index 8b49ac48a5b7..e02982fa2953 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -24,7 +24,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 1 +#define AUTOFS_PROTO_SUBVERSION 2 /* Mask for expire behaviour */ #define AUTOFS_EXP_IMMEDIATE 1 -- cgit v1.2.3-71-gd317 From ab90911ff90cdab59b31c045c3f0ae480d14f29d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:46:51 +0000 Subject: Allow d_manage() to be used in RCU-walk mode Allow d_manage() to be called from pathwalk when it is in RCU-walk mode as well as when it is in Ref-walk mode. This permits __follow_mount_rcu() to call d_manage() directly. d_manage() needs a parameter to indicate that it is in RCU-walk mode as it isn't allowed to sleep if in that mode (but should return -ECHILD instead). autofs4_d_manage() can then be set to retain RCU-walk mode if the daemon accesses it and otherwise request dropping back to ref-walk mode. 
Signed-off-by: David Howells Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 7 ++++++- fs/autofs4/root.c | 8 ++++++-- fs/namei.c | 15 ++++++++------- include/linux/dcache.h | 2 +- 5 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index cbf98b989b11..39707748ed2d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -32,7 +32,7 @@ d_release: no no yes no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no -d_manage: no no yes no +d_manage: no no yes (ref-walk) maybe --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 4682586b147a..3c4b2f1b64d0 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -865,7 +865,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(struct dentry *, bool, bool); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -960,6 +960,11 @@ struct dentry_operations { held by the caller and the function should not initiate any mounts or unmounts that it will then wait for. + If the 'rcu_walk' parameter is true, then the caller is doing a + pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, + and the caller can be asked to leave it and call again by returing + -ECHILD. + This function is only used if DCACHE_MANAGE_TRANSIT is set on the dentry being transited from. diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 9194e274f849..dbd95512808c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -36,7 +36,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(struct dentry *, bool); +static int autofs4_d_manage(struct dentry *, bool, bool); const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -450,7 +450,7 @@ done: return NULL; } -int autofs4_d_manage(struct dentry *dentry, bool mounting_here) +int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); @@ -464,6 +464,10 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here) return 0; } + /* We need to sleep, so we need pathwalk to be in ref-mode */ + if (rcu_walk) + return -ECHILD; + /* Wait for pending expires */ do_expire_wait(dentry); diff --git a/fs/namei.c b/fs/namei.c index 373852012713..5c89695ae1e4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -987,7 +987,8 @@ static int follow_managed(struct path *path, unsigned flags) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + ret = path->dentry->d_op->d_manage(path->dentry, + false, false); if (ret < 0) return ret == -EISDIR ? 
0 : ret; } @@ -1048,13 +1049,12 @@ int follow_down_one(struct path *path) static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, bool reverse_transit) { - unsigned abort_mask = - reverse_transit ? 0 : DCACHE_MANAGE_TRANSIT; - while (d_mountpoint(path->dentry)) { struct vfsmount *mounted; - if (path->dentry->d_flags & abort_mask) - return true; + if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + !reverse_transit && + path->dentry->d_op->d_manage(path->dentry, false, true) < 0) + return false; mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; @@ -1132,7 +1132,8 @@ int follow_down(struct path *path, bool mounting_here) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, mounting_here); + ret = path->dentry->d_op->d_manage( + path->dentry, mounting_here, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1a87760d6532..f958c19e3ca5 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -168,7 +168,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(struct dentry *, bool, bool); } ____cacheline_aligned; /* -- cgit v1.2.3-71-gd317 From ea5b778a8b98c85a87d66bf844904f9c3802b869 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 19:10:03 +0000 Subject: Unexport do_add_mount() and add in follow_automount(), not ->d_automount() Unexport do_add_mount() and make ->d_automount() return the vfsmount to be added rather than calling do_add_mount() itself. follow_automount() will then do the addition. This slightly complicates things as ->d_automount() normally wants to add the new vfsmount to an expiration list and start an expiration timer. The problem with that is that the vfsmount will be deleted if it has a refcount of 1 and the timer will not repeat if the expiration list is empty. To this end, we require the vfsmount to be returned from d_automount() with a refcount of (at least) 2. One of these refs will be dropped unconditionally. In addition, follow_automount() must get a 3rd ref around the call to do_add_mount() lest it eat a ref and return an error, leaving the mount we have open to being expired as we would otherwise have only 1 ref on it. d_automount() should also add the the vfsmount to the expiration list (by calling mnt_set_expiry()) and start the expiration timer before returning, if this mechanism is to be used. The vfsmount will be unlinked from the expiration list by follow_automount() if do_add_mount() fails. This patch also fixes the call to do_add_mount() for AFS to propagate the mount flags from the parent vfsmount. 
Signed-off-by: David Howells Signed-off-by: Al Viro --- Documentation/filesystems/vfs.txt | 23 ++++++++++++--------- fs/afs/mntpt.c | 25 ++++++----------------- fs/cifs/cifs_dfs_ref.c | 26 ++++++------------------ fs/internal.h | 2 ++ fs/namei.c | 42 ++++++++++++++++++++++++++++++++------- fs/namespace.c | 41 ++++++++++++++++++++++++++++++-------- fs/nfs/namespace.c | 24 ++++------------------ include/linux/mount.h | 7 +------ 8 files changed, 101 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 3c4b2f1b64d0..94cf97b901d7 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -933,15 +933,20 @@ struct dentry_operations { dynamic_dname() helper function is provided to take care of this. d_automount: called when an automount dentry is to be traversed (optional). - This should create a new VFS mount record, mount it on the directory - and return the record to the caller. The caller is supplied with a - path parameter giving the automount directory to describe the automount - target and the parent VFS mount record to provide inheritable mount - parameters. NULL should be returned if someone else managed to make - the automount first. If the automount failed, then an error code - should be returned. If -EISDIR is returned, then the directory will - be treated as an ordinary directory and returned to pathwalk to - continue walking. + This should create a new VFS mount record and return the record to the + caller. The caller is supplied with a path parameter giving the + automount directory to describe the automount target and the parent + VFS mount record to provide inheritable mount parameters. NULL should + be returned if someone else managed to make the automount first. If + the vfsmount creation failed, then an error code should be returned. + If -EISDIR is returned, then the directory will be treated as an + ordinary directory and returned to pathwalk to continue walking. + + If a vfsmount is returned, the caller will attempt to mount it on the + mountpoint and will remove the vfsmount from its expiration list in + the case of failure. The vfsmount should be returned with 2 refs on + it to prevent automatic expiration - the caller will clean up the + additional ref. This function is only used if DCACHE_NEED_AUTOMOUNT is set on the dentry. 
This is set by __d_instantiate() if S_AUTOMOUNT is set on the diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index d23b2e344a78..aa59184151d0 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -241,7 +241,6 @@ error_no_devname: struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - int err; _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); @@ -249,24 +248,12 @@ struct vfsmount *afs_d_automount(struct path *path) if (IS_ERR(newmnt)) return newmnt; - mntget(newmnt); - err = do_add_mount(newmnt, path, MNT_SHRINKABLE, &afs_vfsmounts); - switch (err) { - case 0: - queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, - afs_mntpt_expiry_timeout * HZ); - _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); - return newmnt; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - mntput(newmnt); - _leave(" = NULL [EBUSY]"); - return NULL; - default: - mntput(newmnt); - _leave(" = %d", err); - return ERR_PTR(err); - } + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + return newmnt; } /* diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 0fc163808de3..7ed36536e754 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -351,7 +351,6 @@ free_xid: struct vfsmount *cifs_dfs_d_automount(struct path *path) { struct vfsmount *newmnt; - int err; cFYI(1, "in %s", __func__); @@ -361,25 +360,12 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path) return newmnt; } - mntget(newmnt); - err = do_add_mount(newmnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, - &cifs_dfs_automount_list); - switch (err) { - case 0: - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); - cFYI(1, "leaving %s [ok]" , __func__); - return newmnt; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - mntput(newmnt); - cFYI(1, "leaving %s [EBUSY]" , __func__); - return NULL; - default: - mntput(newmnt); - cFYI(1, "leaving %s [error %d]" , __func__, err); - return ERR_PTR(err); - } + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cFYI(1, "leaving %s [ok]" , __func__); + return newmnt; } const struct inode_operations cifs_dfs_referral_inode_operations = { diff --git a/fs/internal.h b/fs/internal.h index 9687c2ee2735..4931060fd089 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -70,6 +70,8 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, extern void release_mounts(struct list_head *); extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +extern int do_add_mount(struct vfsmount *, struct path *, int); +extern void mnt_clear_expiry(struct vfsmount *); extern void __init mnt_init(void); diff --git a/fs/namei.c b/fs/namei.c index 5c89695ae1e4..c2e37727e3ab 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -900,6 +900,7 @@ static int follow_automount(struct path *path, unsigned flags, bool *need_mntput) { struct vfsmount *mnt; + int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) return -EREMOTE; @@ -942,22 +943,49 @@ static int follow_automount(struct path *path, unsigned flags, return -EREMOTE; return PTR_ERR(mnt); } + if (!mnt) /* mount 
collision */ return 0; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(mnt) < 2); + if (mnt->mnt_sb == path->mnt->mnt_sb && mnt->mnt_root == path->dentry) { + mnt_clear_expiry(mnt); + mntput(mnt); mntput(mnt); return -ELOOP; } - dput(path->dentry); - if (*need_mntput) - mntput(path->mnt); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - *need_mntput = true; - return 0; + /* We need to add the mountpoint to the parent. The filesystem may + * have placed it on an expiry list, and so we need to make sure it + * won't be expired under us if do_add_mount() fails (do_add_mount() + * will eat a reference unconditionally). + */ + mntget(mnt); + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + err = 0; + default: + mnt_clear_expiry(mnt); + mntput(mnt); + mntput(mnt); + return err; + case 0: + mntput(mnt); + dput(path->dentry); + if (*need_mntput) + mntput(path->mnt); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + *need_mntput = true; + return 0; + } } /* diff --git a/fs/namespace.c b/fs/namespace.c index d94ccd6ddafd..bfcb701f9490 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1925,15 +1925,14 @@ static int do_new_mount(struct path *path, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - return do_add_mount(mnt, path, mnt_flags, NULL); + return do_add_mount(mnt, path, mnt_flags); } /* * add a mount into a namespace's mount tree - * - provide the option of adding the new mount to an expiration list + * - this unconditionally eats one of the caller's references to newmnt. */ -int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist) +int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) { int err; @@ -1963,9 +1962,6 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, if ((err = graft_tree(newmnt, path))) goto unlock; - if (fslist) /* add to the specified expiration list */ - list_add_tail(&newmnt->mnt_expire, fslist); - up_write(&namespace_sem); return 0; @@ -1975,7 +1971,36 @@ unlock: return err; } -EXPORT_SYMBOL_GPL(do_add_mount); +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. 
+ */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + list_add_tail(&mnt->mnt_expire, expiry_list); + + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); +} +EXPORT_SYMBOL(mnt_set_expiry); + +/* + * Remove a vfsmount from any expiration list it may be on + */ +void mnt_clear_expiry(struct vfsmount *mnt) +{ + if (!list_empty(&mnt->mnt_expire)) { + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + list_del_init(&mnt->mnt_expire); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + } +} /* * process a list of expirable mountpoints with the intent of discarding any diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index f3fbb1bf3f18..f32b8603dca8 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -149,26 +149,10 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out; - mntget(mnt); - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, - &nfs_automount_list); - switch (err) { - case 0: - dprintk("%s: done, success\n", __func__); - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - mntput(mnt); - dprintk("%s: done, collision\n", __func__); - mnt = NULL; - break; - default: - mntput(mnt); - dprintk("%s: done, error %d\n", __func__, err); - mnt = ERR_PTR(err); - break; - } + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); out: nfs_free_fattr(fattr); diff --git a/include/linux/mount.h b/include/linux/mount.h index 1869ea24a739..af4765ea8fde 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -110,12 +110,7 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); -struct nameidata; - -struct path; -extern int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist); - +extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(char *name); -- cgit v1.2.3-71-gd317 From 672c54466d24994eb9633f993862c89539504a42 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 13 Jan 2011 15:36:09 -0700 Subject: dt/flattree: Return virtual address from early_init_dt_alloc_memory_arch() The physical address is never used by the device tree code when allocating memory for unflattening. Change the architecture's alloc hook to return the virutal address instead. 
Signed-off-by: Grant Likely --- arch/microblaze/kernel/prom.c | 4 ++-- arch/mips/kernel/prom.c | 6 ++---- arch/powerpc/kernel/prom.c | 4 ++-- drivers/of/fdt.c | 8 +------- include/linux/of_fdt.h | 2 +- 5 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index c881393f07fd..bceaa5543e39 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -47,9 +47,9 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) memblock_add(base, size); } -u64 __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - return memblock_alloc(size, align); + return __va(memblock_alloc(size, align)); } #ifdef CONFIG_EARLY_PRINTK diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 9dbe58368953..a19811e98a41 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -45,11 +45,9 @@ void __init free_mem_mach(unsigned long addr, unsigned long size) return free_bootmem(addr, size); } -u64 __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - return virt_to_phys( - __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)) - ); + return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 9e3132db718b..7185f0da7dc3 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -519,9 +519,9 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) memblock_add(base, size); } -u64 __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - return memblock_alloc(size, align); + return __va(memblock_alloc(size, align)); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index c787c3d95c60..af824e7e0367 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -692,12 +692,6 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, return 1; } -static void *__init early_device_tree_alloc(u64 size, u64 align) -{ - unsigned long mem = early_init_dt_alloc_memory_arch(size, align); - return __va(mem); -} - /** * unflatten_device_tree - create tree of device_nodes from flat blob * @@ -709,7 +703,7 @@ static void *__init early_device_tree_alloc(u64 size, u64 align) void __init unflatten_device_tree(void) { __unflatten_device_tree(initial_boot_params, &allnodes, - early_device_tree_alloc); + early_init_dt_alloc_memory_arch); /* Get pointer to OF "/chosen" node for use everywhere */ of_chosen = of_find_node_by_path("/chosen"); diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 0ef22a1f129e..c84d900fbbb3 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -97,7 +97,7 @@ extern void early_init_dt_check_for_initrd(unsigned long node); extern int early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data); extern void early_init_dt_add_memory_arch(u64 base, u64 size); -extern u64 early_init_dt_alloc_memory_arch(u64 size, u64 align); +extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align); extern u64 dt_mem_next_cell(int s, __be32 **cellp); /* -- cgit v1.2.3-71-gd317 From f03c65993b98eeb909a4012ce7833c5857d74755 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Jan 2011 22:30:21 -0500 Subject: sanitize vfsmount 
refcounting changes Instead of splitting refcount between (per-cpu) mnt_count and (SMP-only) mnt_longrefs, make all references contribute to mnt_count again and keep track of how many are longterm ones. Accounting rules for longterm count: * 1 for each fs_struct.root.mnt * 1 for each fs_struct.pwd.mnt * 1 for having non-NULL ->mnt_ns * decrement to 0 happens only under vfsmount lock exclusive That allows nice common case for mntput() - since we can't drop the final reference until after mnt_longterm has reached 0 due to the rules above, mntput() can grab vfsmount lock shared and check mnt_longterm. If it turns out to be non-zero (which is the common case), we know that this is not the final mntput() and can just blindly decrement percpu mnt_count. Otherwise we grab vfsmount lock exclusive and do usual decrement-and-check of percpu mnt_count. For fs_struct.c we have mnt_make_longterm() and mnt_make_shortterm(); namespace.c uses the latter in places where we don't already hold vfsmount lock exclusive and opencodes a few remaining spots where we need to manipulate mnt_longterm. Note that we mostly revert the code outside of fs/namespace.c back to what we used to have; in particular, normal code doesn't need to care about two kinds of references, etc. And we get to keep the optimization Nick's variant had bought us... Signed-off-by: Al Viro --- drivers/mtd/mtdchar.c | 2 +- fs/anon_inodes.c | 2 +- fs/fs_struct.c | 35 ++++++++++----- fs/internal.h | 3 ++ fs/namei.c | 24 ----------- fs/namespace.c | 116 +++++++++++++++++++------------------------------- fs/pipe.c | 2 +- fs/super.c | 2 +- include/linux/mount.h | 4 +- include/linux/path.h | 2 - 10 files changed, 75 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index ee4bb3330bdf..98240575a18d 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1201,7 +1201,7 @@ err_unregister_chdev: static void __exit cleanup_mtdchar(void) { unregister_mtd_user(&mtdchar_notifier); - mntput_long(mtd_inode_mnt); + mntput(mtd_inode_mnt); unregister_filesystem(&mtd_inodefs_type); __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index cbe57f3c4d89..c5567cb78432 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -233,7 +233,7 @@ static int __init anon_inode_init(void) return 0; err_mntput: - mntput_long(anon_inode_mnt); + mntput(anon_inode_mnt); err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); err_exit: diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 68ca487bedb1..78b519c13536 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -4,6 +4,19 @@ #include #include #include +#include "internal.h" + +static inline void path_get_longterm(struct path *path) +{ + path_get(path); + mnt_make_longterm(path->mnt); +} + +static inline void path_put_longterm(struct path *path) +{ + mnt_make_shortterm(path->mnt); + path_put(path); +} /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 
@@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path) write_seqcount_begin(&fs->seq); old_root = fs->root; fs->root = *path; - path_get_long(path); + path_get_longterm(path); write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) - path_put_long(&old_root); + path_put_longterm(&old_root); } /* @@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path) write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; - path_get_long(path); + path_get_longterm(path); write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_pwd.dentry) - path_put_long(&old_pwd); + path_put_longterm(&old_pwd); } void chroot_fs_refs(struct path *old_root, struct path *new_root) @@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) write_seqcount_begin(&fs->seq); if (fs->root.dentry == old_root->dentry && fs->root.mnt == old_root->mnt) { - path_get_long(new_root); + path_get_longterm(new_root); fs->root = *new_root; count++; } if (fs->pwd.dentry == old_root->dentry && fs->pwd.mnt == old_root->mnt) { - path_get_long(new_root); + path_get_longterm(new_root); fs->pwd = *new_root; count++; } @@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) } while_each_thread(g, p); read_unlock(&tasklist_lock); while (count--) - path_put_long(old_root); + path_put_longterm(old_root); } void free_fs_struct(struct fs_struct *fs) { - path_put_long(&fs->root); - path_put_long(&fs->pwd); + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); kmem_cache_free(fs_cachep, fs); } @@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) spin_lock(&old->lock); fs->root = old->root; - path_get_long(&fs->root); + path_get_longterm(&fs->root); fs->pwd = old->pwd; - path_get_long(&fs->pwd); + path_get_longterm(&fs->pwd); spin_unlock(&old->lock); } return fs; diff --git a/fs/internal.h b/fs/internal.h index 4931060fd089..12ccb86edef7 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -73,6 +73,9 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern int do_add_mount(struct vfsmount *, struct path *, int); extern void mnt_clear_expiry(struct vfsmount *); +extern void mnt_make_longterm(struct vfsmount *); +extern void mnt_make_shortterm(struct vfsmount *); + extern void __init mnt_init(void); DECLARE_BRLOCK(vfsmount_lock); diff --git a/fs/namei.c b/fs/namei.c index c2e37727e3ab..8f7b41a14882 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -367,18 +367,6 @@ void path_get(struct path *path) } EXPORT_SYMBOL(path_get); -/** - * path_get_long - get a long reference to a path - * @path: path to get the reference to - * - * Given a path increment the reference count to the dentry and the vfsmount. - */ -void path_get_long(struct path *path) -{ - mntget_long(path->mnt); - dget(path->dentry); -} - /** * path_put - put a reference to a path * @path: path to put the reference to @@ -392,18 +380,6 @@ void path_put(struct path *path) } EXPORT_SYMBOL(path_put); -/** - * path_put_long - put a long reference to a path - * @path: path to put the reference to - * - * Given a path decrement the reference count to the dentry and the vfsmount. 
- */ -void path_put_long(struct path *path) -{ - dput(path->dentry); - mntput_long(path->mnt); -} - /** * nameidata_drop_rcu - drop this nameidata out of rcu-walk * @nd: nameidata pathwalk data to drop diff --git a/fs/namespace.c b/fs/namespace.c index d7fc05fac753..48809e21f270 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt) unsigned int mnt_get_count(struct vfsmount *mnt) { #ifdef CONFIG_SMP - unsigned int count = atomic_read(&mnt->mnt_longrefs); + unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { @@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) if (!mnt->mnt_pcp) goto out_free_devname; - atomic_set(&mnt->mnt_longrefs, 1); + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; @@ -624,8 +624,11 @@ static void commit_tree(struct vfsmount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) + list_for_each_entry(m, &head, mnt_list) { m->mnt_ns = n; + atomic_inc(&m->mnt_longterm); + } + list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + @@ -734,51 +737,30 @@ static inline void mntfree(struct vfsmount *mnt) deactivate_super(sb); } -#ifdef CONFIG_SMP -static inline void __mntput(struct vfsmount *mnt, int longrefs) +static void mntput_no_expire(struct vfsmount *mnt) { - if (!longrefs) { put_again: - br_read_lock(vfsmount_lock); - if (likely(atomic_read(&mnt->mnt_longrefs))) { - mnt_dec_count(mnt); - br_read_unlock(vfsmount_lock); - return; - } +#ifdef CONFIG_SMP + br_read_lock(vfsmount_lock); + if (likely(atomic_read(&mnt->mnt_longterm))) { + mnt_dec_count(mnt); br_read_unlock(vfsmount_lock); - } else { - BUG_ON(!atomic_read(&mnt->mnt_longrefs)); - if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1)) - return; + return; } + br_read_unlock(vfsmount_lock); br_write_lock(vfsmount_lock); - if (!longrefs) - mnt_dec_count(mnt); - else - atomic_dec(&mnt->mnt_longrefs); + mnt_dec_count(mnt); if (mnt_get_count(mnt)) { br_write_unlock(vfsmount_lock); return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - br_write_unlock(vfsmount_lock); - acct_auto_close_mnt(mnt); - goto put_again; - } - br_write_unlock(vfsmount_lock); - mntfree(mnt); -} #else -static inline void __mntput(struct vfsmount *mnt, int longrefs) -{ -put_again: mnt_dec_count(mnt); if (likely(mnt_get_count(mnt))) return; br_write_lock(vfsmount_lock); +#endif if (unlikely(mnt->mnt_pinned)) { mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; @@ -789,12 +771,6 @@ put_again: br_write_unlock(vfsmount_lock); mntfree(mnt); } -#endif - -static void mntput_no_expire(struct vfsmount *mnt) -{ - __mntput(mnt, 0); -} void mntput(struct vfsmount *mnt) { @@ -802,7 +778,7 @@ void mntput(struct vfsmount *mnt) /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ if (unlikely(mnt->mnt_expiry_mark)) mnt->mnt_expiry_mark = 0; - __mntput(mnt, 0); + mntput_no_expire(mnt); } } EXPORT_SYMBOL(mntput); @@ -815,33 +791,6 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mntput_long(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - if (mnt) { - /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ - if (unlikely(mnt->mnt_expiry_mark)) - mnt->mnt_expiry_mark = 0; - __mntput(mnt, 1); - } -#else - mntput(mnt); -#endif -} -EXPORT_SYMBOL(mntput_long); - -struct vfsmount *mntget_long(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - if (mnt) - 
atomic_inc(&mnt->mnt_longrefs); - return mnt; -#else - return mntget(mnt); -#endif -} -EXPORT_SYMBOL(mntget_long); - void mnt_pin(struct vfsmount *mnt) { br_write_lock(vfsmount_lock); @@ -1216,7 +1165,7 @@ void release_mounts(struct list_head *head) dput(dentry); mntput(m); } - mntput_long(mnt); + mntput(mnt); } } @@ -1240,6 +1189,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + atomic_dec(&p->mnt_longterm); list_del_init(&p->mnt_child); if (p->mnt_parent != p) { p->mnt_parent->mnt_ghosts++; @@ -1969,7 +1919,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) unlock: up_write(&namespace_sem); - mntput_long(newmnt); + mntput(newmnt); return err; } @@ -2291,6 +2241,20 @@ static struct mnt_namespace *alloc_mnt_ns(void) return new_ns; } +void mnt_make_longterm(struct vfsmount *mnt) +{ + atomic_inc(&mnt->mnt_longterm); +} + +void mnt_make_shortterm(struct vfsmount *mnt) +{ + if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) + return; + br_write_lock(vfsmount_lock); + atomic_dec(&mnt->mnt_longterm); + br_write_unlock(vfsmount_lock); +} + /* * Allocate a new namespace structure and populate it with contents * copied from the namespace of the passed in task structure. @@ -2328,14 +2292,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, q = new_ns->root; while (p) { q->mnt_ns = new_ns; + atomic_inc(&q->mnt_longterm); if (fs) { if (p == fs->root.mnt) { + fs->root.mnt = mntget(q); + atomic_inc(&q->mnt_longterm); + mnt_make_shortterm(p); rootmnt = p; - fs->root.mnt = mntget_long(q); } if (p == fs->pwd.mnt) { + fs->pwd.mnt = mntget(q); + atomic_inc(&q->mnt_longterm); + mnt_make_shortterm(p); pwdmnt = p; - fs->pwd.mnt = mntget_long(q); } } p = next_mnt(p, mnt_ns->root); @@ -2344,9 +2313,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, up_write(&namespace_sem); if (rootmnt) - mntput_long(rootmnt); + mntput(rootmnt); if (pwdmnt) - mntput_long(pwdmnt); + mntput(pwdmnt); return new_ns; } @@ -2379,6 +2348,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) new_ns = alloc_mnt_ns(); if (!IS_ERR(new_ns)) { mnt->mnt_ns = new_ns; + atomic_inc(&mnt->mnt_longterm); new_ns->root = mnt; list_add(&new_ns->list, &new_ns->root->mnt_list); } diff --git a/fs/pipe.c b/fs/pipe.c index e2e95fb46a1e..89e9e19b1b2e 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void) static void __exit exit_pipe_fs(void) { unregister_filesystem(&pipe_fs_type); - mntput_long(pipe_mnt); + mntput(pipe_mnt); } fs_initcall(init_pipe_fs); diff --git a/fs/super.c b/fs/super.c index 4f6a3571a634..74e149efed81 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1141,7 +1141,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return mnt; err: - mntput_long(mnt); + mntput(mnt); return ERR_PTR(err); } diff --git a/include/linux/mount.h b/include/linux/mount.h index af4765ea8fde..604f122a2326 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -60,7 +60,7 @@ struct vfsmount { struct super_block *mnt_sb; /* pointer to superblock */ #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; - atomic_t mnt_longrefs; + atomic_t mnt_longterm; /* how many of the refs are longterm */ #else int mnt_count; int mnt_writers; @@ -96,8 +96,6 @@ extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); extern void mntput(struct vfsmount *mnt); 
extern struct vfsmount *mntget(struct vfsmount *mnt); -extern void mntput_long(struct vfsmount *mnt); -extern struct vfsmount *mntget_long(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); extern int __mnt_is_readonly(struct vfsmount *mnt); diff --git a/include/linux/path.h b/include/linux/path.h index a581e8c06533..edc98dec6266 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -10,9 +10,7 @@ struct path { }; extern void path_get(struct path *); -extern void path_get_long(struct path *); extern void path_put(struct path *); -extern void path_put_long(struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { -- cgit v1.2.3-71-gd317 From fc8fe1e992ae0326a88edbe4d6793e840bbdd4ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 16 Jan 2011 20:42:43 +0100 Subject: PCI / ACPI: Fix build of the AER driver for CONFIG_ACPI unset After commit 415e12b23792 ("PCI/ACPI: Request _OSC control once for each root bridge (v3)") include/linux/pci-acpi.h is included by drivers/pci/pcie/aer/aerdrv.c and if CONFIG_ACPI is unset, the bogus and unnecessary alternative definition of acpi_find_root_bridge_handle() causes a build error to occur. Remove the offending piece of garbage. Reported-and-tested-by: Stephen Rothwell Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- include/linux/pci-acpi.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 479d9bb88e11..44623500f419 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -35,9 +35,6 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), pbus->number); } -#else -static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) -{ return NULL; } #endif #ifdef CONFIG_ACPI_APEI -- cgit v1.2.3-71-gd317 From 94ae85220a07d357d4937086c490854f63344de4 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Sun, 16 Jan 2011 20:18:05 +0000 Subject: ARM: PL08x: cleanup comments Cleanup the formatting of comments, remove some which don't make sense anymore. Signed-off-by: Russell King [fix conflict with 96a608a4] Signed-off-by: Dan Williams --- drivers/dma/amba-pl08x.c | 146 ++++++++++++++++----------------------------- include/linux/amba/pl08x.h | 43 +++++++------ 2 files changed, 72 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index bebc678ed4fc..297f48b0cba9 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -19,14 +19,14 @@ * this program; if not, write to the Free Software Foundation, Inc., 59 * Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * The full GNU General Public License is in this distribution in the - * file called COPYING. + * The full GNU General Public License is in this distribution in the file + * called COPYING. * * Documentation: ARM DDI 0196G == PL080 - * Documentation: ARM DDI 0218E == PL081 + * Documentation: ARM DDI 0218E == PL081 * - * PL080 & PL081 both have 16 sets of DMA signals that can be routed to - * any channel. + * PL080 & PL081 both have 16 sets of DMA signals that can be routed to any + * channel. * * The PL080 has 8 channels available for simultaneous use, and the PL081 * has only two channels. 
So on these DMA controllers the number of channels @@ -91,11 +91,9 @@ #define DRIVER_NAME "pl08xdmac" /** - * struct vendor_data - vendor-specific config parameters - * for PL08x derivatives + * struct vendor_data - vendor-specific config parameters for PL08x derivatives * @channels: the number of channels available in this variant - * @dualmaster: whether this version supports dual AHB masters - * or not. + * @dualmaster: whether this version supports dual AHB masters or not. */ struct vendor_data { u8 channels; @@ -241,10 +239,8 @@ static void pl08x_start_txd(struct pl08x_dma_chan *plchan, * * Disabling individual channels could lose data. * - * Disable the peripheral DMA after disabling the DMAC - * in order to allow the DMAC FIFO to drain, and - * hence allow the channel to show inactive - * + * Disable the peripheral DMA after disabling the DMAC in order to allow + * the DMAC FIFO to drain, and hence allow the channel to show inactive */ static void pl08x_pause_phy_chan(struct pl08x_phy_chan *ch) { @@ -367,6 +363,10 @@ static u32 pl08x_getbytes_chan(struct pl08x_dma_chan *plchan) /* * Allocate a physical channel for a virtual channel + * + * Try to locate a physical channel to be used for this transfer. If all + * are taken return NULL and the requester will have to cope by using + * some fallback PIO mode or retrying later. */ static struct pl08x_phy_chan * pl08x_get_phy_channel(struct pl08x_driver_data *pl08x, @@ -376,12 +376,6 @@ pl08x_get_phy_channel(struct pl08x_driver_data *pl08x, unsigned long flags; int i; - /* - * Try to locate a physical channel to be used for - * this transfer. If all are taken return NULL and - * the requester will have to cope by using some fallback - * PIO mode or retrying later. - */ for (i = 0; i < pl08x->vd->channels; i++) { ch = &pl08x->phy_chans[i]; @@ -495,9 +489,9 @@ struct pl08x_lli_build_data { }; /* - * Autoselect a master bus to use for the transfer - * this prefers the destination bus if both available - * if fixed address on one bus the other will be chosen + * Autoselect a master bus to use for the transfer this prefers the + * destination bus if both available if fixed address on one bus the + * other will be chosen */ static void pl08x_choose_master_bus(struct pl08x_lli_build_data *bd, struct pl08x_bus_data **mbus, struct pl08x_bus_data **sbus, u32 cctl) @@ -530,8 +524,7 @@ static void pl08x_choose_master_bus(struct pl08x_lli_build_data *bd, } /* - * Fills in one LLI for a certain transfer descriptor - * and advance the counter + * Fills in one LLI for a certain transfer descriptor and advance the counter */ static void pl08x_fill_lli_for_desc(struct pl08x_lli_build_data *bd, int num_llis, int len, u32 cctl) @@ -640,15 +633,11 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, * Choose bus to align to * - prefers destination bus if both available * - if fixed address on one bus chooses other - * - modifies cctl to choose an appropriate master */ pl08x_choose_master_bus(&bd, &mbus, &sbus, cctl); if (txd->len < mbus->buswidth) { - /* - * Less than a bus width available - * - send as single bytes - */ + /* Less than a bus width available - send as single bytes */ while (bd.remainder) { dev_vdbg(&pl08x->adev->dev, "%s single byte LLIs for a transfer of " @@ -659,10 +648,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, total_bytes++; } } else { - /* - * Make one byte LLIs until master bus is aligned - * - slave will then be aligned also - */ + /* Make one byte LLIs until master bus is aligned */ 
while ((mbus->addr) % (mbus->buswidth)) { dev_vdbg(&pl08x->adev->dev, "%s adjustment lli for less than bus width " @@ -674,7 +660,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, } /* - * Master now aligned + * Master now aligned * - if slave is not then we must set its width down */ if (sbus->addr % sbus->buswidth) { @@ -732,10 +718,8 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, if (lli_len == target_len) { /* - * Can send what we wanted - */ - /* - * Maintain alignment + * Can send what we wanted. + * Maintain alignment */ lli_len = (lli_len/mbus->buswidth) * mbus->buswidth; @@ -743,17 +727,14 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, } else { /* * So now we know how many bytes to transfer - * to get to the nearest boundary - * The next LLI will past the boundary - * - however we may be working to a boundary - * on the slave bus - * We need to ensure the master stays aligned + * to get to the nearest boundary. The next + * LLI will past the boundary. However, we + * may be working to a boundary on the slave + * bus. We need to ensure the master stays + * aligned, and that we are working in + * multiples of the bus widths. */ odd_bytes = lli_len % mbus->buswidth; - /* - * - and that we are working in multiples - * of the bus widths - */ lli_len -= odd_bytes; } @@ -793,8 +774,8 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, if (odd_bytes) { /* - * Creep past the boundary, - * maintaining master alignment + * Creep past the boundary, maintaining + * master alignment */ int j; for (j = 0; (j < mbus->buswidth) @@ -837,13 +818,9 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, } llis_va = txd->llis_va; - /* - * The final LLI terminates the LLI. - */ + /* The final LLI terminates the LLI. */ llis_va[num_llis - 1].lli = 0; - /* - * The final LLI element shall also fire an interrupt - */ + /* The final LLI element shall also fire an interrupt. */ llis_va[num_llis - 1].cctl |= PL080_CONTROL_TC_IRQ_EN; #ifdef VERBOSE_DEBUG @@ -891,7 +868,6 @@ static void pl08x_free_txd_list(struct pl08x_driver_data *pl08x, list_del(&txdi->node); pl08x_free_txd(pl08x, txdi); } - } } @@ -1020,10 +996,9 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_interrupt( } /* - * Code accessing dma_async_is_complete() in a tight loop - * may give problems - could schedule where indicated. - * If slaves are relying on interrupts to signal completion this - * function must not be called with interrupts disabled + * Code accessing dma_async_is_complete() in a tight loop may give problems. + * If slaves are relying on interrupts to signal completion this function + * must not be called with interrupts disabled. */ static enum dma_status pl08x_dma_tx_status(struct dma_chan *chan, @@ -1045,10 +1020,6 @@ pl08x_dma_tx_status(struct dma_chan *chan, return ret; } - /* - * schedule(); could be inserted here - */ - /* * This cookie not complete yet */ @@ -1273,11 +1244,10 @@ static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, } } else /* - * Else we're all set, paused and ready to roll, - * status will switch to PL08X_CHAN_RUNNING when - * we call issue_pending(). If there is something - * running on the channel already we don't change - * its state. + * Else we're all set, paused and ready to roll, status + * will switch to PL08X_CHAN_RUNNING when we call + * issue_pending(). If there is something running on the + * channel already we don't change its state. 
*/ if (plchan->state == PL08X_CHAN_IDLE) plchan->state = PL08X_CHAN_PAUSED; @@ -1528,10 +1498,9 @@ bool pl08x_filter_id(struct dma_chan *chan, void *chan_id) /* * Just check that the device is there and active - * TODO: turn this bit on/off depending on the number of - * physical channels actually used, if it is zero... well - * shut it off. That will save some power. Cut the clock - * at the same time. + * TODO: turn this bit on/off depending on the number of physical channels + * actually used, if it is zero... well shut it off. That will save some + * power. Cut the clock at the same time. */ static void pl08x_ensure_on(struct pl08x_driver_data *pl08x) { @@ -1579,16 +1548,11 @@ static void pl08x_tasklet(unsigned long data) plchan->at = NULL; if (txd) { - /* - * Update last completed - */ + /* Update last completed */ plchan->lc = txd->tx.cookie; } - /* - * If a new descriptor is queued, set it up - * plchan->at is NULL here - */ + /* If a new descriptor is queued, set it up plchan->at is NULL here */ if (!list_empty(&plchan->pend_list)) { struct pl08x_txd *next; @@ -1615,11 +1579,10 @@ static void pl08x_tasklet(unsigned long data) plchan->state = PL08X_CHAN_IDLE; /* - * And NOW before anyone else can grab that free:d - * up physical channel, see if there is some memcpy - * pending that seriously needs to start because of - * being stacked up while we were choking the - * physical channels with data. + * And NOW before anyone else can grab that free:d up + * physical channel, see if there is some memcpy pending + * that seriously needs to start because of being stacked + * up while we were choking the physical channels with data. */ list_for_each_entry(waiting, &pl08x->memcpy.channels, chan.device_node) { @@ -1670,9 +1633,7 @@ static irqreturn_t pl08x_irq(int irq, void *dev) val = readl(pl08x->base + PL080_ERR_STATUS); if (val) { - /* - * An error interrupt (on one or more channels) - */ + /* An error interrupt (on one or more channels) */ dev_err(&pl08x->adev->dev, "%s error interrupt, register value 0x%08x\n", __func__, val); @@ -1696,9 +1657,7 @@ static irqreturn_t pl08x_irq(int irq, void *dev) mask |= (1 << i); } } - /* - * Clear only the terminal interrupts on channels we processed - */ + /* Clear only the terminal interrupts on channels we processed */ writel(mask, pl08x->base + PL080_TC_CLEAR); return mask ? IRQ_HANDLED : IRQ_NONE; @@ -1717,6 +1676,7 @@ static int pl08x_dma_init_virtual_channels(struct pl08x_driver_data *pl08x, int i; INIT_LIST_HEAD(&dmadev->channels); + /* * Register as many many memcpy as we have physical channels, * we won't always be able to use all but the code will have @@ -1950,9 +1910,7 @@ static int pl08x_probe(struct amba_device *adev, struct amba_id *id) /* Turn on the PL08x */ pl08x_ensure_on(pl08x); - /* - * Attach the interrupt handler - */ + /* Attach the interrupt handler */ writel(0x000000FF, pl08x->base + PL080_ERR_CLEAR); writel(0x000000FF, pl08x->base + PL080_TC_CLEAR); diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 5b87b6aac3f8..3111385b8ca7 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -12,7 +12,6 @@ * * Please credit ARM.com * Documentation: ARM DDI 0196D - * */ #ifndef AMBA_PL08X_H @@ -55,8 +54,8 @@ enum { * @circular_buffer: whether the buffer passed in is circular and * shall simply be looped round round (like a record baby round * round round round) - * @single: the device connected to this channel will request single - * DMA transfers, not bursts. (Bursts are default.) 
+ * @single: the device connected to this channel will request single DMA + * transfers, not bursts. (Bursts are default.) * @periph_buses: the device connected to this channel is accessible via * these buses (use PL08X_AHB1 | PL08X_AHB2). */ @@ -78,8 +77,7 @@ struct pl08x_channel_data { * @addr: current address * @maxwidth: the maximum width of a transfer on this bus * @buswidth: the width of this bus in bytes: 1, 2 or 4 - * @fill_bytes: bytes required to fill to the next bus memory - * boundary + * @fill_bytes: bytes required to fill to the next bus memory boundary */ struct pl08x_bus_data { dma_addr_t addr; @@ -92,10 +90,10 @@ struct pl08x_bus_data { * struct pl08x_phy_chan - holder for the physical channels * @id: physical index to this channel * @lock: a lock to use when altering an instance of this struct - * @signal: the physical signal (aka channel) serving this - * physical channel right now - * @serving: the virtual channel currently being served by this - * physical channel + * @signal: the physical signal (aka channel) serving this physical channel + * right now + * @serving: the virtual channel currently being served by this physical + * channel */ struct pl08x_phy_chan { unsigned int id; @@ -119,7 +117,6 @@ struct pl08x_txd { size_t len; dma_addr_t llis_bus; struct pl08x_lli *llis_va; - bool active; /* Default cctl value for LLIs */ u32 cctl; /* @@ -130,8 +127,8 @@ struct pl08x_txd { }; /** - * struct pl08x_dma_chan_state - holds the PL08x specific virtual - * channel states + * struct pl08x_dma_chan_state - holds the PL08x specific virtual channel + * states * @PL08X_CHAN_IDLE: the channel is idle * @PL08X_CHAN_RUNNING: the channel has allocated a physical transport * channel and is running a transfer on it @@ -152,7 +149,7 @@ enum pl08x_dma_chan_state { * @chan: wrappped abstract channel * @phychan: the physical channel utilized by this channel, if there is one * @phychan_hold: if non-zero, hold on to the physical channel even if we - * have no pending entries + * have no pending entries * @tasklet: tasklet scheduled by the IRQ to handle actual work etc * @name: name of channel * @cd: channel platform data @@ -166,8 +163,8 @@ enum pl08x_dma_chan_state { * @host: a pointer to the host (internal use) * @state: whether the channel is idle, paused, running etc * @slave: whether this channel is a device (slave) or for memcpy - * @waiting: a TX descriptor on this channel which is waiting for - * a physical channel to become available + * @waiting: a TX descriptor on this channel which is waiting for a physical + * channel to become available */ struct pl08x_dma_chan { struct dma_chan chan; @@ -189,16 +186,16 @@ struct pl08x_dma_chan { }; /** - * struct pl08x_platform_data - the platform configuration for the - * PL08x PrimeCells. + * struct pl08x_platform_data - the platform configuration for the PL08x + * PrimeCells. * @slave_channels: the channels defined for the different devices on the * platform, all inclusive, including multiplexed channels. The available - * physical channels will be multiplexed around these signals as they - * are requested, just enumerate all possible channels. - * @get_signal: request a physical signal to be used for a DMA - * transfer immediately: if there is some multiplexing or similar blocking - * the use of the channel the transfer can be denied by returning - * less than zero, else it returns the allocated signal number + * physical channels will be multiplexed around these signals as they are + * requested, just enumerate all possible channels. 
+ * @get_signal: request a physical signal to be used for a DMA transfer + * immediately: if there is some multiplexing or similar blocking the use + * of the channel the transfer can be denied by returning less than zero, + * else it returns the allocated signal number * @put_signal: indicate to the platform that this physical signal is not * running any DMA transfer and multiplexing can be recycled * @lli_buses: buses which LLIs can be fetched from: PL08X_AHB1 | PL08X_AHB2 -- cgit v1.2.3-71-gd317 From 2fe17c1075836b66678ed2a305fd09b6773883aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jan 2011 13:07:43 +0100 Subject: fallocate should be a file operation Currently all filesystems except XFS implement fallocate asynchronously, while XFS forced a commit. Both of these are suboptimal - in case of O_SYNC I/O we really want our allocation on disk, especially for the !KEEP_SIZE case where we actually grow the file with user-visible zeroes. On the other hand always commiting the transaction is a bad idea for fast-path uses of fallocate like for example in recent Samba versions. Given that block allocation is a data plane operation anyway change it from an inode operation to a file operation so that we have the file structure available that lets us check for O_SYNC. This also includes moving the code around for a few of the filesystems, and remove the already unnedded S_ISDIR checks given that we only wire up fallocate for regular files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 3 +- fs/btrfs/file.c | 113 +++++++++++++++++ fs/btrfs/inode.c | 111 ---------------- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 9 +- fs/ext4/file.c | 2 +- fs/gfs2/file.c | 258 ++++++++++++++++++++++++++++++++++++++ fs/gfs2/ops_inode.c | 258 -------------------------------------- fs/ocfs2/file.c | 8 +- fs/open.c | 4 +- fs/xfs/linux-2.6/xfs_file.c | 56 +++++++++ fs/xfs/linux-2.6/xfs_iops.c | 60 --------- include/linux/fs.h | 4 +- 13 files changed, 440 insertions(+), 448 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 651d5237c155..4471a416c274 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -60,7 +60,6 @@ ata *); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: @@ -88,7 +87,6 @@ getxattr: no listxattr: no removexattr: yes truncate_range: yes -fallocate: no fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. 
@@ -437,6 +435,7 @@ prototypes: ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *, int, loff_t, loff_t); }; locking rules: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d85763b..a9e0a4eaf3d9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1237,6 +1238,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1360,7 @@ const struct 
file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 64daf2acd0d5..902afbf50811 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7098,116 +7098,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); @@ -7310,7 +7200,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - 
.fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1de65f572033..0c8d97b56f34 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2065,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4bdd160854eb..63a75810b7c3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3655,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bb003dc9ffff..2e8322c8aa88 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -210,6 +210,7 @@ const struct file_operations ext4_file_operations = { .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { @@ -223,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .check_acl = ext4_check_acl, - .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fca6689e12e6..7cfdcb913363 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int 
error; + + if (!page_has_buffers(page)) { + error = __block_write_begin(page, from, to - from, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> 
sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops = { @@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = generic_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index c09528c07f3d..d8b26ac2e20b 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,8 +18,6 @@ #include #include #include -#include -#include #include #include "gfs2.h" @@ -1257,261 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } -static void empty_write_end(struct page *page, unsigned from, - unsigned to) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); - mark_page_accessed(page); - - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); -} - - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) -{ - unsigned start, end, next; - struct buffer_head *bh, *head; - int error; - - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, 
gfs2_block_map); - if (unlikely(error)) - return error; - - empty_write_end(page, from, to); - return 0; - } - - bh = head = page_buffers(page); - next = end = 0; - while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; - } - start = next; - do { - next += bh->b_size; - if (buffer_mapped(bh)) { - if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - end = 0; - } - start = next; - } - else - end = next; - bh = bh->b_this_page; - } while (next < to); - - if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - } - - return 0; -} - -static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, - int mode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *dibh; - int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(error)) - goto out; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (unlikely(error)) - goto out; - } - - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } - - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) - goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; - } - - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - - brelse(dibh); - -out: - return error; -} - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; - unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); - - for (tmp = max_data; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - max_data -= tmp; - } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; - *data_blocks = max_data; - *ind_blocks = max_blocks - max_data; - *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; - if (*len > max) { - *len = max; - gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); - } -} - -static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, - loff_t len) -{ - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; - struct gfs2_alloc *al; - int error; - loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; - next = (next + 1) << sdp->sd_sb.sb_bsize_shift; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - 
return -EOPNOTSUPP; - - offset = (offset >> sdp->sd_sb.sb_bsize_shift) << - sdp->sd_sb.sb_bsize_shift; - - len = next - offset; - bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; - if (!bytes) - bytes = UINT_MAX; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - if (!gfs2_write_alloc_required(ip, offset, len)) - goto out_unlock; - - while (len > 0) { - if (len < bytes) - bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - -retry: - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - goto retry; - } - goto out_qunlock; - } - max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - - rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = fallocate_chunk(inode, offset, max_bytes, mode); - gfs2_trans_end(sdp); - - if (error) - goto out_trans_fail; - - len -= max_bytes; - offset += max_bytes; - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - goto out_unlock; - -out_trans_fail: - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1562,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, - .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index cf254ce8c941..a6651956482e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1989,9 +1989,10 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); } -static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_space_resv sr; int change_size = 1; @@ -2002,9 +2003,6 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - if (mode & FALLOC_FL_KEEP_SIZE) change_size = 0; @@ -2612,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, - .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2644,6 +2641,7 @@ const struct file_operations ocfs2_fops = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops = { diff --git a/fs/open.c b/fs/open.c index 
5b6ef7e2859e..e52389e1f05b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -255,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - if (!inode->i_op->fallocate) + if (!file->f_op->fallocate) return -EOPNOTSUPP; - return inode->i_op->fallocate(inode, mode, offset, len); + return file->f_op->fallocate(file, mode, offset, len); } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ef51eb43e137..a55c1b46b219 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_trace.h" #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -882,6 +883,60 @@ out_unlock: return ret; } +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + + STATIC int xfs_file_open( struct inode *inode, @@ -1000,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index a4ecc2188a09..bd5727852fd6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -505,64 +504,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); - if 
(error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -656,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 177b4ddea418..09b5bd6a7c6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1552,6 +1552,8 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *file, int mode, loff_t offset, + loff_t len); }; #define IPERM_FLAG_RCU 0x0001 @@ -1582,8 +1584,6 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, - loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); } ____cacheline_aligned; -- cgit v1.2.3-71-gd317 From c2b3e74b78b24cb367289a75a2bd30e569e56e0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Dec 2010 19:38:08 -0500 Subject: fs: Remove unlikely() from fput_light() In fput_light(), there's an unlikely(fput_needed), which running on my normal desktop doing firefox, xchat, evolution and part of my distcc farm, and running the annotate branch profiler shows that the unlikely is not very unlikely. correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 48 100 fput_light file.h 26 115828710 897415279 88 fput_light file.h 26 865271179 5286128445 85 fput_light file.h 26 19568539 8923664 31 fput_light file.h 26 12353677 3562279 22 fput_light file.h 26 267691 67062 20 fput_light file.h 26 15014853 348172 2 fput_light file.h 26 209258 205 0 fput_light file.h 26 1364164 0 0 fput_light file.h 26 Which gives 1032903812 times it was correct and 6203351846 times it was incorrect, or 85% incorrect. Cc: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Al Viro --- include/linux/file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index b1e12970f617..e85baebf6279 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -23,7 +23,7 @@ extern struct file *alloc_file(struct path *, fmode_t mode, static inline void fput_light(struct file *file, int fput_needed) { - if (unlikely(fput_needed)) + if (fput_needed) fput(file); } -- cgit v1.2.3-71-gd317 From ecf5632dd189ab4c366cef853d6e5fe7adfe52e5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 16 Jan 2011 23:28:17 +0900 Subject: fs: fix address space warnings in ioctl_fiemap() The fi_extents_start field of struct fiemap_extent_info is a user pointer but was not marked as __user. 
This makes sparse emit following warnings: CHECK fs/ioctl.c fs/ioctl.c:114:26: warning: incorrect type in argument 1 (different address spaces) fs/ioctl.c:114:26: expected void [noderef] *dst fs/ioctl.c:114:26: got struct fiemap_extent *[assigned] dest fs/ioctl.c:202:14: warning: incorrect type in argument 1 (different address spaces) fs/ioctl.c:202:14: expected void const volatile [noderef] * fs/ioctl.c:202:14: got struct fiemap_extent *[assigned] fi_extents_start fs/ioctl.c:212:27: warning: incorrect type in argument 1 (different address spaces) fs/ioctl.c:212:27: expected void [noderef] *dst fs/ioctl.c:212:27: got char * Also add 'ufiemap' variable to eliminate unnecessary casts. Signed-off-by: Namhyung Kim Signed-off-by: Al Viro --- fs/ioctl.c | 10 +++++----- include/linux/fs.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/ioctl.c b/fs/ioctl.c index d6cc16476620..a59635e295fa 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; - struct fiemap_extent *dest = fieinfo->fi_extents_start; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { @@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb, static int ioctl_fiemap(struct file *filp, unsigned long arg) { struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; @@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) if (!inode->i_op->fiemap) return -EOPNOTSUPP; - if (copy_from_user(&fiemap, (struct fiemap __user *)arg, - sizeof(struct fiemap))) + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) @@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; - fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); + fieinfo.fi_extents_start = ufiemap->fm_extents; if (fiemap.fm_extent_count != 0 && !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, @@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; - if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; diff --git a/include/linux/fs.h b/include/linux/fs.h index 09b5bd6a7c6b..32b38cd829d3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1483,8 +1483,8 @@ struct fiemap_extent_info { unsigned int fi_flags; /* Flags as passed from user */ unsigned int fi_extents_mapped; /* Number of mapped extents */ unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent - * array */ + struct fiemap_extent __user *fi_extents_start; /* Start of + fiemap_extent array */ }; int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, u64 phys, u64 len, u32 flags); -- cgit v1.2.3-71-gd317 From 2a8652f4e0d11ee27b1d2870c600fd1300661a6e Mon Sep 17 00:00:00 2001 From: Roberto Sassu 
Date: Wed, 3 Nov 2010 11:11:15 +0100 Subject: ecryptfs: moved ECRYPTFS_SUPER_MAGIC definition to linux/magic.h The definition of ECRYPTFS_SUPER_MAGIC has been moved to the include file 'linux/magic.h' to become available to other kernel subsystems. Signed-off-by: Roberto Sassu Signed-off-by: Tyler Hicks --- fs/ecryptfs/ecryptfs_kernel.h | 1 - include/linux/magic.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 413a3c48f0bb..bc530a81e4ce 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key) (((struct user_key_payload*)key->payload.data)->data); } -#define ECRYPTFS_SUPER_MAGIC 0xf15f #define ECRYPTFS_MAX_KEYSET_SIZE 1024 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 diff --git a/include/linux/magic.h b/include/linux/magic.h index ff690d05f129..62730ea2b56e 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -16,6 +16,7 @@ #define TMPFS_MAGIC 0x01021994 #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */ #define SQUASHFS_MAGIC 0x73717368 +#define ECRYPTFS_SUPER_MAGIC 0xf15f #define EFS_SUPER_MAGIC 0x414A53 #define EXT2_SUPER_MAGIC 0xEF53 #define EXT3_SUPER_MAGIC 0xEF53 -- cgit v1.2.3-71-gd317 From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:06:35 +0100 Subject: lockdep: Move early boot local IRQ enable/disable status to init/main.c During early boot, local IRQ is disabled until IRQ subsystem is properly initialized. During this time, no one should enable local IRQ and some operations which usually are not allowed with IRQ disabled, e.g. operations which might sleep or require communications with other processors, are allowed. lockdep tracked this with early_boot_irqs_off/on() callbacks. As other subsystems need this information too, move it to init/main.c and make it generally available. While at it, toggle the boolean to early_boot_irqs_disabled instead of enabled so that it can be initialized with %false and %true indicates the exceptional condition. 
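The flag protocol this patch introduces can be sketched as a small standalone C program. This is an illustrative model only, not kernel code: the flag name mirrors the patch, but the "subsystems" here are ordinary functions and the warning is a plain printf.

#include <stdbool.h>
#include <stdio.h>

/* false by default, as the patch initializes it in init/main.c */
static bool early_boot_irqs_disabled;

static void trace_hardirqs_on_model(void)
{
	/* mirrors the lockdep check: enabling IRQs this early is a bug */
	if (early_boot_irqs_disabled)
		printf("WARNING: IRQs enabled while early_boot_irqs_disabled is set\n");
}

static void start_kernel_model(void)
{
	early_boot_irqs_disabled = true;   /* set right after local_irq_disable() */
	trace_hardirqs_on_model();         /* would warn: still in early boot */

	early_boot_irqs_disabled = false;  /* cleared just before local_irq_enable() */
	trace_hardirqs_on_model();         /* no warning once IRQs may be enabled */
}

int main(void)
{
	start_kernel_model();
	return 0;
}
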
Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110635.GB6036@htj.dyndns.org> Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- include/linux/kernel.h | 2 ++ include/linux/lockdep.h | 8 -------- init/main.c | 13 +++++++++++-- kernel/lockdep.c | 18 +----------------- kernel/trace/trace_irqsoff.c | 8 -------- 6 files changed, 15 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7e8d3bc80af6..50542efe45fb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1194,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; local_irq_disable(); - early_boot_irqs_off(); + early_boot_irqs_disabled = true; memblock_init(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5a9d9059520b..d07d8057e440 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -243,6 +243,8 @@ extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags; +extern bool early_boot_irqs_disabled; + /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 71c09b26c759..f638fd78d106 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -436,16 +436,8 @@ do { \ #endif /* CONFIG_LOCKDEP */ #ifdef CONFIG_TRACE_IRQFLAGS -extern void early_boot_irqs_off(void); -extern void early_boot_irqs_on(void); extern void print_irqtrace_events(struct task_struct *curr); #else -static inline void early_boot_irqs_off(void) -{ -} -static inline void early_boot_irqs_on(void) -{ -} static inline void print_irqtrace_events(struct task_struct *curr) { } diff --git a/init/main.c b/init/main.c index 00799c1d4628..33c37c379e96 100644 --- a/init/main.c +++ b/init/main.c @@ -96,6 +96,15 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif +/* + * Debug helper: via this flag we know that we are in 'early bootup code' + * where only the boot processor is running with IRQ disabled. This means + * two things - IRQ must not be enabled before the flag is cleared and some + * operations which are not allowed with IRQ disabled are allowed while the + * flag is set. + */ +bool early_boot_irqs_disabled __read_mostly; + enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); @@ -554,7 +563,7 @@ asmlinkage void __init start_kernel(void) cgroup_init_early(); local_irq_disable(); - early_boot_irqs_off(); + early_boot_irqs_disabled = true; /* * Interrupts are still disabled. Do necessary setups, then @@ -621,7 +630,7 @@ asmlinkage void __init start_kernel(void) if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " "enabled early\n"); - early_boot_irqs_on(); + early_boot_irqs_disabled = false; local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. 
*/ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d9..0d2058da80f5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2291,22 +2291,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) return 1; } -/* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - /* * Hardirqs will be enabled: */ @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; if (unlikely(curr->hardirqs_enabled)) { diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b880..92b6e1e12d98 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) * Stubs: */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - void trace_softirqs_on(unsigned long ip) { } -- cgit v1.2.3-71-gd317 From ca3e021417eed30ec2b64ce88eb0acf64aa9bc29 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 20 Jan 2011 14:44:24 -0800 Subject: memcg: fix USED bit handling at uncharge in THP Now, under THP: at charge: - PageCgroupUsed bit is set to all page_cgroup on a hugepage. ....set to 512 pages. at uncharge - PageCgroupUsed bit is unset on the head page. So, some pages will remain with "Used" bit. This patch fixes that Used bit is set only to the head page. Used bits for tail pages will be set at splitting if necessary. This patch adds this lock order: compound_lock() -> page_cgroup_move_lock(). 
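The head/tail USED-bit handling described above can be modelled in a few lines of standalone C. This is a toy illustration under assumed values: the bit positions, the 512-subpage count, and the reduced no-copy mask are placeholders; only the mark-head-at-charge and copy-at-split protocol follows the patch.

#include <stdio.h>

#define NR_TAIL_PAGES	511		/* e.g. a 2MB THP has 512 subpages */
#define PCG_USED	(1u << 0)	/* placeholder bit values */
#define PCG_LOCK	(1u << 1)
#define PCG_MOVE_LOCK	(1u << 2)
#define PCGF_NOCOPY_AT_SPLIT	(PCG_LOCK | PCG_MOVE_LOCK)

struct page_cgroup { unsigned int flags; };

static struct page_cgroup head_pc;
static struct page_cgroup tail_pc[NR_TAIL_PAGES];

/* at charge time only the head page_cgroup is marked USED */
static void charge_huge_page(void)
{
	head_pc.flags |= PCG_USED;
}

/* at split time each tail inherits the head's flags minus the no-copy mask */
static void split_huge_page_fixup(void)
{
	for (int i = 0; i < NR_TAIL_PAGES; i++)
		tail_pc[i].flags = head_pc.flags & ~PCGF_NOCOPY_AT_SPLIT;
}

int main(void)
{
	charge_huge_page();
	split_huge_page_fixup();
	printf("head USED=%d tail[0] USED=%d\n",
	       !!(head_pc.flags & PCG_USED), !!(tail_pc[0].flags & PCG_USED));
	return 0;
}
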
[akpm@linux-foundation.org: fix warning] Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++ mm/huge_memory.c | 2 + mm/memcontrol.c | 91 ++++++++++++++++++++++++++-------------------- 3 files changed, 62 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6a576f989437..f512e189be5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -146,6 +146,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); +#endif + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -335,6 +339,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem) return 0; } +static inline void mem_cgroup_split_huge_fixup(struct page *head, + struct page *tail) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c4f634b3a48e..e187454d82f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1203,6 +1203,8 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); + mem_cgroup_split_huge_fixup(page, page_tail); + lru_add_page_tail(zone, page, page_tail); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6d59a2bd520a..848b42195e5b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1614,7 +1614,7 @@ void mem_cgroup_update_page_stat(struct page *page, if (unlikely(!mem || !PageCgroupUsed(pc))) goto out; /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(mem))) { + if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { /* take a lock against to access pc->mem_cgroup */ move_lock_page_cgroup(pc, &flags); need_unlock = true; @@ -2083,14 +2083,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return mem; } -/* - * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be - * USED state. If already USED, uncharge and return. - */ -static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { + int nr_pages = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2114,35 +2127,7 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), 1); -} - -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype, - int page_size) -{ - int i; - int count = page_size >> PAGE_SHIFT; - - /* try_charge() can return NULL to *memcg, taking care of it. 
*/ - if (!mem) - return; - - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); - return; - } - - /* - * we don't need page_cgroup_lock about tail pages, becase they are not - * accessed by any other context at this point. - */ - for (i = 0; i < count; i++) - ____mem_cgroup_commit_charge(mem, pc + i, ctype); - + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); /* * "charge_statistics" updated event counter. Then, check it. @@ -2152,6 +2137,34 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, memcg_check_events(mem, pc->page); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ + (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compund_lock. + */ +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *tail_pc = lookup_page_cgroup(tail); + unsigned long flags; + + /* + * We have no races witch charge/uncharge but will have races with + * page state accounting. + */ + move_lock_page_cgroup(head_pc, &flags); + + tail_pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb(); /* see __commit_charge() */ + /* we don't need to copy all flags...*/ + tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + move_unlock_page_cgroup(head_pc, &flags); +} +#endif + /** * __mem_cgroup_move_account - move account of the page * @pc: page_cgroup of the page. @@ -2545,7 +2558,6 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - int i; int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; @@ -2595,8 +2607,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - for (i = 0; i < count; i++) - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -1); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); ClearPageCgroupUsed(pc); /* -- cgit v1.2.3-71-gd317 From 2d6d9fd3a54a28c6f67f26eb6c74803307a1b11e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 19 Jan 2011 22:27:14 +0100 Subject: ACPI: Introduce acpi_os_ioremap() Commit ca9b600be38c ("ACPI / PM: Make suspend_nvs_save() use acpi_os_map_memory()") attempted to prevent the code in osl.c and nvs.c from using different ioremap() variants by making the latter use acpi_os_map_memory() for mapping the NVS pages. However, that also requires acpi_os_unmap_memory() to be used for unmapping them, which causes synchronize_rcu() to be executed many times in a row unnecessarily and introduces substantial delays during resume on some systems. Instead of using acpi_os_map_memory() for mapping the NVS pages in nvs.c introduce acpi_os_ioremap() calling ioremap_cache() and make the code in both osl.c and nvs.c use it. Reported-by: Jeff Chua Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Linus Torvalds --- drivers/acpi/nvs.c | 7 ++++--- drivers/acpi/osl.c | 12 +++++++----- include/linux/acpi.h | 3 --- include/linux/acpi_io.h | 16 ++++++++++++++++ 4 files changed, 27 insertions(+), 11 deletions(-) create mode 100644 include/linux/acpi_io.h (limited to 'include/linux') diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c index 54b6ab8040a6..fa5a1df42b79 100644 --- a/drivers/acpi/nvs.c +++ b/drivers/acpi/nvs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include /* @@ -80,7 +81,7 @@ void suspend_nvs_free(void) free_page((unsigned long)entry->data); entry->data = NULL; if (entry->kaddr) { - acpi_os_unmap_memory(entry->kaddr, entry->size); + iounmap(entry->kaddr); entry->kaddr = NULL; } } @@ -114,8 +115,8 @@ int suspend_nvs_save(void) list_for_each_entry(entry, &nvs_list, node) if (entry->data) { - entry->kaddr = acpi_os_map_memory(entry->phys_start, - entry->size); + entry->kaddr = acpi_os_ioremap(entry->phys_start, + entry->size); if (!entry->kaddr) { suspend_nvs_free(); return -ENOMEM; diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index e2dd6de5d50c..b0931818cf98 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -302,9 +303,10 @@ void __iomem *__init_refok acpi_os_map_memory(acpi_physical_address phys, acpi_size size) { struct acpi_ioremap *map, *tmp_map; - unsigned long flags, pg_sz; + unsigned long flags; void __iomem *virt; - phys_addr_t pg_off; + acpi_physical_address pg_off; + acpi_size pg_sz; if (phys > ULONG_MAX) { printk(KERN_ERR PREFIX "Cannot map memory that high\n"); @@ -320,7 +322,7 @@ acpi_os_map_memory(acpi_physical_address phys, acpi_size size) pg_off = round_down(phys, PAGE_SIZE); pg_sz = round_up(phys + size, PAGE_SIZE) - pg_off; - virt = ioremap_cache(pg_off, pg_sz); + virt = acpi_os_ioremap(pg_off, pg_sz); if (!virt) { kfree(map); return NULL; @@ -642,7 +644,7 @@ acpi_os_read_memory(acpi_physical_address phys_addr, u32 * value, u32 width) virt_addr = acpi_map_vaddr_lookup(phys_addr, size); rcu_read_unlock(); if (!virt_addr) { - virt_addr = ioremap_cache(phys_addr, size); + virt_addr = acpi_os_ioremap(phys_addr, size); unmap = 1; } if (!value) @@ -678,7 +680,7 @@ acpi_os_write_memory(acpi_physical_address phys_addr, u32 value, u32 width) virt_addr = acpi_map_vaddr_lookup(phys_addr, size); rcu_read_unlock(); if (!virt_addr) { - virt_addr = ioremap_cache(phys_addr, size); + virt_addr = acpi_os_ioremap(phys_addr, size); unmap = 1; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index eb176bb1b15b..a2e910e01293 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -306,9 +306,6 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req); extern void acpi_early_init(void); -int acpi_os_map_generic_address(struct acpi_generic_address *addr); -void acpi_os_unmap_generic_address(struct acpi_generic_address *addr); - #else /* !CONFIG_ACPI */ #define acpi_disabled 1 diff --git a/include/linux/acpi_io.h b/include/linux/acpi_io.h new file mode 100644 index 000000000000..7180013a4a3a --- /dev/null +++ b/include/linux/acpi_io.h @@ -0,0 +1,16 @@ +#ifndef _ACPI_IO_H_ +#define _ACPI_IO_H_ + +#include +#include + +static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, + acpi_size size) +{ + return ioremap_cache(phys, size); +} + +int acpi_os_map_generic_address(struct acpi_generic_address *addr); +void acpi_os_unmap_generic_address(struct acpi_generic_address *addr); + +#endif -- 
cgit v1.2.3-71-gd317 From 1c77ff22f539ceaa64ea43d6a26d867e84602cb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Jan 2011 19:41:35 +0100 Subject: genirq: Remove __do_IRQ All architectures are finally converted. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Richard Henderson Cc: Mike Frysinger Cc: David Howells Cc: Tony Luck Cc: Greg Ungerer Cc: Michal Simek Acked-by: David Howells Cc: Kyle McMartin Acked-by: Benjamin Herrenschmidt Cc: Chen Liqin Cc: "David S. Miller" Cc: Chris Metcalf Cc: Jeff Dike --- Documentation/feature-removal-schedule.txt | 8 --- arch/alpha/Kconfig | 3 - arch/blackfin/Kconfig | 3 - arch/frv/Kconfig | 4 -- arch/ia64/Kconfig | 3 - arch/m68knommu/Kconfig | 4 -- arch/microblaze/Kconfig | 3 - arch/mips/Kconfig | 3 - arch/mn10300/Kconfig | 3 - arch/parisc/Kconfig | 4 -- arch/powerpc/Kconfig | 4 -- arch/score/Kconfig | 3 - arch/sparc/Kconfig | 4 -- arch/tile/Kconfig | 3 - arch/um/Kconfig.um | 3 - include/linux/irqdesc.h | 14 ---- kernel/irq/Kconfig | 3 - kernel/irq/handle.c | 111 ----------------------------- 18 files changed, 183 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 8c594c45b6a1..b959659c5df4 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -357,14 +357,6 @@ Who: Dave Jones , Matthew Garrett ----------------------------- -What: __do_IRQ all in one fits nothing interrupt handler -When: 2.6.32 -Why: __do_IRQ was kept for easy migration to the type flow handlers. - More than two years of migration time is enough. -Who: Thomas Gleixner - ------------------------------ - What: fakephp and associated sysfs files in /sys/bus/pci/slots/ When: 2011 Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index fc95ee1bcf6f..943fe6930f77 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -68,9 +68,6 @@ config GENERIC_IOMAP bool default n -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_HARDIRQS bool default y diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 0a221d48152d..a37b2be23f18 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -50,9 +50,6 @@ config GENERIC_HARDIRQS config GENERIC_IRQ_PROBE def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_GPIO def_bool y diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index f6bcb039cd6d..e504edeb3d84 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -33,10 +33,6 @@ config GENERIC_HARDIRQS bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - default y - config TIME_LOW_RES bool default y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index e0f5b6d7f849..be1faf991813 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -684,9 +684,6 @@ source "lib/Kconfig" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_IRQ_PROBE bool default y diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index 704e7b92334c..7379cb0ce1af 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -52,10 +52,6 @@ config GENERIC_HARDIRQS bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - default y - config GENERIC_CALIBRATE_DELAY bool default y diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 5f5018a71a3d..5a6378493797 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -52,9 +52,6 @@ config 
GENERIC_TIME_VSYSCALL config GENERIC_CLOCKEVENTS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_GPIO def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 548e6cc3bc28..f5ecc0566bc2 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -793,9 +793,6 @@ config SCHED_OMIT_FRAME_POINTER bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # # Select some configuration options automatically based on user selections. # diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 8ed41cf2b08d..4638269152f6 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -34,9 +34,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM bool -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 0888675c98dd..4b94ac4dc603 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -12,7 +12,6 @@ config PARISC select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT - select GENERIC_HARDIRQS_NO__DO_IRQ help The PA-RISC microprocessor is designed by Hewlett-Packard and used in many of their workstations & servers (HP9000 700 and 800 series, @@ -79,9 +78,6 @@ config IRQ_PER_CPU bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # unless you want to implement ACPI on PA-RISC ... ;-) config PM bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 959f38ccb9a7..e0b185dd718a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -40,10 +40,6 @@ config GENERIC_HARDIRQS bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - default y - config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 4293fdcb5398..4f159acfbe33 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -53,9 +53,6 @@ config GENERIC_CLOCKEVENTS config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_SYSCALL_TABLE def_bool y diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 45d9c87d083a..989bb6415ea3 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -107,10 +107,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y if SPARC64 -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - def_bool y if SPARC64 - config MMU bool default y diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 4e8b82bca9e5..c16b98c2435d 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -10,9 +10,6 @@ config GENERIC_CSUM config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_IRQ_PROBE def_bool y diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index f8d1d0d47fe6..90a438acbfaf 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -120,9 +120,6 @@ config SMP If you don't know what to do, say N. -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config NR_CPUS int "Maximum number of CPUs (2-32)" range 2 32 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 6a64c6fa81af..c1a95b7b58de 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -100,13 +100,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) #define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) #define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) -/* - * Monolithic do_IRQ implementation. 
- */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ -extern unsigned int __do_IRQ(unsigned int irq); -#endif - /* * Architectures call this to let the generic IRQ layer * handle an interrupt. If the descriptor is attached to an @@ -115,14 +108,7 @@ extern unsigned int __do_IRQ(unsigned int irq); */ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) { -#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); -#else - if (likely(desc->handle_irq)) - desc->handle_irq(irq, desc); - else - __do_IRQ(irq); -#endif } static inline void generic_handle_irq(unsigned int irq) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766bf5d2e..8e42fec7686d 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -9,9 +9,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb63306..3540a7190122 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } - -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ - -#ifdef CONFIG_ENABLE_WARN_DEPRECATED -# warning __do_IRQ is deprecated. Please convert to proper flow handlers -#endif - -/** - * __do_IRQ - original all in one highlevel IRQ handler - * @irq: the interrupt number - * - * __do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - * - * This is the original x86 implementation which is used for every - * interrupt type. - */ -unsigned int __do_IRQ(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned int status; - - kstat_incr_irqs_this_cpu(irq, desc); - - if (CHECK_IRQ_PER_CPU(desc->status)) { - irqreturn_t action_ret; - - /* - * No locking required for CPU-local interrupts: - */ - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - if (likely(!(desc->status & IRQ_DISABLED))) { - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - } - desc->irq_data.chip->end(irq); - return 1; - } - - raw_spin_lock(&desc->lock); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - /* - * REPLAY is when Linux resends an IRQ that was dropped earlier - * WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - * Since we set PENDING, if another processor is handling - * a different instance of this same irq, the other processor - * will take care of it. - */ - if (unlikely(!action)) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. - * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. 
But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - irqreturn_t action_ret; - - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - if (likely(!(desc->status & IRQ_PENDING))) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->irq_data.chip->end(irq); - raw_spin_unlock(&desc->lock); - - return 1; -} -#endif -- cgit v1.2.3-71-gd317 From 1daeddd5962acad1bea55e524fc0fadf32654a21 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 13 Jan 2011 09:30:49 -0800 Subject: rtc: Cleanup removed UIE emulation declaration rtc_dev_update_irq_enable_emul was removed in commit 042620a018afcfba1d678062b62e463b9e43a68d (UIE emulation is now handled via hrtimer), but the declaration was missed. This patch cleans it up. Signed-off-by: John Stultz LKML-Reference: <1294939849-20608-1-git-send-email-john.stultz@linaro.org> Signed-off-by: Thomas Gleixner --- include/linux/rtc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 3c995b4d742c..ea760698e6fe 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -235,8 +235,6 @@ extern int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq); extern int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled); -extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, - unsigned int enabled); void rtc_aie_update_irq(void *private); void rtc_uie_update_irq(void *private); -- cgit v1.2.3-71-gd317 From aa0be0f4659f91f31e45adc422b1788cb36ffddc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 20 Jan 2011 15:26:12 -0800 Subject: RTC: Propagate error handling via rtc_timer_enqueue properly In cases where RTC hardware does not support alarms, the virtualized RTC interfaces did not have a way to propagate the error up to userland. This patch extends rtc_timer_enqueue so it catches errors from the hardware and returns them upwards to the virtualized interfaces. To simplify error handling, it also internalizes the management of the timer->enabled bit into rtc_timer_enqueue and rtc_timer_remove. Also makes rtc_timer_enqueue and rtc_timer_remove static. 
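As an illustrative sketch (a hypothetical caller, not part of this patch), code sitting on top of the virtualized interface can now observe the failure instead of assuming the alarm was armed: an error from __rtc_set_alarm() travels back through rtc_timer_enqueue() and out of calls such as rtc_update_irq_enable().

#include <linux/rtc.h>
#include <linux/kernel.h>

/* Hypothetical helper: arm the 1 Hz update interrupt and report failure. */
static int example_enable_update_irq(struct rtc_device *rtc)
{
	int err = rtc_update_irq_enable(rtc, 1);

	if (err)
		pr_warn("rtc: could not enable update IRQ: %d\n", err);

	return err;	/* e.g. the error seen on alarm-less RTC hardware */
}
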
Reported-by: David Daney Reported-by: Andreas Schwab Reported-by: Geert Uytterhoeven Diagnosed-by: David Daney Tested-by: David Daney Signed-off-by: John Stultz LKML-Reference: <1295565973-14358-1-git-send-email-john.stultz@linaro.org> Signed-off-by: Thomas Gleixner --- drivers/rtc/interface.c | 49 ++++++++++++++++++++++++++++--------------------- include/linux/rtc.h | 2 -- 2 files changed, 28 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 90384b9f6b2c..f1ba2c696528 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -16,6 +16,9 @@ #include #include +static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); +static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); + static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; @@ -175,16 +178,14 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) return err; if (rtc->aie_timer.enabled) { rtc_timer_remove(rtc, &rtc->aie_timer); - rtc->aie_timer.enabled = 0; } rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = ktime_set(0, 0); if (alarm->enabled) { - rtc->aie_timer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->aie_timer); + err = rtc_timer_enqueue(rtc, &rtc->aie_timer); } mutex_unlock(&rtc->ops_lock); - return 0; + return err; } EXPORT_SYMBOL_GPL(rtc_set_alarm); @@ -195,15 +196,15 @@ int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) return err; if (rtc->aie_timer.enabled != enabled) { - if (enabled) { - rtc->aie_timer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->aie_timer); - } else { + if (enabled) + err = rtc_timer_enqueue(rtc, &rtc->aie_timer); + else rtc_timer_remove(rtc, &rtc->aie_timer); - rtc->aie_timer.enabled = 0; - } } + if (err) + return err; + if (!rtc->ops) err = -ENODEV; else if (!rtc->ops->alarm_irq_enable) @@ -235,12 +236,9 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) now = rtc_tm_to_ktime(tm); rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); rtc->uie_rtctimer.period = ktime_set(1, 0); - rtc->uie_rtctimer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); - } else { + err = rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); + } else rtc_timer_remove(rtc, &rtc->uie_rtctimer); - rtc->uie_rtctimer.enabled = 0; - } out: mutex_unlock(&rtc->ops_lock); @@ -488,10 +486,13 @@ EXPORT_SYMBOL_GPL(rtc_irq_set_freq); * Enqueues a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * + * Sets the enabled bit on the added timer. + * * Must hold ops_lock for proper serialization of timerqueue */ -void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) +static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) { + timer->enabled = 1; timerqueue_add(&rtc->timerqueue, &timer->node); if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) { struct rtc_wkalrm alarm; @@ -501,7 +502,13 @@ void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) schedule_work(&rtc->irqwork); + else if (err) { + timerqueue_del(&rtc->timerqueue, &timer->node); + timer->enabled = 0; + return err; + } } + return 0; } /** @@ -512,13 +519,15 @@ void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) * Removes a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * + * Clears the enabled bit on the removed timer. 
+ * * Must hold ops_lock for proper serialization of timerqueue */ -void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) +static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) { struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); timerqueue_del(&rtc->timerqueue, &timer->node); - + timer->enabled = 0; if (next == &timer->node) { struct rtc_wkalrm alarm; int err; @@ -626,8 +635,7 @@ int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, timer->node.expires = expires; timer->period = period; - timer->enabled = 1; - rtc_timer_enqueue(rtc, timer); + ret = rtc_timer_enqueue(rtc, timer); mutex_unlock(&rtc->ops_lock); return ret; @@ -645,7 +653,6 @@ int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) mutex_lock(&rtc->ops_lock); if (timer->enabled) rtc_timer_remove(rtc, timer); - timer->enabled = 0; mutex_unlock(&rtc->ops_lock); return ret; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index ea760698e6fe..a0b639f8e805 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -244,8 +244,6 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); -void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); -void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, ktime_t expires, ktime_t period); -- cgit v1.2.3-71-gd317 From 3dece370ecc7c6152b3fdd21c6de28179f6e643d Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Fri, 21 Jan 2011 08:49:56 +0100 Subject: mm: System without MMU do not need pte_mkwrite The patch "thp: export maybe_mkwrite" (commit 14fd403f2146) breaks systems without MMU. Error log: CC arch/microblaze/mm/init.o In file included from include/linux/mman.h:14, from arch/microblaze/mm/consistent.c:24: include/linux/mm.h: In function 'maybe_mkwrite': include/linux/mm.h:482: error: implicit declaration of function 'pte_mkwrite' include/linux/mm.h:482: error: incompatible types in assignment Signed-off-by: Michal Simek CC: Andrea Arcangeli Reviewed-by: Rik van Riel CC: Andrew Morton CC: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 956a35532f47..f6385fc17ad4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -470,6 +470,7 @@ static inline void set_compound_order(struct page *page, unsigned long order) page[1].lru.prev = (void *)order; } +#ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want @@ -482,6 +483,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) pte = pte_mkwrite(pte); return pte; } +#endif /* * Multiple processes may "see" the same page. E.g. for untouched -- cgit v1.2.3-71-gd317 From e94965ed5beb23c6fabf7ed31f625e66d7ff28de Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 15 Dec 2010 14:00:19 -0800 Subject: module: show version information for built-in modules in sysfs Currently only drivers that are built as modules have their versions shown in /sys/module//version, but this information might also be useful for built-in drivers as well. 
This especially important for drivers that do not define any parameters - such drivers, if built-in, are completely invisible from userspace. This patch changes MODULE_VERSION() macro so that in case when we are compiling built-in module, version information is stored in a separate section. Kernel then uses this data to create 'version' sysfs attribute in the same fashion it creates attributes for module parameters. Signed-off-by: Dmitry Torokhov Signed-off-by: Rusty Russell --- include/asm-generic/vmlinux.lds.h | 7 +++++ include/linux/module.h | 27 ++++++++++++++++ kernel/params.c | 65 ++++++++++++++++++++++++++++++++------- 3 files changed, 88 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 68649336c4ad..6ebb81030d2d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -364,6 +364,13 @@ VMLINUX_SYMBOL(__start___param) = .; \ *(__param) \ VMLINUX_SYMBOL(__stop___param) = .; \ + } \ + \ + /* Built-in module versions. */ \ + __modver : AT(ADDR(__modver) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start___modver) = .; \ + *(__modver) \ + VMLINUX_SYMBOL(__stop___modver) = .; \ . = ALIGN((align)); \ VMLINUX_SYMBOL(__end_rodata) = .; \ } \ diff --git a/include/linux/module.h b/include/linux/module.h index 8b17fd8c790d..50efcd3ae850 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -58,6 +58,12 @@ struct module_attribute { void (*free)(struct module *); }; +struct module_version_attribute { + struct module_attribute mattr; + const char *module_name; + const char *version; +}; + struct module_kobject { struct kobject kobj; @@ -161,7 +167,28 @@ extern struct module __this_module; Using this automatically adds a checksum of the .c files and the local headers in "srcversion". */ + +#ifdef MODULE #define MODULE_VERSION(_version) MODULE_INFO(version, _version) +#else +#define MODULE_VERSION(_version) \ + extern ssize_t __modver_version_show(struct module_attribute *, \ + struct module *, char *); \ + static struct module_version_attribute __modver_version_attr \ + __used \ + __attribute__ ((__section__ ("__modver"),aligned(sizeof(void *)))) \ + = { \ + .mattr = { \ + .attr = { \ + .name = "version", \ + .mode = S_IRUGO, \ + }, \ + .show = __modver_version_show, \ + }, \ + .module_name = KBUILD_MODNAME, \ + .version = _version, \ + } +#endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware diff --git a/kernel/params.c b/kernel/params.c index 08107d181758..0da1411222b9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, - unsigned int name_skip) +static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; @@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, kobj = kset_find_obj(module_kset, name); if (kobj) { - /* We already have one. Remove params so we can add more. */ mk = to_module_kobject(kobj); - /* We need to remove it before adding parameters. 
*/ - sysfs_remove_group(&mk->kobj, &mk->mp->grp); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, "%s", name); if (err) { kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed add to sysfs, " - "error number %d\n", name, err); - printk(KERN_ERR "The system will be unstable now.\n"); - return; + printk(KERN_ERR + "Module '%s' failed add to sysfs, error number %d\n", + name, err); + printk(KERN_ERR + "The system will be unstable now.\n"); + return NULL; } - /* So that exit path is even. */ + + /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } + return mk; +} + +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) +{ + struct module_kobject *mk; + int err; + + mk = locate_module_kobject(name); + if (!mk) + return; + + /* We need to remove old parameters before adding more. */ + if (mk->mp) + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); @@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) } } +ssize_t __modver_version_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + struct module_version_attribute *vattr = + container_of(mattr, struct module_version_attribute, mattr); + + return sprintf(buf, "%s\n", vattr->version); +} + +extern struct module_version_attribute __start___modver[], __stop___modver[]; + +static void __init version_sysfs_builtin(void) +{ + const struct module_version_attribute *vattr; + struct module_kobject *mk; + int err; + + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { + mk = locate_module_kobject(vattr->module_name); + if (mk) { + err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); + } + } +} /* module-related sysfs stuff */ @@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) } module_sysfs_initialized = 1; + version_sysfs_builtin(); param_sysfs_builtin(); return 0; -- cgit v1.2.3-71-gd317 From 3b90a5b292321b2acac3921f77046ae195aef53f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:32:51 -0600 Subject: module: fix linker error for MODULE_VERSION when !MODULE and CONFIG_SYSFS=n lib/built-in.o:(__modver+0x8): undefined reference to `__modver_version_show' lib/built-in.o:(__modver+0x2c): undefined reference to `__modver_version_show' Simplest to just not emit anything: if they've disabled SYSFS they probably want the smallest kernel possible. Reported-by: Randy Dunlap Signed-off-by: Rusty Russell --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 50efcd3ae850..e7c6385c6683 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -168,7 +168,7 @@ extern struct module __this_module; local headers in "srcversion". 
*/ -#ifdef MODULE +#if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ -- cgit v1.2.3-71-gd317 From b75be4204e7871869b2c268c00783703197aaa7d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 5 Jan 2011 13:27:04 +0100 Subject: param: add null statement to compiled-in module params Add an unused struct declaration statement requiring a terminating semicolon to the compile-in case to provoke an error if __MODULE_INFO() is used without the terminating semicolon. Previously MODULE_ALIAS("foo") (no semicolon) compiled fine if MODULE was not selected. Cc: Dan Carpenter Signed-off-by: Linus Walleij Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 112adf8bd47d..07b41951e3fa 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -16,15 +16,17 @@ /* Chosen so that structs with an unsigned long line up. */ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) -#ifdef MODULE #define ___module_cat(a,b) __mod_ ## a ## b #define __module_cat(a,b) ___module_cat(a,b) +#ifdef MODULE #define __MODULE_INFO(tag, name, info) \ static const char __module_cat(name,__LINE__)[] \ __used __attribute__((section(".modinfo"), unused, aligned(1))) \ = __stringify(tag) "=" info #else /* !MODULE */ -#define __MODULE_INFO(tag, name, info) +/* This struct is here for syntactic coherency, it is not used */ +#define __MODULE_INFO(tag, name, info) \ + struct __module_cat(name,__LINE__) {} #endif #define __MODULE_PARM_TYPE(name, _type) \ __MODULE_INFO(parmtype, name##type, #name ":" _type) -- cgit v1.2.3-71-gd317 From 7ef88ad561457c0346355dfd1f53e503ddfde719 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:45:10 -0600 Subject: BUILD_BUG_ON: make it handle more cases BUILD_BUG_ON used to use the optimizer to do code elimination or fail at link time; it was changed to first the size of a negative array (a nicer compile time error), then (in 8c87df457cb58fe75b9b893007917cf8095660a0) to a bitfield. This forced us to change some non-constant cases to MAYBE_BUILD_BUG_ON(); as Jan points out in that commit, it didn't work as intended anyway. bitfields: needs a literal constant at parse time, and can't be put under "if (__builtin_constant_p(x))" for example. negative array: can handle anything, but if the compiler can't tell it's a constant, silently has no effect. link time: breaks link if the compiler can't determine the value, but the linker output is not usually as informative as a compiler error. If we use the negative-array-size method *and* the link time trick, we get the ability to use BUILD_BUG_ON() under __builtin_constant_p() branches, and maximal ability for the compiler to detect errors at build time. We also document it thoroughly. Signed-off-by: Rusty Russell Cc: Jan Beulich Acked-by: Hollis Blanchard --- include/linux/kernel.h | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d07d8057e440..864712f3653d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -575,12 +575,6 @@ struct sysinfo { char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. 
*/ }; -/* Force a compilation error if condition is true */ -#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) - -/* Force a compilation error if condition is constant and true */ -#define MAYBE_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)])) - /* Force a compilation error if a constant expression is not a power of 2 */ #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) @@ -592,6 +586,33 @@ struct sysinfo { #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +/** + * BUILD_BUG_ON - break compile if a condition is true. + * @cond: the condition which the compiler should know is false. + * + * If you have some code which relies on certain constants being equal, or + * other compile-time-evaluated condition, you should use BUILD_BUG_ON to + * detect if someone changes it. + * + * The implementation uses gcc's reluctance to create a negative array, but + * gcc (as of 4.4) only emits that error for obvious cases (eg. not arguments + * to inline functions). So as a fallback we use the optimizer; if it can't + * prove the condition is false, it will cause a link error on the undefined + * "__build_bug_on_failed". This error message can be harder to track down + * though, hence the two different methods. + */ +#ifndef __OPTIMIZE__ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#else +extern int __build_bug_on_failed; +#define BUILD_BUG_ON(condition) \ + do { \ + ((void)sizeof(char[1 - 2*!!(condition)])); \ + if (condition) __build_bug_on_failed = 1; \ + } while(0) +#endif +#define MAYBE_BUILD_BUG_ON(condition) BUILD_BUG_ON(condition) + /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) -- cgit v1.2.3-71-gd317 From 1765e3a4933ea0870fabd755feffc5473c4363ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:45:10 -0600 Subject: Remove MAYBE_BUILD_BUG_ON Now BUILD_BUG_ON() can handle optimizable constants, we don't need MAYBE_BUILD_BUG_ON any more. 
Signed-off-by: Rusty Russell --- include/linux/gfp.h | 2 +- include/linux/kernel.h | 1 - include/linux/kmemcheck.h | 2 +- include/linux/virtio_config.h | 5 ++++- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a3b148a91874..0b84c61607e8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -249,7 +249,7 @@ static inline enum zone_type gfp_zone(gfp_t flags) ((1 << ZONES_SHIFT) - 1); if (__builtin_constant_p(bit)) - MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); else { #ifdef CONFIG_DEBUG_VM BUG_ON((GFP_ZONE_BAD >> bit) & 1); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 864712f3653d..e2f4d6af2125 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -611,7 +611,6 @@ extern int __build_bug_on_failed; if (condition) __build_bug_on_failed = 1; \ } while(0) #endif -#define MAYBE_BUILD_BUG_ON(condition) BUILD_BUG_ON(condition) /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 08d7dc4ddf40..39f8453239f7 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -76,7 +76,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); \ _n = (long) &((ptr)->name##_end) \ - (long) &((ptr)->name##_begin); \ - MAYBE_BUILD_BUG_ON(_n < 0); \ + BUILD_BUG_ON(_n < 0); \ \ kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ } while (0) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 0093dd7c1d6f..800617b4ddd5 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -109,7 +109,10 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, unsigned int fbit) { /* Did you forget to fix assumptions on max features? */ - MAYBE_BUILD_BUG_ON(fbit >= 32); + if (__builtin_constant_p(fbit)) + BUILD_BUG_ON(fbit >= 32); + else + BUG_ON(fbit >= 32); if (fbit < VIRTIO_TRANSPORT_F_START) virtio_check_driver_offered_feature(vdev, fbit); -- cgit v1.2.3-71-gd317
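For illustration (a sketch assuming a reasonably recent gcc, not taken from the patches themselves), the reworked BUILD_BUG_ON() supports exactly the gfp_zone()/virtio_has_feature() pattern shown above: a constant-folded argument is diagnosed at compile time, while a runtime value compiles quietly and can be backed by an ordinary BUG_ON(). The function below is a made-up example of that pattern.

#include <linux/kernel.h>
#include <linux/bug.h>

/* Hypothetical validity check: the bit index must fit in a 32-bit mask. */
static inline void example_check_bit(unsigned int bit)
{
	if (__builtin_constant_p(bit))
		BUILD_BUG_ON(bit >= 32);	/* rejected while compiling */
	else
		BUG_ON(bit >= 32);		/* rejected at run time */
}

When example_check_bit() is inlined with a constant argument, the compiler folds the condition and the negative-array or link-time check fires; with a variable argument the __builtin_constant_p() branch is discarded and only the BUG_ON() remains.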