From a9e61e25f9d2e7e43bf17625f5cb56c9e0a89b17 Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Tue, 31 Mar 2009 15:12:56 -0500
Subject: lockd: call locks_release_private to cleanup per-filesystem state

For every lock request lockd creates a new file_lock object
in nlmsvc_setgrantargs() by copying the passed in file_lock with
locks_copy_lock(). A filesystem can attach its own lock_operations
vector to the file_lock. It has to be cleaned up at the end of the
file_lock's life. However, lockd doesn't do it today, yet it
asserts in nlmclnt_release_lockargs() that the per-filesystem
state is clean.
This patch fixes it by exporting locks_release_private() and adding
it to nlmsvc_freegrantargs(), to be symmetrical to creating a
file_lock in nlmsvc_setgrantargs().

Signed-off-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5bed436f4353..5ba615e8f533 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1108,6 +1108,7 @@ extern void locks_copy_lock(struct file_lock *, struct file_lock *);
 extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
 extern void locks_remove_posix(struct file *, fl_owner_t);
 extern void locks_remove_flock(struct file *);
+extern void locks_release_private(struct file_lock *);
 extern void posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
-- 
cgit v1.2.3-71-gd317


From abc5c44d6284fab8fb21bcfc52c0f16f980637df Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 23 Apr 2009 19:31:25 -0400
Subject: SUNRPC: Fix error return value of svc_addr_len()

The svc_addr_len() helper function returns -EAFNOSUPPORT if it doesn't
recognize the address family of the passed-in socket address.  However,
the return type of this function is size_t, which means -EAFNOSUPPORT
is turned into a very large positive value in this case.

The check in svc_udp_recvfrom() to see if the return value is less
than zero therefore won't work at all.

Additionally, handle_connect_req() passes this value directly to
memset().  This could cause memset() to clobber a large chunk of memory
if svc_addr_len() has returned an error.  Currently the address family
of these addresses, however, is known to be supported long before
handle_connect_req() is called, so this isn't a real risk.

Change the error return value of svc_addr_len() to zero, which fits in
the range of size_t, and is safer to pass to memset() directly.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/sunrpc/svc_xprt.h | 5 +++--
 net/sunrpc/svcsock.c            | 7 ++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 0d9cb6ef28b0..d790c52525cc 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -118,7 +118,7 @@ static inline unsigned short svc_addr_port(const struct sockaddr *sa)
 	return 0;
 }
 
-static inline size_t svc_addr_len(struct sockaddr *sa)
+static inline size_t svc_addr_len(const struct sockaddr *sa)
 {
 	switch (sa->sa_family) {
 	case AF_INET:
@@ -126,7 +126,8 @@ static inline size_t svc_addr_len(struct sockaddr *sa)
 	case AF_INET6:
 		return sizeof(struct sockaddr_in6);
 	}
-	return -EAFNOSUPPORT;
+
+	return 0;
 }
 
 static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index af3198814c15..8b0832834135 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -426,13 +426,14 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
 	} buffer;
 	struct cmsghdr *cmh = &buffer.hdr;
-	int		err, len;
 	struct msghdr msg = {
 		.msg_name = svc_addr(rqstp),
 		.msg_control = cmh,
 		.msg_controllen = sizeof(buffer),
 		.msg_flags = MSG_DONTWAIT,
 	};
+	size_t len;
+	int err;
 
 	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
 	    /* udp sockets need large rcvbuf as all pending
@@ -464,8 +465,8 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 		return -EAGAIN;
 	}
 	len = svc_addr_len(svc_addr(rqstp));
-	if (len < 0)
-		return len;
+	if (len == 0)
+		return -EAFNOSUPPORT;
 	rqstp->rq_addrlen = len;
 	if (skb->tstamp.tv64 == 0) {
 		skb->tstamp = ktime_get_real();
-- 
cgit v1.2.3-71-gd317


From 335c54bdc4d3bacdbd619ec95cd0b352435bd37f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 23 Apr 2009 19:32:25 -0400
Subject: NFSD: Prevent a buffer overflow in svc_xprt_names()

The svc_xprt_names() function can overflow its buffer if it's so near
the end of the passed in buffer that the "name too long" string still
doesn't fit.  Of course, it could never tell if it was near the end
of the passed in buffer, since its only caller passes in zero as the
buffer length.

Let's make this API a little safer.

Change svc_xprt_names() so it *always* checks for a buffer overflow,
and change its only caller to pass in the correct buffer length.

If svc_xprt_names() does overflow its buffer, it now fails with an
ENAMETOOLONG errno, instead of trying to write a message at the end
of the buffer.  I don't like this much, but I can't figure out a clean
way that's always safe to return some of the names, *and* an
indication that the buffer was not long enough.

The displayed error when doing a 'cat /proc/fs/nfsd/portlist' is
"File name too long".

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c                |  2 +-
 include/linux/sunrpc/svc_xprt.h |  2 +-
 net/sunrpc/svc_xprt.c           | 56 ++++++++++++++++++++++++++++-------------
 3 files changed, 41 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index e051847b93fb..6a1cd908e6bc 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -918,7 +918,7 @@ static ssize_t __write_ports_names(char *buf)
 {
 	if (nfsd_serv == NULL)
 		return 0;
-	return svc_xprt_names(nfsd_serv, buf, 0);
+	return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
 }
 
 /*
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index d790c52525cc..2223ae0b5ed5 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -83,7 +83,7 @@ int	svc_port_is_privileged(struct sockaddr *sin);
 int	svc_print_xprts(char *buf, int maxlen);
 struct	svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
 			const sa_family_t af, const unsigned short port);
-int	svc_xprt_names(struct svc_serv *serv, char *buf, int buflen);
+int	svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen);
 
 static inline void svc_xprt_get(struct svc_xprt *xprt)
 {
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f200393ac877..6f33d33cc064 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1098,36 +1098,58 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
 }
 EXPORT_SYMBOL_GPL(svc_find_xprt);
 
-/*
- * Format a buffer with a list of the active transports. A zero for
- * the buflen parameter disables target buffer overflow checking.
+static int svc_one_xprt_name(const struct svc_xprt *xprt,
+			     char *pos, int remaining)
+{
+	int len;
+
+	len = snprintf(pos, remaining, "%s %u\n",
+			xprt->xpt_class->xcl_name,
+			svc_xprt_local_port(xprt));
+	if (len >= remaining)
+		return -ENAMETOOLONG;
+	return len;
+}
+
+/**
+ * svc_xprt_names - format a buffer with a list of transport names
+ * @serv: pointer to an RPC service
+ * @buf: pointer to a buffer to be filled in
+ * @buflen: length of buffer to be filled in
+ *
+ * Fills in @buf with a string containing a list of transport names,
+ * each name terminated with '\n'.
+ *
+ * Returns positive length of the filled-in string on success; otherwise
+ * a negative errno value is returned if an error occurs.
  */
-int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
+int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen)
 {
 	struct svc_xprt *xprt;
-	char xprt_str[64];
-	int totlen = 0;
-	int len;
+	int len, totlen;
+	char *pos;
 
 	/* Sanity check args */
 	if (!serv)
 		return 0;
 
 	spin_lock_bh(&serv->sv_lock);
+
+	pos = buf;
+	totlen = 0;
 	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
-		len = snprintf(xprt_str, sizeof(xprt_str),
-			       "%s %d\n", xprt->xpt_class->xcl_name,
-			       svc_xprt_local_port(xprt));
-		/* If the string was truncated, replace with error string */
-		if (len >= sizeof(xprt_str))
-			strcpy(xprt_str, "name-too-long\n");
-		/* Don't overflow buffer */
-		len = strlen(xprt_str);
-		if (buflen && (len + totlen >= buflen))
+		len = svc_one_xprt_name(xprt, pos, buflen - totlen);
+		if (len < 0) {
+			*buf = '\0';
+			totlen = len;
+		}
+		if (len <= 0)
 			break;
-		strcpy(buf+totlen, xprt_str);
+
+		pos += len;
 		totlen += len;
 	}
+
 	spin_unlock_bh(&serv->sv_lock);
 	return totlen;
 }
-- 
cgit v1.2.3-71-gd317


From bfba9ab4c64f0e5c33930711e6c073c285e01fcf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 23 Apr 2009 19:32:33 -0400
Subject: SUNRPC: pass buffer size to svc_addsock()

Adjust the synopsis of svc_addsock() to pass in the size of the output
buffer.  Add a documenting comment.

This is a cosmetic change for now.  A subsequent patch will make sure
the buffer length is passed to one_sock_name(), where the length will
actually be useful.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c               |  2 +-
 include/linux/sunrpc/svcsock.h |  3 ++-
 net/sunrpc/svcsock.c           | 16 +++++++++++++---
 3 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6a1cd908e6bc..1f1c2159b802 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -943,7 +943,7 @@ static ssize_t __write_ports_addfd(char *buf)
 	if (err != 0)
 		goto out;
 
-	err = svc_addsock(nfsd_serv, fd, buf);
+	err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
 	if (err < 0)
 		lockd_down();
 
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 483e10380aae..e23241c53f42 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -39,7 +39,8 @@ int		svc_send(struct svc_rqst *);
 void		svc_drop(struct svc_rqst *);
 void		svc_sock_update_bufs(struct svc_serv *serv);
 int		svc_sock_names(char *buf, struct svc_serv *serv, char *toclose);
-int		svc_addsock(struct svc_serv *serv, int fd, char *name_return);
+int		svc_addsock(struct svc_serv *serv, const int fd,
+					char *name_return, const size_t len);
 void		svc_init_xprt_sock(void);
 void		svc_cleanup_xprt_sock(void);
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 8b0832834135..6bec1e25b542 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1128,9 +1128,19 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	return svsk;
 }
 
-int svc_addsock(struct svc_serv *serv,
-		int fd,
-		char *name_return)
+/**
+ * svc_addsock - add a listener socket to an RPC service
+ * @serv: pointer to RPC service to which to add a new listener
+ * @fd: file descriptor of the new listener
+ * @name_return: pointer to buffer to fill in with name of listener
+ * @len: size of the buffer
+ *
+ * Fills in socket name and returns positive length of name if successful.
+ * Name is terminated with '\n'.  On error, returns a negative errno
+ * value.
+ */
+int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
+		const size_t len)
 {
 	int err = 0;
 	struct socket *so = sockfd_lookup(fd, &err);
-- 
cgit v1.2.3-71-gd317


From 8435d34dbbe75678c3cdad3d53b1e7996a79b3bf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 23 Apr 2009 19:32:40 -0400
Subject: SUNRPC: pass buffer size to svc_sock_names()

Adjust the synopsis of svc_sock_names() to pass in the size of the
output buffer.  Add a documenting comment.

This is a cosmetic change for now.  A subsequent patch will make sure
the buffer length is passed to one_sock_name(), where the length will
actually be useful.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c               |  3 ++-
 include/linux/sunrpc/svcsock.h |  4 +++-
 net/sunrpc/svcsock.c           | 19 +++++++++++++++++--
 3 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 1f1c2159b802..b64a7fbfccf5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -966,7 +966,8 @@ static ssize_t __write_ports_delfd(char *buf)
 		return -ENOMEM;
 
 	if (nfsd_serv != NULL)
-		len = svc_sock_names(buf, nfsd_serv, toclose);
+		len = svc_sock_names(nfsd_serv, buf,
+					SIMPLE_TRANSACTION_LIMIT, toclose);
 	if (len >= 0)
 		lockd_down();
 
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index e23241c53f42..827163138949 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -38,7 +38,9 @@ int		svc_recv(struct svc_rqst *, long);
 int		svc_send(struct svc_rqst *);
 void		svc_drop(struct svc_rqst *);
 void		svc_sock_update_bufs(struct svc_serv *serv);
-int		svc_sock_names(char *buf, struct svc_serv *serv, char *toclose);
+int		svc_sock_names(struct svc_serv *serv, char *buf,
+					const size_t buflen,
+					const char *toclose);
 int		svc_addsock(struct svc_serv *serv, const int fd,
 					char *name_return, const size_t len);
 void		svc_init_xprt_sock(void);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 6bec1e25b542..032b52ea9541 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -259,8 +259,23 @@ static int one_sock_name(char *buf, struct svc_sock *svsk)
 	return len;
 }
 
-int
-svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
+/**
+ * svc_sock_names - construct a list of listener names in a string
+ * @serv: pointer to RPC service
+ * @buf: pointer to a buffer to fill in with socket names
+ * @buflen: size of the buffer to be filled
+ * @toclose: pointer to '\0'-terminated C string containing the name
+ *		of a listener to be closed
+ *
+ * Fills in @buf with a '\n'-separated list of names of listener
+ * sockets.  If @toclose is not NULL, the socket named by @toclose
+ * is closed, and is not included in the output list.
+ *
+ * Returns positive length of the socket name string, or a negative
+ * errno value on error.
+ */
+int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen,
+		   const char *toclose)
 {
 	struct svc_sock *svsk, *closesk = NULL;
 	int len = 0;
-- 
cgit v1.2.3-71-gd317


From 4ed0d3e6c64cfd9ba4ceb2099b10d1cf8ece4320 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Fri, 24 Apr 2009 17:30:20 -0700
Subject: Intel IOMMU Pass Through Support

The patch adds kernel parameter intel_iommu=pt to set up pass through
mode in context mapping entry. This disables DMAR in linux kernel; but
KVM still runs on VT-d and interrupt remapping still works.

In this mode, kernel uses swiotlb for DMA API functions but other VT-d
functionalities are enabled for KVM. KVM always uses multi level
translation page table in VT-d. By default, pass through mode is disabled
in kernel.

This is useful when people don't want to enable VT-d DMAR in the kernel
but still want to use KVM and interrupt remapping, for reasons such as
DMAR performance concerns or debugging.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Weidong Han <weidong@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 Documentation/kernel-parameters.txt |   1 +
 arch/ia64/include/asm/iommu.h       |   1 +
 arch/ia64/kernel/pci-swiotlb.c      |   2 +-
 arch/x86/include/asm/iommu.h        |   1 +
 arch/x86/kernel/pci-dma.c           |   6 ++
 arch/x86/kernel/pci-swiotlb.c       |   3 +-
 drivers/pci/dmar.c                  |  11 ++-
 drivers/pci/intel-iommu.c           | 180 ++++++++++++++++++++++++++----------
 include/linux/dma_remapping.h       |   8 ++
 include/linux/intel-iommu.h         |   2 +
 10 files changed, 165 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 600cdd72900c..fa4faeb7597f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -965,6 +965,7 @@ and is between 256 and 4096 characters. It is defined in the file
 		nomerge
 		forcesac
 		soft
+		pt	[x86, IA64]
 
 	io7=		[HW] IO7 for Marvel based alpha systems
 			See comment before marvel_specify_io7 in
diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
index 0490794fe4aa..37d41ca5645a 100644
--- a/arch/ia64/include/asm/iommu.h
+++ b/arch/ia64/include/asm/iommu.h
@@ -9,6 +9,7 @@ extern void pci_iommu_shutdown(void);
 extern void no_iommu_init(void);
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
+extern int iommu_pass_through;
 extern void iommu_dma_init(void);
 extern void machvec_init(const char *name);
 
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 285aae8431c6..223abb134105 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -46,7 +46,7 @@ void __init swiotlb_dma_init(void)
 
 void __init pci_swiotlb_init(void)
 {
-	if (!iommu_detected) {
+	if (!iommu_detected || iommu_pass_through) {
 #ifdef CONFIG_IA64_GENERIC
 		swiotlb = 1;
 		printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index af326a2975b5..fd6d21bbee6c 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,7 @@ extern void no_iommu_init(void);
 extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
+extern int iommu_pass_through;
 
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 745579bc8256..8cad0d854242 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -160,6 +160,8 @@ again:
 	return page_address(page);
 }
 
+extern int iommu_pass_through;
+
 /*
  * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
  * documentation.
@@ -209,6 +211,10 @@ static __init int iommu_setup(char *p)
 #ifdef CONFIG_SWIOTLB
 		if (!strncmp(p, "soft", 4))
 			swiotlb = 1;
+		if (!strncmp(p, "pt", 2)) {
+			iommu_pass_through = 1;
+			return 1;
+		}
 #endif
 
 		gart_parse_options(p);
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e268..3a0c51e0ba6d 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
 {
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
 #ifdef CONFIG_X86_64
-	if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
+	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
+		iommu_pass_through)
 	       swiotlb = 1;
 #endif
 	if (swiotlb_force)
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index fa3a11365ec3..d3d86b749eee 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -515,6 +515,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
 	u32 ver;
 	static int iommu_allocated = 0;
 	int agaw = 0;
+	int msagaw = 0;
 
 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 	if (!iommu)
@@ -535,12 +536,20 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
 	agaw = iommu_calculate_agaw(iommu);
 	if (agaw < 0) {
 		printk(KERN_ERR
-			"Cannot get a valid agaw for iommu (seq_id = %d)\n",
+		       "Cannot get a valid agaw for iommu (seq_id = %d)\n",
+		       iommu->seq_id);
+		goto error;
+	}
+	msagaw = iommu_calculate_max_sagaw(iommu);
+	if (msagaw < 0) {
+		printk(KERN_ERR
+			"Cannot get a valid max agaw for iommu (seq_id = %d)\n",
 			iommu->seq_id);
 		goto error;
 	}
 #endif
 	iommu->agaw = agaw;
+	iommu->msagaw = msagaw;
 
 	/* the registers might be more than one page */
 	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 001b328adf80..13121821db7f 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -53,6 +53,8 @@
 
 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
 
+#define MAX_AGAW_WIDTH 64
+
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
@@ -127,8 +129,6 @@ static inline void context_set_fault_enable(struct context_entry *context)
 	context->lo &= (((u64)-1) << 2) | 1;
 }
 
-#define CONTEXT_TT_MULTI_LEVEL 0
-
 static inline void context_set_translation_type(struct context_entry *context,
 						unsigned long value)
 {
@@ -288,6 +288,7 @@ int dmar_disabled = 1;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
+int iommu_pass_through;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -397,17 +398,13 @@ void free_iova_mem(struct iova *iova)
 
 static inline int width_to_agaw(int width);
 
-/* calculate agaw for each iommu.
- * "SAGAW" may be different across iommus, use a default agaw, and
- * get a supported less agaw for iommus that don't support the default agaw.
- */
-int iommu_calculate_agaw(struct intel_iommu *iommu)
+static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 {
 	unsigned long sagaw;
 	int agaw = -1;
 
 	sagaw = cap_sagaw(iommu->cap);
-	for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	for (agaw = width_to_agaw(max_gaw);
 	     agaw >= 0; agaw--) {
 		if (test_bit(agaw, &sagaw))
 			break;
@@ -416,6 +413,24 @@ int iommu_calculate_agaw(struct intel_iommu *iommu)
 	return agaw;
 }
 
+/*
+ * Calculate max SAGAW for each iommu.
+ */
+int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
+}
+
+/*
+ * calculate agaw for each iommu.
+ * "SAGAW" may be different across iommus, use a default agaw, and
+ * get a supported less agaw for iommus that don't support the default agaw.
+ */
+int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+}
+
 /* in native case, each domain is related to only one iommu */
 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 {
@@ -1321,8 +1336,8 @@ static void domain_exit(struct dmar_domain *domain)
 	free_domain_mem(domain);
 }
 
-static int domain_context_mapping_one(struct dmar_domain *domain,
-				      int segment, u8 bus, u8 devfn)
+static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
+				 u8 bus, u8 devfn, int translation)
 {
 	struct context_entry *context;
 	unsigned long flags;
@@ -1335,7 +1350,10 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 
 	pr_debug("Set context mapping for %02x:%02x.%d\n",
 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
 	BUG_ON(!domain->pgd);
+	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
+	       translation != CONTEXT_TT_MULTI_LEVEL);
 
 	iommu = device_to_iommu(segment, bus, devfn);
 	if (!iommu)
@@ -1395,9 +1413,18 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 	}
 
 	context_set_domain_id(context, id);
-	context_set_address_width(context, iommu->agaw);
-	context_set_address_root(context, virt_to_phys(pgd));
-	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
+
+	/*
+	 * In pass through mode, AW must be programmed to indicate the largest
+	 * AGAW value supported by hardware. And ASR is ignored by hardware.
+	 */
+	if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
+		context_set_address_width(context, iommu->agaw);
+		context_set_address_root(context, virt_to_phys(pgd));
+	} else
+		context_set_address_width(context, iommu->msagaw);
+
+	context_set_translation_type(context, translation);
 	context_set_fault_enable(context);
 	context_set_present(context);
 	domain_flush_cache(domain, context, sizeof(*context));
@@ -1422,13 +1449,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 }
 
 static int
-domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
+domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
+			int translation)
 {
 	int ret;
 	struct pci_dev *tmp, *parent;
 
 	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
-					 pdev->bus->number, pdev->devfn);
+					 pdev->bus->number, pdev->devfn,
+					 translation);
 	if (ret)
 		return ret;
 
@@ -1442,7 +1471,7 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
 		ret = domain_context_mapping_one(domain,
 						 pci_domain_nr(parent->bus),
 						 parent->bus->number,
-						 parent->devfn);
+						 parent->devfn, translation);
 		if (ret)
 			return ret;
 		parent = parent->bus->self;
@@ -1450,12 +1479,14 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
 	if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
 		return domain_context_mapping_one(domain,
 					pci_domain_nr(tmp->subordinate),
-					tmp->subordinate->number, 0);
+					tmp->subordinate->number, 0,
+					translation);
 	else /* this is a legacy PCI bridge */
 		return domain_context_mapping_one(domain,
 						  pci_domain_nr(tmp->bus),
 						  tmp->bus->number,
-						  tmp->devfn);
+						  tmp->devfn,
+						  translation);
 }
 
 static int domain_context_mapped(struct pci_dev *pdev)
@@ -1752,7 +1783,7 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev,
 		goto error;
 
 	/* context entry init */
-	ret = domain_context_mapping(domain, pdev);
+	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
 	if (!ret)
 		return 0;
 error:
@@ -1853,6 +1884,23 @@ static inline void iommu_prepare_isa(void)
 }
 #endif /* !CONFIG_DMAR_FLPY_WA */
 
+/* Initialize each context entry as pass through.*/
+static int __init init_context_pass_through(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct dmar_domain *domain;
+	int ret;
+
+	for_each_pci_dev(pdev) {
+		domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+		ret = domain_context_mapping(domain, pdev,
+					     CONTEXT_TT_PASS_THROUGH);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static int __init init_dmars(void)
 {
 	struct dmar_drhd_unit *drhd;
@@ -1860,6 +1908,7 @@ static int __init init_dmars(void)
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
 	int i, ret;
+	int pass_through = 1;
 
 	/*
 	 * for each drhd
@@ -1913,7 +1962,15 @@ static int __init init_dmars(void)
 			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
 			goto error;
 		}
+		if (!ecap_pass_through(iommu->ecap))
+			pass_through = 0;
 	}
+	if (iommu_pass_through)
+		if (!pass_through) {
+			printk(KERN_INFO
+			       "Pass Through is not supported by hardware.\n");
+			iommu_pass_through = 0;
+		}
 
 	/*
 	 * Start from the sane iommu hardware state.
@@ -1976,37 +2033,57 @@ static int __init init_dmars(void)
 			       "IOMMU: enable interrupt remapping failed\n");
 	}
 #endif
+	/*
+	 * If pass through is set and enabled, context entries of all pci
+	 * devices are intialized by pass through translation type.
+	 */
+	if (iommu_pass_through) {
+		ret = init_context_pass_through();
+		if (ret) {
+			printk(KERN_ERR "IOMMU: Pass through init failed.\n");
+			iommu_pass_through = 0;
+		}
+	}
 
 	/*
-	 * For each rmrr
-	 *   for each dev attached to rmrr
-	 *   do
-	 *     locate drhd for dev, alloc domain for dev
-	 *     allocate free domain
-	 *     allocate page table entries for rmrr
-	 *     if context not allocated for bus
-	 *           allocate and init context
-	 *           set present in root table for this bus
-	 *     init context with domain, translation etc
-	 *    endfor
-	 * endfor
+	 * If pass through is not set or not enabled, setup context entries for
+	 * identity mappings for rmrr, gfx, and isa.
 	 */
-	for_each_rmrr_units(rmrr) {
-		for (i = 0; i < rmrr->devices_cnt; i++) {
-			pdev = rmrr->devices[i];
-			/* some BIOS lists non-exist devices in DMAR table */
-			if (!pdev)
-				continue;
-			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
-			if (ret)
-				printk(KERN_ERR
+	if (!iommu_pass_through) {
+		/*
+		 * For each rmrr
+		 *   for each dev attached to rmrr
+		 *   do
+		 *     locate drhd for dev, alloc domain for dev
+		 *     allocate free domain
+		 *     allocate page table entries for rmrr
+		 *     if context not allocated for bus
+		 *           allocate and init context
+		 *           set present in root table for this bus
+		 *     init context with domain, translation etc
+		 *    endfor
+		 * endfor
+		 */
+		for_each_rmrr_units(rmrr) {
+			for (i = 0; i < rmrr->devices_cnt; i++) {
+				pdev = rmrr->devices[i];
+				/*
+				 * some BIOS lists non-exist devices in DMAR
+				 * table.
+				 */
+				if (!pdev)
+					continue;
+				ret = iommu_prepare_rmrr_dev(rmrr, pdev);
+				if (ret)
+					printk(KERN_ERR
 				 "IOMMU: mapping reserved region failed\n");
+			}
 		}
-	}
 
-	iommu_prepare_gfx_mapping();
+		iommu_prepare_gfx_mapping();
 
-	iommu_prepare_isa();
+		iommu_prepare_isa();
+	}
 
 	/*
 	 * for each drhd
@@ -2117,7 +2194,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev)
 
 	/* make sure context mapping is ok */
 	if (unlikely(!domain_context_mapped(pdev))) {
-		ret = domain_context_mapping(domain, pdev);
+		ret = domain_context_mapping(domain, pdev,
+					     CONTEXT_TT_MULTI_LEVEL);
 		if (ret) {
 			printk(KERN_ERR
 				"Domain context map for %s failed",
@@ -2786,7 +2864,7 @@ int __init intel_iommu_init(void)
 	 * Check the need for DMA-remapping initialization now.
 	 * Above initialization will also be used by Interrupt-remapping.
 	 */
-	if (no_iommu || swiotlb || dmar_disabled)
+	if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
 		return -ENODEV;
 
 	iommu_init_mempool();
@@ -2806,7 +2884,15 @@ int __init intel_iommu_init(void)
 
 	init_timer(&unmap_timer);
 	force_iommu = 1;
-	dma_ops = &intel_dma_ops;
+
+	if (!iommu_pass_through) {
+		printk(KERN_INFO
+		       "Multi-level page-table translation for DMAR.\n");
+		dma_ops = &intel_dma_ops;
+	} else
+		printk(KERN_INFO
+		       "DMAR: Pass through translation for DMAR.\n");
+
 	init_iommu_sysfs();
 
 	register_iommu(&intel_iommu_ops);
@@ -3146,7 +3232,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
 		return -EFAULT;
 	}
 
-	ret = domain_context_mapping(dmar_domain, pdev);
+	ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 1a455f1f86d7..e0a03aff63d9 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -13,6 +13,9 @@
 #define DMA_PTE_WRITE (2)
 #define DMA_PTE_SNP (1 << 11)
 
+#define CONTEXT_TT_MULTI_LEVEL	0
+#define CONTEXT_TT_PASS_THROUGH 2
+
 struct intel_iommu;
 struct dmar_domain;
 struct root_entry;
@@ -21,11 +24,16 @@ extern void free_dmar_iommu(struct intel_iommu *iommu);
 
 #ifdef CONFIG_DMAR
 extern int iommu_calculate_agaw(struct intel_iommu *iommu);
+extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
 #else
 static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
 {
 	return 0;
 }
+static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+	return 0;
+}
 #endif
 
 extern int dmar_disabled;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index aa8c53171233..7246971a7feb 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -120,6 +120,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 	(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
 #define ecap_coherent(e)	((e) & 0x1)
 #define ecap_qis(e)		((e) & 0x2)
+#define ecap_pass_through(e)	((e >> 6) & 0x1)
 #define ecap_eim_support(e)	((e >> 4) & 0x1)
 #define ecap_ir_support(e)	((e >> 3) & 0x1)
 #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
@@ -302,6 +303,7 @@ struct intel_iommu {
 	spinlock_t	register_lock; /* protect register handling */
 	int		seq_id;	/* sequence id of the iommu */
 	int		agaw; /* agaw of this iommu */
+	int		msagaw; /* max sagaw of this iommu */
 	unsigned int 	irq;
 	unsigned char 	name[13];    /* Device Name */
 
-- 
cgit v1.2.3-71-gd317


From c654b8a9cba6002aad1c01919e4928a79a4a6dcf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 16 Apr 2009 17:33:25 -0400
Subject: nfsd: support ext4 i_version

ext4 supports a real NFSv4 change attribute, which is bumped whenever
the ctime would be updated, including times when two updates arrive
within a jiffy of each other.  (Note that although ext4 has space for
nanosecond-precision ctime, the real resolution is lower: it actually
uses jiffies as the time-source.)  This ensures clients will invalidate
their caches when they need to.

There is some fear that keeping the i_version up-to-date could have
performance drawbacks, so for now it's turned on only by a mount option.
We hope to do something better eventually.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Theodore Tso <tytso@mit.edu>
---
 fs/nfsd/nfs3xdr.c          |  1 +
 fs/nfsd/nfs4xdr.c          | 63 ++++++++++++++++++++++++++++++----------------
 include/linux/nfsd/nfsfh.h |  7 ++++++
 include/linux/nfsd/xdr4.h  | 17 ++++++++++---
 4 files changed, 63 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 17d0dd997204..01d4ec1c88e0 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -272,6 +272,7 @@ void fill_post_wcc(struct svc_fh *fhp)
 
 	err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
 			&fhp->fh_post_attr);
+	fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
 	if (err)
 		fhp->fh_post_saved = 0;
 	else
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 4a71fcd3f036..12d36a7361cd 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1490,13 +1490,41 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	memcpy(p, ptr, nbytes);					\
 	p += XDR_QUADLEN(nbytes);				\
 }} while (0)
-#define WRITECINFO(c)		do {				\
-	*p++ = htonl(c.atomic);					\
-	*p++ = htonl(c.before_ctime_sec);				\
-	*p++ = htonl(c.before_ctime_nsec);				\
-	*p++ = htonl(c.after_ctime_sec);				\
-	*p++ = htonl(c.after_ctime_nsec);				\
-} while (0)
+
+static void write32(__be32 **p, u32 n)
+{
+	*(*p)++ = n;
+}
+
+static void write64(__be32 **p, u64 n)
+{
+	write32(p, (u32)(n >> 32));
+	write32(p, (u32)n);
+}
+
+static void write_change(__be32 **p, struct kstat *stat, struct inode *inode)
+{
+	if (IS_I_VERSION(inode)) {
+		write64(p, inode->i_version);
+	} else {
+		write32(p, stat->ctime.tv_sec);
+		write32(p, stat->ctime.tv_nsec);
+	}
+}
+
+static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
+{
+	write32(p, c->atomic);
+	if (c->change_supported) {
+		write64(p, c->before_change);
+		write64(p, c->after_change);
+	} else {
+		write32(p, c->before_ctime_sec);
+		write32(p, c->before_ctime_nsec);
+		write32(p, c->after_ctime_sec);
+		write32(p, c->after_ctime_nsec);
+	}
+}
 
 #define RESERVE_SPACE(nbytes)	do {				\
 	p = resp->p;						\
@@ -1849,16 +1877,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME);
 	}
 	if (bmval0 & FATTR4_WORD0_CHANGE) {
-		/*
-		 * Note: This _must_ be consistent with the scheme for writing
-		 * change_info, so any changes made here must be reflected there
-		 * as well.  (See xdr4.h:set_change_info() and the WRITECINFO()
-		 * macro above.)
-		 */
 		if ((buflen -= 8) < 0)
 			goto out_resource;
-		WRITE32(stat.ctime.tv_sec);
-		WRITE32(stat.ctime.tv_nsec);
+		write_change(&p, &stat, dentry->d_inode);
 	}
 	if (bmval0 & FATTR4_WORD0_SIZE) {
 		if ((buflen -= 8) < 0)
@@ -2364,7 +2385,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 
 	if (!nfserr) {
 		RESERVE_SPACE(32);
-		WRITECINFO(create->cr_cinfo);
+		write_cinfo(&p, &create->cr_cinfo);
 		WRITE32(2);
 		WRITE32(create->cr_bmval[0]);
 		WRITE32(create->cr_bmval[1]);
@@ -2475,7 +2496,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
 
 	if (!nfserr) {
 		RESERVE_SPACE(20);
-		WRITECINFO(link->li_cinfo);
+		write_cinfo(&p, &link->li_cinfo);
 		ADJUST_ARGS();
 	}
 	return nfserr;
@@ -2493,7 +2514,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 
 	nfsd4_encode_stateid(resp, &open->op_stateid);
 	RESERVE_SPACE(40);
-	WRITECINFO(open->op_cinfo);
+	write_cinfo(&p, &open->op_cinfo);
 	WRITE32(open->op_rflags);
 	WRITE32(2);
 	WRITE32(open->op_bmval[0]);
@@ -2771,7 +2792,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 
 	if (!nfserr) {
 		RESERVE_SPACE(20);
-		WRITECINFO(remove->rm_cinfo);
+		write_cinfo(&p, &remove->rm_cinfo);
 		ADJUST_ARGS();
 	}
 	return nfserr;
@@ -2784,8 +2805,8 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 
 	if (!nfserr) {
 		RESERVE_SPACE(40);
-		WRITECINFO(rename->rn_sinfo);
-		WRITECINFO(rename->rn_tinfo);
+		write_cinfo(&p, &rename->rn_sinfo);
+		write_cinfo(&p, &rename->rn_tinfo);
 		ADJUST_ARGS();
 	}
 	return nfserr;
diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h
index afa19016c4a8..8f641c908450 100644
--- a/include/linux/nfsd/nfsfh.h
+++ b/include/linux/nfsd/nfsfh.h
@@ -151,9 +151,15 @@ typedef struct svc_fh {
 	__u64			fh_pre_size;	/* size before operation */
 	struct timespec		fh_pre_mtime;	/* mtime before oper */
 	struct timespec		fh_pre_ctime;	/* ctime before oper */
+	/*
+	 * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode)
+	 *  to find out if it is valid.
+	 */
+	u64			fh_pre_change;
 
 	/* Post-op attributes saved in fh_unlock */
 	struct kstat		fh_post_attr;	/* full attrs after operation */
+	u64			fh_post_change; /* nfsv4 change; see above */
 #endif /* CONFIG_NFSD_V3 */
 
 } svc_fh;
@@ -298,6 +304,7 @@ fill_pre_wcc(struct svc_fh *fhp)
 		fhp->fh_pre_mtime = inode->i_mtime;
 		fhp->fh_pre_ctime = inode->i_ctime;
 		fhp->fh_pre_size  = inode->i_size;
+		fhp->fh_pre_change = inode->i_version;
 		fhp->fh_pre_saved = 1;
 	}
 }
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index f80d6013fdc3..d0f050f01eca 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -64,10 +64,13 @@ static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
 
 struct nfsd4_change_info {
 	u32		atomic;
+	bool		change_supported;
 	u32		before_ctime_sec;
 	u32		before_ctime_nsec;
+	u64		before_change;
 	u32		after_ctime_sec;
 	u32		after_ctime_nsec;
+	u64		after_change;
 };
 
 struct nfsd4_access {
@@ -503,10 +506,16 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
 {
 	BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
 	cinfo->atomic = 1;
-	cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
-	cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
-	cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
-	cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+	cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
+	if (cinfo->change_supported) {
+		cinfo->before_change = fhp->fh_pre_change;
+		cinfo->after_change = fhp->fh_post_change;
+	} else {
+		cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+		cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+		cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+		cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+	}
 }
 
 int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
-- 
cgit v1.2.3-71-gd317


From 3cef9ab266a932899e756f7e1ea7a988a97bf3b2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 23 Feb 2009 21:42:10 -0800
Subject: nfsd4: lookup up callback cred only once

Look up the callback cred once and then use it for all subsequent
callbacks.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 26 ++++++++++++++++++++++++++
 fs/nfsd/nfs4state.c        |  4 ++++
 include/linux/nfsd/state.h |  1 +
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 711c6282151f..cc10ed35ac81 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -415,6 +415,22 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
 		(int)clp->cl_name.len, clp->cl_name.data, reason);
 }
 
+static struct rpc_cred *lookup_cb_cred(struct nfs4_callback *cb)
+{
+	struct auth_cred acred = {
+		.machine_cred = 1
+	};
+
+	/*
+	 * Note in the gss case this doesn't actually have to wait for a
+	 * gss upcall (or any calls to the client); this just creates a
+	 * non-uptodate cred which the rpc state machine will fill in with
+	 * a refresh_upcall later.
+	 */
+	return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
+							RPCAUTH_LOOKUP_NEW);
+}
+
 static int do_probe_callback(void *data)
 {
 	struct nfs4_client *clp = data;
@@ -423,9 +439,18 @@ static int do_probe_callback(void *data)
 		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
 		.rpc_argp       = clp,
 	};
+	struct rpc_cred *cred;
 	int status;
 
+	cred = lookup_cb_cred(cb);
+	if (IS_ERR(cred)) {
+		status = PTR_ERR(cred);
+		goto out;
+	}
+	cb->cb_cred = cred;
+	msg.rpc_cred = cb->cb_cred;
 	status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT);
+out:
 	if (status)
 		warn_no_callback_path(clp, status);
 	else
@@ -475,6 +500,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
 		.rpc_argp = cbr,
+		.rpc_cred = clp->cl_callback.cb_cred
 	};
 	int retries = 1;
 	int status = 0;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7e1fcc3aade4..b205c7d7bc6a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -643,6 +643,10 @@ shutdown_callback_client(struct nfs4_client *clp)
 		clp->cl_callback.cb_client = NULL;
 		rpc_shutdown_client(clnt);
 	}
+	if (clp->cl_callback.cb_cred) {
+		put_rpccred(clp->cl_callback.cb_cred);
+		clp->cl_callback.cb_cred = NULL;
+	}
 }
 
 static inline void
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 4d61c873feed..8d882a3eb4b9 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -97,6 +97,7 @@ struct nfs4_callback {
 	/* RPC client info */
 	atomic_t		cb_set;     /* successful CB_NULL call */
 	struct rpc_clnt *       cb_client;
+	struct rpc_cred	*	cb_cred;
 };
 
 /* Maximum number of slots per session. 128 is useful for long haul TCP */
-- 
cgit v1.2.3-71-gd317


From c237dc0303bcf6f4cc2e0efe4fe4e341c6f34dac Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 29 Apr 2009 19:09:19 -0400
Subject: nfsd4: rename callback struct to cb_conn

I want to use the name for a struct that actually does represent a
single callback.

(Actually, I've never been sure it helps to have a separate struct for the
callback information.  Some day maybe those fields could just be dumped
into struct nfs4_client.  I don't know.)

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 16 ++++++++--------
 fs/nfsd/nfs4state.c        | 22 +++++++++++-----------
 include/linux/nfsd/state.h |  4 ++--
 3 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0aaf68beedbd..ed860d7ddd19 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -369,7 +369,7 @@ static int max_cb_time(void)
 int setup_callback_client(struct nfs4_client *clp)
 {
 	struct sockaddr_in	addr;
-	struct nfs4_callback    *cb = &clp->cl_callback;
+	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_timeout	timeparms = {
 		.to_initval	= max_cb_time(),
 		.to_retries	= 0,
@@ -422,7 +422,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 	if (task->tk_status)
 		warn_no_callback_path(clp, task->tk_status);
 	else
-		atomic_set(&clp->cl_callback.cb_set, 1);
+		atomic_set(&clp->cl_cb_conn.cb_set, 1);
 	put_nfs4_client(clp);
 }
 
@@ -430,7 +430,7 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
 	.rpc_call_done = nfsd4_cb_probe_done,
 };
 
-static struct rpc_cred *lookup_cb_cred(struct nfs4_callback *cb)
+static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb)
 {
 	struct auth_cred acred = {
 		.machine_cred = 1
@@ -448,7 +448,7 @@ static struct rpc_cred *lookup_cb_cred(struct nfs4_callback *cb)
 
 void do_probe_callback(struct nfs4_client *clp)
 {
-	struct nfs4_callback    *cb = &clp->cl_callback;
+	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
 		.rpc_argp       = clp,
@@ -480,7 +480,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 {
 	int status;
 
-	BUG_ON(atomic_read(&clp->cl_callback.cb_set));
+	BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
 
 	status = setup_callback_client(clp);
 	if (status) {
@@ -501,12 +501,12 @@ void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_client;
-	struct rpc_clnt *clnt = clp->cl_callback.cb_client;
+	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
 	struct nfs4_cb_recall *cbr = &dp->dl_recall;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
 		.rpc_argp = cbr,
-		.rpc_cred = clp->cl_callback.cb_cred
+		.rpc_cred = clp->cl_cb_conn.cb_cred
 	};
 	int retries = 1;
 	int status = 0;
@@ -519,7 +519,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 		switch (status) {
 			case -EIO:
 				/* Network partition? */
-				atomic_set(&clp->cl_callback.cb_set, 0);
+				atomic_set(&clp->cl_cb_conn.cb_set, 0);
 			case -EBADHANDLE:
 			case -NFS4ERR_BAD_STATEID:
 				/* Race: client probably got cb_recall
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b205c7d7bc6a..d7b5e6b89207 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -182,7 +182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_file *fp = stp->st_file;
-	struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
+	struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
 
 	dprintk("NFSD alloc_init_deleg\n");
 	if (fp->fi_had_conflict)
@@ -633,19 +633,19 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static void
 shutdown_callback_client(struct nfs4_client *clp)
 {
-	struct rpc_clnt *clnt = clp->cl_callback.cb_client;
+	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
 
 	if (clnt) {
 		/*
 		 * Callback threads take a reference on the client, so there
 		 * should be no outstanding callbacks at this point.
 		 */
-		clp->cl_callback.cb_client = NULL;
+		clp->cl_cb_conn.cb_client = NULL;
 		rpc_shutdown_client(clnt);
 	}
-	if (clp->cl_callback.cb_cred) {
-		put_rpccred(clp->cl_callback.cb_cred);
-		clp->cl_callback.cb_cred = NULL;
+	if (clp->cl_cb_conn.cb_cred) {
+		put_rpccred(clp->cl_cb_conn.cb_cred);
+		clp->cl_cb_conn.cb_cred = NULL;
 	}
 }
 
@@ -719,7 +719,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 		return NULL;
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_count, 1);
-	atomic_set(&clp->cl_callback.cb_set, 0);
+	atomic_set(&clp->cl_cb_conn.cb_set, 0);
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
@@ -971,7 +971,7 @@ parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigne
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 {
-	struct nfs4_callback *cb = &clp->cl_callback;
+	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 
 	/* Currently, we only support tcp for the callback channel */
 	if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
@@ -1691,7 +1691,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		else {
 			/* XXX: We just turn off callbacks until we can handle
 			  * change request correctly. */
-			atomic_set(&conf->cl_callback.cb_set, 0);
+			atomic_set(&conf->cl_cb_conn.cb_set, 0);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -2425,7 +2425,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	struct nfs4_callback *cb = &sop->so_client->cl_callback;
+	struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
 	struct file_lock fl, *flp = &fl;
 	int status, flag = 0;
 
@@ -2617,7 +2617,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
-			&& !atomic_read(&clp->cl_callback.cb_set))
+			&& !atomic_read(&clp->cl_cb_conn.cb_set))
 		goto out;
 	status = nfs_ok;
 out:
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 8d882a3eb4b9..563c367a3013 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -88,7 +88,7 @@ struct nfs4_delegation {
 #define dl_fh           dl_recall.cbr_fh
 
 /* client delegation callback info */
-struct nfs4_callback {
+struct nfs4_cb_conn {
 	/* SETCLIENTID info */
 	u32                     cb_addr;
 	unsigned short          cb_port;
@@ -186,7 +186,7 @@ struct nfs4_client {
 	struct svc_cred		cl_cred; 	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
-	struct nfs4_callback	cl_callback;    /* callback info */
+	struct nfs4_cb_conn	cl_cb_conn;     /* callback info */
 	atomic_t		cl_count;	/* ref count */
 	u32			cl_firststate;	/* recovery dir creation */
 
-- 
cgit v1.2.3-71-gd317


From b53d40c5070bffde1b2bcaf848412a50d8894794 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 1 May 2009 19:50:00 -0400
Subject: nfsd4: eliminate struct nfs4_cb_recall

The nfs4_cb_recall struct is used only in nfs4_delegation, so its
pointer to the containing delegation is unnecessary--we could just use
container_of().

But there's no real reason to have this as a separate struct at all--just
move these fields to nfs4_delegation.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 24 +++++++++++-------------
 fs/nfsd/nfs4state.c        |  5 ++---
 include/linux/nfsd/state.h | 18 +++++-------------
 3 files changed, 18 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index ed860d7ddd19..2509305f6f53 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -215,18 +215,18 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 }
 
 static int
-encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
+encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp)
 {
 	__be32 *p;
-	int len = cb_rec->cbr_fh.fh_size;
+	int len = dp->dl_fh.fh_size;
 
-	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
+	RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len);
 	WRITE32(OP_CB_RECALL);
-	WRITE32(cb_rec->cbr_stateid.si_generation);
-	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
-	WRITE32(cb_rec->cbr_trunc);
+	WRITE32(dp->dl_stateid.si_generation);
+	WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t));
+	WRITE32(dp->dl_trunc);
 	WRITE32(len);
-	WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
+	WRITEMEM(&dp->dl_fh.fh_base, len);
 	return 0;
 }
 
@@ -241,11 +241,11 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 }
 
 static int
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args)
+nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr = {
-		.ident = args->cbr_ident,
+		.ident = args->dl_ident,
 		.nops   = 1,
 	};
 
@@ -502,17 +502,15 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_client;
 	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-	struct nfs4_cb_recall *cbr = &dp->dl_recall;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
-		.rpc_argp = cbr,
+		.rpc_argp = dp,
 		.rpc_cred = clp->cl_cb_conn.cb_cred
 	};
 	int retries = 1;
 	int status = 0;
 
-	cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
-	cbr->cbr_dp = dp;
+	dp->dl_trunc = 0; /* XXX need to implement truncate optimization */
 
 	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
 	while (retries--) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d7b5e6b89207..3e5345e01b13 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -203,9 +203,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	get_file(stp->st_vfs_file);
 	dp->dl_vfs_file = stp->st_vfs_file;
 	dp->dl_type = type;
-	dp->dl_recall.cbr_dp = NULL;
-	dp->dl_recall.cbr_ident = cb->cb_ident;
-	dp->dl_recall.cbr_trunc = 0;
+	dp->dl_ident = cb->cb_ident;
+	dp->dl_trunc = 0;
 	dp->dl_stateid.si_boot = get_seconds();
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 563c367a3013..233b60d39b84 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -61,15 +61,6 @@ typedef struct {
 #define si_stateownerid   si_opaque.so_stateownerid
 #define si_fileid         si_opaque.so_fileid
 
-
-struct nfs4_cb_recall {
-	u32			cbr_ident;
-	int			cbr_trunc;
-	stateid_t		cbr_stateid;
-	struct knfsd_fh		cbr_fh;
-	struct nfs4_delegation	*cbr_dp;
-};
-
 struct nfs4_delegation {
 	struct list_head	dl_perfile;
 	struct list_head	dl_perclnt;
@@ -81,12 +72,13 @@ struct nfs4_delegation {
 	struct file		*dl_vfs_file;
 	u32			dl_type;
 	time_t			dl_time;
-	struct nfs4_cb_recall	dl_recall;
+/* For recall: */
+	u32			dl_ident;
+	int			dl_trunc;
+	stateid_t		dl_stateid;
+	struct knfsd_fh		dl_fh;
 };
 
-#define dl_stateid      dl_recall.cbr_stateid
-#define dl_fh           dl_recall.cbr_fh
-
 /* client delegation callback info */
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
-- 
cgit v1.2.3-71-gd317


From 6707bd3d420f53ae8f090dac871843f6f43c9980 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 1 May 2009 19:57:46 -0400
Subject: nfsd4: remove unused dl_trunc

There's no point in keeping this field around--it's always zero.

(Background: the protocol allows you to tell the client that the file is
about to be truncated, as an optimization to save the client from
writing back dirty pages that will just be discarded.  We don't
implement this hint.  If we do some day, adding this field back in will
be the least of the work involved.)

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 4 +---
 fs/nfsd/nfs4state.c        | 1 -
 include/linux/nfsd/state.h | 1 -
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 2509305f6f53..0420b5e6e20d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -224,7 +224,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp)
 	WRITE32(OP_CB_RECALL);
 	WRITE32(dp->dl_stateid.si_generation);
 	WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t));
-	WRITE32(dp->dl_trunc);
+	WRITE32(0); /* truncate optimization not implemented */
 	WRITE32(len);
 	WRITEMEM(&dp->dl_fh.fh_base, len);
 	return 0;
@@ -510,8 +510,6 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 	int retries = 1;
 	int status = 0;
 
-	dp->dl_trunc = 0; /* XXX need to implement truncate optimization */
-
 	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
 	while (retries--) {
 		switch (status) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3e5345e01b13..cbb16e191d5b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -204,7 +204,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_vfs_file = stp->st_vfs_file;
 	dp->dl_type = type;
 	dp->dl_ident = cb->cb_ident;
-	dp->dl_trunc = 0;
 	dp->dl_stateid.si_boot = get_seconds();
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 233b60d39b84..346b603072ce 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -74,7 +74,6 @@ struct nfs4_delegation {
 	time_t			dl_time;
 /* For recall: */
 	u32			dl_ident;
-	int			dl_trunc;
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 };
-- 
cgit v1.2.3-71-gd317


From 3aea09dc9106407d8bc18e593fbffda9ad632844 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 1 May 2009 20:11:12 -0400
Subject: nfsd4: track recall retries in nfs4_delegation

Move this out of a local variable into the nfs4_delegation object in
preparation for making this an async rpc call (at which point we'll need
any state like this in a common object that's preserved across function
calls).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 4 ++--
 include/linux/nfsd/state.h | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0420b5e6e20d..b88b207d75d9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -507,11 +507,11 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 		.rpc_argp = dp,
 		.rpc_cred = clp->cl_cb_conn.cb_cred
 	};
-	int retries = 1;
 	int status = 0;
 
+	dp->dl_retries = 1;
 	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
-	while (retries--) {
+	while (dp->dl_retries--) {
 		switch (status) {
 			case -EIO:
 				/* Network partition? */
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 346b603072ce..c0c49215ddc5 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -76,6 +76,7 @@ struct nfs4_delegation {
 	u32			dl_ident;
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
+	int			dl_retries;
 };
 
 /* client delegation callback info */
-- 
cgit v1.2.3-71-gd317


From 4c25a2c1b90bf785fc2e2f0f0c74a80b3e070d39 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 10 May 2009 17:16:06 +0100
Subject: intel-iommu: Clean up handling of "caching mode" vs. context
 flushing.

It really doesn't make a lot of sense to have some of the logic to
handle caching vs. non-caching mode duplicated in qi_flush_context() and
__iommu_flush_context(), while the return value indicates whether the
caller should take other action which depends on the same thing.

Especially since qi_flush_context() thought it was returning something
entirely different anyway.

This patch makes qi_flush_context() and __iommu_flush_context() both
return void, removes the 'non_present_entry_flush' argument and makes
the only call site which _set_ that argument to 1 do the right thing.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/dmar.c          | 13 +++---------
 drivers/pci/intel-iommu.c   | 52 ++++++++++++++++++---------------------------
 include/linux/intel-iommu.h |  8 +++----
 3 files changed, 28 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index d3d86b749eee..10a071ba3232 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -723,23 +723,16 @@ void qi_global_iec(struct intel_iommu *iommu)
 	qi_submit_sync(&desc, iommu);
 }
 
-int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
-		     u64 type, int non_present_entry_flush)
+void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+		      u64 type)
 {
 	struct qi_desc desc;
 
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
 			| QI_CC_GRAN(type) | QI_CC_TYPE;
 	desc.high = 0;
 
-	return qi_submit_sync(&desc, iommu);
+	qi_submit_sync(&desc, iommu);
 }
 
 int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index d6f4ee50924c..9f5d9151edc9 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -857,26 +857,13 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 }
 
 /* return value determine if we need a write buffer flush */
-static int __iommu_flush_context(struct intel_iommu *iommu,
-	u16 did, u16 source_id, u8 function_mask, u64 type,
-	int non_present_entry_flush)
+static void __iommu_flush_context(struct intel_iommu *iommu,
+				  u16 did, u16 source_id, u8 function_mask,
+				  u64 type)
 {
 	u64 val = 0;
 	unsigned long flag;
 
-	/*
-	 * In the non-present entry flush case, if hardware doesn't cache
-	 * non-present entry we do nothing and if hardware cache non-present
-	 * entry, we flush entries of domain 0 (the domain id is used to cache
-	 * any non-present entries)
-	 */
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	switch (type) {
 	case DMA_CCMD_GLOBAL_INVL:
 		val = DMA_CCMD_GLOBAL_INVL;
@@ -901,9 +888,6 @@ static int __iommu_flush_context(struct intel_iommu *iommu,
 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
 
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
-
-	/* flush context entry will implicitly flush write buffer */
-	return 0;
 }
 
 /* return value determine if we need a write buffer flush */
@@ -1428,14 +1412,21 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 	context_set_present(context);
 	domain_flush_cache(domain, context, sizeof(*context));
 
-	/* it's a non-present to present mapping */
-	if (iommu->flush.flush_context(iommu, id,
-		(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
-		DMA_CCMD_DEVICE_INVL, 1))
-		iommu_flush_write_buffer(iommu);
-	else
+	/*
+	 * It's a non-present to present mapping. If hardware doesn't cache
+	 * non-present entry we only need to flush the write-buffer. If it
+	 * _does_ cache non-present entries, then it does so in the special
+	 * domain #0, which we have to flush:
+	 */
+	if (cap_caching_mode(iommu->cap)) {
+		iommu->flush.flush_context(iommu, 0,
+					   (((u16)bus) << 8) | devfn,
+					   DMA_CCMD_MASK_NOBIT,
+					   DMA_CCMD_DEVICE_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
-
+	} else {
+		iommu_flush_write_buffer(iommu);
+	}
 	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	spin_lock_irqsave(&domain->iommu_lock, flags);
@@ -1566,7 +1557,7 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
 
 	clear_context_table(iommu, bus, devfn);
 	iommu->flush.flush_context(iommu, 0, 0, 0,
-					   DMA_CCMD_GLOBAL_INVL, 0);
+					   DMA_CCMD_GLOBAL_INVL);
 	iommu->flush.flush_iotlb(iommu, 0, 0, 0,
 					 DMA_TLB_GLOBAL_FLUSH, 0);
 }
@@ -2104,8 +2095,7 @@ static int __init init_dmars(void)
 
 		iommu_set_root_entry(iommu);
 
-		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
-					   0);
+		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
 					 0);
 		iommu_disable_protect_mem_regions(iommu);
@@ -2721,7 +2711,7 @@ static int init_iommu_hw(void)
 		iommu_set_root_entry(iommu);
 
 		iommu->flush.flush_context(iommu, 0, 0, 0,
-						DMA_CCMD_GLOBAL_INVL, 0);
+						DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
 						DMA_TLB_GLOBAL_FLUSH, 0);
 		iommu_disable_protect_mem_regions(iommu);
@@ -2738,7 +2728,7 @@ static void iommu_flush_all(void)
 
 	for_each_active_iommu(iommu, drhd) {
 		iommu->flush.flush_context(iommu, 0, 0, 0,
-						DMA_CCMD_GLOBAL_INVL, 0);
+						DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
 						DMA_TLB_GLOBAL_FLUSH, 0);
 	}
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 7246971a7feb..f2b94dafbf38 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -281,8 +281,8 @@ struct ir_table {
 #endif
 
 struct iommu_flush {
-	int (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
-		u64 type, int non_present_entry_flush);
+	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
+			      u8 fm, u64 type);
 	int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
 		unsigned int size_order, u64 type, int non_present_entry_flush);
 };
@@ -339,8 +339,8 @@ extern void dmar_disable_qi(struct intel_iommu *iommu);
 extern int dmar_reenable_qi(struct intel_iommu *iommu);
 extern void qi_global_iec(struct intel_iommu *iommu);
 
-extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
-			        u8 fm, u64 type, int non_present_entry_flush);
+extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
+			     u8 fm, u64 type);
 extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 			  unsigned int size_order, u64 type,
 			  int non_present_entry_flush);
-- 
cgit v1.2.3-71-gd317


From 1f0ef2aa18802a8ce7eb5a5164aaaf4d59073801 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 10 May 2009 19:58:49 +0100
Subject: intel-iommu: Clean up handling of "caching mode" vs. IOTLB flushing.

As we just did for context cache flushing, clean up the logic around
whether we need to flush the iotlb or just the write-buffer, depending
on caching mode.

Fix the same bug in qi_flush_iotlb() that qi_flush_context() had -- it
isn't supposed to be returning an error; it's supposed to be returning a
flag which triggers a write-buffer flush.

Remove some superfluous conditional write-buffer flushes which could
never have happened because they weren't for non-present-to-present
mapping changes anyway.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/dmar.c          | 14 ++------
 drivers/pci/intel-iommu.c   | 78 +++++++++++++++++----------------------------
 include/linux/intel-iommu.h |  9 +++---
 3 files changed, 37 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 10a071ba3232..df6af0d4ec03 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -735,22 +735,14 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
 	qi_submit_sync(&desc, iommu);
 }
 
-int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
-		   unsigned int size_order, u64 type,
-		   int non_present_entry_flush)
+void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+		    unsigned int size_order, u64 type)
 {
 	u8 dw = 0, dr = 0;
 
 	struct qi_desc desc;
 	int ih = 0;
 
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	if (cap_write_drain(iommu->cap))
 		dw = 1;
 
@@ -762,7 +754,7 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
 		| QI_IOTLB_AM(size_order);
 
-	return qi_submit_sync(&desc, iommu);
+	qi_submit_sync(&desc, iommu);
 }
 
 /*
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 9f5d9151edc9..f47d04aced87 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -891,27 +891,13 @@ static void __iommu_flush_context(struct intel_iommu *iommu,
 }
 
 /* return value determine if we need a write buffer flush */
-static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
-	u64 addr, unsigned int size_order, u64 type,
-	int non_present_entry_flush)
+static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+				u64 addr, unsigned int size_order, u64 type)
 {
 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
 	u64 val = 0, val_iva = 0;
 	unsigned long flag;
 
-	/*
-	 * In the non-present entry flush case, if hardware doesn't cache
-	 * non-present entry we do nothing and if hardware cache non-present
-	 * entry, we flush entries of domain 0 (the domain id is used to cache
-	 * any non-present entries)
-	 */
-	if (non_present_entry_flush) {
-		if (!cap_caching_mode(iommu->cap))
-			return 1;
-		else
-			did = 0;
-	}
-
 	switch (type) {
 	case DMA_TLB_GLOBAL_FLUSH:
 		/* global flush doesn't need set IVA_REG */
@@ -959,12 +945,10 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
 			(unsigned long long)DMA_TLB_IIRG(type),
 			(unsigned long long)DMA_TLB_IAIG(val));
-	/* flush iotlb entry will implicitly flush write buffer */
-	return 0;
 }
 
-static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-	u64 addr, unsigned int pages, int non_present_entry_flush)
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
+				  u64 addr, unsigned int pages)
 {
 	unsigned int mask;
 
@@ -974,8 +958,7 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	/* Fallback to domain selective flush if no PSI support */
 	if (!cap_pgsel_inv(iommu->cap))
 		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
-						DMA_TLB_DSI_FLUSH,
-						non_present_entry_flush);
+						DMA_TLB_DSI_FLUSH);
 
 	/*
 	 * PSI requires page size to be 2 ^ x, and the base address is naturally
@@ -985,11 +968,10 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	/* Fallback to domain selective flush if size is too big */
 	if (mask > cap_max_amask_val(iommu->cap))
 		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
-			DMA_TLB_DSI_FLUSH, non_present_entry_flush);
+						DMA_TLB_DSI_FLUSH);
 
 	return iommu->flush.flush_iotlb(iommu, did, addr, mask,
-					DMA_TLB_PSI_FLUSH,
-					non_present_entry_flush);
+					DMA_TLB_PSI_FLUSH);
 }
 
 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1423,7 +1405,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 					   (((u16)bus) << 8) | devfn,
 					   DMA_CCMD_MASK_NOBIT,
 					   DMA_CCMD_DEVICE_INVL);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
 	} else {
 		iommu_flush_write_buffer(iommu);
 	}
@@ -1558,8 +1540,7 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
 	clear_context_table(iommu, bus, devfn);
 	iommu->flush.flush_context(iommu, 0, 0, 0,
 					   DMA_CCMD_GLOBAL_INVL);
-	iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-					 DMA_TLB_GLOBAL_FLUSH, 0);
+	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 }
 
 static void domain_remove_dev_info(struct dmar_domain *domain)
@@ -2096,8 +2077,7 @@ static int __init init_dmars(void)
 		iommu_set_root_entry(iommu);
 
 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
-					 0);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 		iommu_disable_protect_mem_regions(iommu);
 
 		ret = iommu_enable_translation(iommu);
@@ -2244,10 +2224,11 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 	if (ret)
 		goto error;
 
-	/* it's a non-present to present mapping */
-	ret = iommu_flush_iotlb_psi(iommu, domain->id,
-			start_paddr, size >> VTD_PAGE_SHIFT, 1);
-	if (ret)
+	/* it's a non-present to present mapping. Only flush if caching mode */
+	if (cap_caching_mode(iommu->cap))
+		iommu_flush_iotlb_psi(iommu, 0, start_paddr,
+				      size >> VTD_PAGE_SHIFT);
+	else
 		iommu_flush_write_buffer(iommu);
 
 	return start_paddr + ((u64)paddr & (~PAGE_MASK));
@@ -2283,7 +2264,7 @@ static void flush_unmaps(void)
 
 		if (deferred_flush[i].next) {
 			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-						 DMA_TLB_GLOBAL_FLUSH, 0);
+						 DMA_TLB_GLOBAL_FLUSH);
 			for (j = 0; j < deferred_flush[i].next; j++) {
 				__free_iova(&deferred_flush[i].domain[j]->iovad,
 						deferred_flush[i].iova[j]);
@@ -2362,9 +2343,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
 	if (intel_iommu_strict) {
-		if (iommu_flush_iotlb_psi(iommu,
-			domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
-			iommu_flush_write_buffer(iommu);
+		iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
+				      size >> VTD_PAGE_SHIFT);
 		/* free iova */
 		__free_iova(&domain->iovad, iova);
 	} else {
@@ -2455,9 +2435,8 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
 
-	if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
-			size >> VTD_PAGE_SHIFT, 0))
-		iommu_flush_write_buffer(iommu);
+	iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
+			      size >> VTD_PAGE_SHIFT);
 
 	/* free iova */
 	__free_iova(&domain->iovad, iova);
@@ -2549,10 +2528,13 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
 		offset += size;
 	}
 
-	/* it's a non-present to present mapping */
-	if (iommu_flush_iotlb_psi(iommu, domain->id,
-			start_addr, offset >> VTD_PAGE_SHIFT, 1))
+	/* it's a non-present to present mapping. Only flush if caching mode */
+	if (cap_caching_mode(iommu->cap))
+		iommu_flush_iotlb_psi(iommu, 0, start_addr,
+				      offset >> VTD_PAGE_SHIFT);
+	else
 		iommu_flush_write_buffer(iommu);
+
 	return nelems;
 }
 
@@ -2711,9 +2693,9 @@ static int init_iommu_hw(void)
 		iommu_set_root_entry(iommu);
 
 		iommu->flush.flush_context(iommu, 0, 0, 0,
-						DMA_CCMD_GLOBAL_INVL);
+					   DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-						DMA_TLB_GLOBAL_FLUSH, 0);
+					 DMA_TLB_GLOBAL_FLUSH);
 		iommu_disable_protect_mem_regions(iommu);
 		iommu_enable_translation(iommu);
 	}
@@ -2728,9 +2710,9 @@ static void iommu_flush_all(void)
 
 	for_each_active_iommu(iommu, drhd) {
 		iommu->flush.flush_context(iommu, 0, 0, 0,
-						DMA_CCMD_GLOBAL_INVL);
+					   DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-						DMA_TLB_GLOBAL_FLUSH, 0);
+					 DMA_TLB_GLOBAL_FLUSH);
 	}
 }
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index f2b94dafbf38..29e05a034c09 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -283,8 +283,8 @@ struct ir_table {
 struct iommu_flush {
 	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
 			      u8 fm, u64 type);
-	int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
-		unsigned int size_order, u64 type, int non_present_entry_flush);
+	void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
+			    unsigned int size_order, u64 type);
 };
 
 enum {
@@ -341,9 +341,8 @@ extern void qi_global_iec(struct intel_iommu *iommu);
 
 extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
 			     u8 fm, u64 type);
-extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
-			  unsigned int size_order, u64 type,
-			  int non_present_entry_flush);
+extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+			  unsigned int size_order, u64 type);
 
 extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-- 
cgit v1.2.3-71-gd317


From 302b4215daa0a704c843da40fd2529e5757a72da Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Mon, 18 May 2009 13:51:32 +0800
Subject: PCI: support the ATS capability

The PCIe ATS capability enables an Endpoint to request DMA address
translations from the IOMMU and cache them on the device side, thus
alleviating IOMMU pressure and improving hardware performance in
I/O virtualization environments.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/iov.c        | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h        |  37 +++++++++++++++++
 include/linux/pci.h      |   2 +
 include/linux/pci_regs.h |  10 +++++
 4 files changed, 154 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index b497daab3d4a..0a7a1b40286f 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -5,6 +5,7 @@
  *
  * PCI Express I/O Virtualization (IOV) support.
  *   Single Root IOV 1.0
+ *   Address Translation Service 1.0
  */
 
 #include <linux/pci.h>
@@ -679,3 +680,107 @@ irqreturn_t pci_sriov_migration(struct pci_dev *dev)
 	return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE;
 }
 EXPORT_SYMBOL_GPL(pci_sriov_migration);
+
+static int ats_alloc_one(struct pci_dev *dev, int ps)
+{
+	int pos;
+	u16 cap;
+	struct pci_ats *ats;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+	if (!pos)
+		return -ENODEV;
+
+	ats = kzalloc(sizeof(*ats), GFP_KERNEL);
+	if (!ats)
+		return -ENOMEM;
+
+	ats->pos = pos;
+	ats->stu = ps;
+	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
+	ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
+					    PCI_ATS_MAX_QDEP;
+	dev->ats = ats;
+
+	return 0;
+}
+
+static void ats_free_one(struct pci_dev *dev)
+{
+	kfree(dev->ats);
+	dev->ats = NULL;
+}
+
+/**
+ * pci_enable_ats - enable the ATS capability
+ * @dev: the PCI device
+ * @ps: the IOMMU page shift
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_enable_ats(struct pci_dev *dev, int ps)
+{
+	int rc;
+	u16 ctrl;
+
+	BUG_ON(dev->ats);
+
+	if (ps < PCI_ATS_MIN_STU)
+		return -EINVAL;
+
+	rc = ats_alloc_one(dev, ps);
+	if (rc)
+		return rc;
+
+	ctrl = PCI_ATS_CTRL_ENABLE;
+	ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU);
+	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
+
+	return 0;
+}
+
+/**
+ * pci_disable_ats - disable the ATS capability
+ * @dev: the PCI device
+ */
+void pci_disable_ats(struct pci_dev *dev)
+{
+	u16 ctrl;
+
+	BUG_ON(!dev->ats);
+
+	pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl);
+	ctrl &= ~PCI_ATS_CTRL_ENABLE;
+	pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
+
+	ats_free_one(dev);
+}
+
+/**
+ * pci_ats_queue_depth - query the ATS Invalidate Queue Depth
+ * @dev: the PCI device
+ *
+ * Returns the queue depth on success, or negative on failure.
+ *
+ * The ATS spec uses 0 in the Invalidate Queue Depth field to
+ * indicate that the function can accept 32 Invalidate Requests.
+ * But here we use the `real' values (i.e. 1~32) for the Queue
+ * Depth.
+ */
+int pci_ats_queue_depth(struct pci_dev *dev)
+{
+	int pos;
+	u16 cap;
+
+	if (dev->ats)
+		return dev->ats->qdep;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+	if (!pos)
+		return -ENODEV;
+
+	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
+
+	return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
+				       PCI_ATS_MAX_QDEP;
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d03f6b99f292..3c2ec64f78e9 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -229,6 +229,13 @@ struct pci_sriov {
 	u8 __iomem *mstate;	/* VF Migration State Array */
 };
 
+/* Address Translation Service */
+struct pci_ats {
+	int pos;	/* capability position */
+	int stu;	/* Smallest Translation Unit */
+	int qdep;	/* Invalidate Queue Depth */
+};
+
 #ifdef CONFIG_PCI_IOV
 extern int pci_iov_init(struct pci_dev *dev);
 extern void pci_iov_release(struct pci_dev *dev);
@@ -236,6 +243,20 @@ extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
 				enum pci_bar_type *type);
 extern void pci_restore_iov_state(struct pci_dev *dev);
 extern int pci_iov_bus_range(struct pci_bus *bus);
+
+extern int pci_enable_ats(struct pci_dev *dev, int ps);
+extern void pci_disable_ats(struct pci_dev *dev);
+extern int pci_ats_queue_depth(struct pci_dev *dev);
+/**
+ * pci_ats_enabled - query the ATS status
+ * @dev: the PCI device
+ *
+ * Returns 1 if ATS capability is enabled, or 0 if not.
+ */
+static inline int pci_ats_enabled(struct pci_dev *dev)
+{
+	return !!dev->ats;
+}
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
 {
@@ -257,6 +278,22 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
 {
 	return 0;
 }
+
+static inline int pci_enable_ats(struct pci_dev *dev, int ps)
+{
+	return -ENODEV;
+}
+static inline void pci_disable_ats(struct pci_dev *dev)
+{
+}
+static inline int pci_ats_queue_depth(struct pci_dev *dev)
+{
+	return -ENODEV;
+}
+static inline int pci_ats_enabled(struct pci_dev *dev)
+{
+	return 0;
+}
 #endif /* CONFIG_PCI_IOV */
 
 #endif /* DRIVERS_PCI_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 72698d89e767..bd3e4a798c43 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -188,6 +188,7 @@ struct pci_cap_saved_state {
 struct pcie_link_state;
 struct pci_vpd;
 struct pci_sriov;
+struct pci_ats;
 
 /*
  * The pci_dev structure is used to describe PCI devices.
@@ -285,6 +286,7 @@ struct pci_dev {
 		struct pci_sriov *sriov;	/* SR-IOV capability related */
 		struct pci_dev *physfn;	/* the PF this VF is associated with */
 	};
+	struct pci_ats	*ats;	/* Address Translation Service */
 #endif
 };
 
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index e4d08c1b2e0b..c03189c56c7a 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -501,6 +501,7 @@
 #define PCI_EXT_CAP_ID_DSN	3
 #define PCI_EXT_CAP_ID_PWR	4
 #define PCI_EXT_CAP_ID_ARI	14
+#define PCI_EXT_CAP_ID_ATS	15
 #define PCI_EXT_CAP_ID_SRIOV	16
 
 /* Advanced Error Reporting */
@@ -619,6 +620,15 @@
 #define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
 #define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
 
+/* Address Translation Service */
+#define PCI_ATS_CAP		0x04	/* ATS Capability Register */
+#define  PCI_ATS_CAP_QDEP(x)	((x) & 0x1f)	/* Invalidate Queue Depth */
+#define  PCI_ATS_MAX_QDEP	32	/* Max Invalidate Queue Depth */
+#define PCI_ATS_CTRL		0x06	/* ATS Control Register */
+#define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
+#define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f)	/* Smallest Translation Unit */
+#define  PCI_ATS_MIN_STU	12	/* shift of minimum STU block */
+
 /* Single Root I/O Virtualization */
 #define PCI_SRIOV_CAP		0x04	/* SR-IOV Capabilities */
 #define  PCI_SRIOV_CAP_VFM	0x01	/* VF Migration Capable */
-- 
cgit v1.2.3-71-gd317


From aa5d2b515b6fca5f8a56eac84f7fa0a68c1ce9b7 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Mon, 18 May 2009 13:51:34 +0800
Subject: VT-d: parse ATSR in DMA Remapping Reporting Structure

Parse the Root Port ATS Capability Reporting Structure in the DMA
Remapping Reporting Structure ACPI table.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/dmar.c          | 112 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/dmar.h        |   9 ++++
 include/linux/intel-iommu.h |   1 +
 3 files changed, 116 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index f23460a5d106..6d7f9619b8a9 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -267,6 +267,84 @@ rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
 	}
 	return ret;
 }
+
+static LIST_HEAD(dmar_atsr_units);
+
+static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
+{
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
+	if (!atsru)
+		return -ENOMEM;
+
+	atsru->hdr = hdr;
+	atsru->include_all = atsr->flags & 0x1;
+
+	list_add(&atsru->list, &dmar_atsr_units);
+
+	return 0;
+}
+
+static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
+{
+	int rc;
+	struct acpi_dmar_atsr *atsr;
+
+	if (atsru->include_all)
+		return 0;
+
+	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+	rc = dmar_parse_dev_scope((void *)(atsr + 1),
+				(void *)atsr + atsr->header.length,
+				&atsru->devices_cnt, &atsru->devices,
+				atsr->segment);
+	if (rc || !atsru->devices_cnt) {
+		list_del(&atsru->list);
+		kfree(atsru);
+	}
+
+	return rc;
+}
+
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+	int i;
+	struct pci_bus *bus;
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	list_for_each_entry(atsru, &dmar_atsr_units, list) {
+		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+		if (atsr->segment == pci_domain_nr(dev->bus))
+			goto found;
+	}
+
+	return 0;
+
+found:
+	for (bus = dev->bus; bus; bus = bus->parent) {
+		struct pci_dev *bridge = bus->self;
+
+		if (!bridge || !bridge->is_pcie ||
+		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+			return 0;
+
+		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
+			for (i = 0; i < atsru->devices_cnt; i++)
+				if (atsru->devices[i] == bridge)
+					return 1;
+			break;
+		}
+	}
+
+	if (atsru->include_all)
+		return 1;
+
+	return 0;
+}
 #endif
 
 static void __init
@@ -274,22 +352,28 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
 {
 	struct acpi_dmar_hardware_unit *drhd;
 	struct acpi_dmar_reserved_memory *rmrr;
+	struct acpi_dmar_atsr *atsr;
 
 	switch (header->type) {
 	case ACPI_DMAR_TYPE_HARDWARE_UNIT:
-		drhd = (struct acpi_dmar_hardware_unit *)header;
+		drhd = container_of(header, struct acpi_dmar_hardware_unit,
+				    header);
 		printk (KERN_INFO PREFIX
-			"DRHD (flags: 0x%08x)base: 0x%016Lx\n",
-			drhd->flags, (unsigned long long)drhd->address);
+			"DRHD base: %#016Lx flags: %#x\n",
+			(unsigned long long)drhd->address, drhd->flags);
 		break;
 	case ACPI_DMAR_TYPE_RESERVED_MEMORY:
-		rmrr = (struct acpi_dmar_reserved_memory *)header;
-
+		rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+				    header);
 		printk (KERN_INFO PREFIX
-			"RMRR base: 0x%016Lx end: 0x%016Lx\n",
+			"RMRR base: %#016Lx end: %#016Lx\n",
 			(unsigned long long)rmrr->base_address,
 			(unsigned long long)rmrr->end_address);
 		break;
+	case ACPI_DMAR_TYPE_ATSR:
+		atsr = container_of(header, struct acpi_dmar_atsr, header);
+		printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags);
+		break;
 	}
 }
 
@@ -361,6 +445,11 @@ parse_dmar_table(void)
 		case ACPI_DMAR_TYPE_RESERVED_MEMORY:
 #ifdef CONFIG_DMAR
 			ret = dmar_parse_one_rmrr(entry_header);
+#endif
+			break;
+		case ACPI_DMAR_TYPE_ATSR:
+#ifdef CONFIG_DMAR
+			ret = dmar_parse_one_atsr(entry_header);
 #endif
 			break;
 		default:
@@ -431,11 +520,19 @@ int __init dmar_dev_scope_init(void)
 #ifdef CONFIG_DMAR
 	{
 		struct dmar_rmrr_unit *rmrr, *rmrr_n;
+		struct dmar_atsr_unit *atsr, *atsr_n;
+
 		list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
 			ret = rmrr_parse_dev(rmrr);
 			if (ret)
 				return ret;
 		}
+
+		list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
+			ret = atsr_parse_dev(atsr);
+			if (ret)
+				return ret;
+		}
 	}
 #endif
 
@@ -468,6 +565,9 @@ int __init dmar_table_init(void)
 #ifdef CONFIG_DMAR
 	if (list_empty(&dmar_rmrr_units))
 		printk(KERN_INFO PREFIX "No RMRR found\n");
+
+	if (list_empty(&dmar_atsr_units))
+		printk(KERN_INFO PREFIX "No ATSR found\n");
 #endif
 
 #ifdef CONFIG_INTR_REMAP
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e397dc342cda..7c9a207e5da6 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -185,6 +185,15 @@ struct dmar_rmrr_unit {
 
 #define for_each_rmrr_units(rmrr) \
 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
+
+struct dmar_atsr_unit {
+	struct list_head list;		/* list of ATSR units */
+	struct acpi_dmar_header *hdr;	/* ACPI header */
+	struct pci_dev **devices;	/* target devices */
+	int devices_cnt;		/* target device count */
+	u8 include_all:1;		/* include all ports */
+};
+
 /* Intel DMAR  initialization functions */
 extern int intel_iommu_init(void);
 #else
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 29e05a034c09..0a1939f200fc 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -331,6 +331,7 @@ static inline void __iommu_flush_cache(
 }
 
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+extern int dmar_find_matched_atsr_unit(struct pci_dev *dev);
 
 extern int alloc_iommu(struct dmar_drhd_unit *drhd);
 extern void free_iommu(struct intel_iommu *iommu);
-- 
cgit v1.2.3-71-gd317


From 6ba6c3a4cacfd68bf970e3e04e2ff0d66fa0f695 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Mon, 18 May 2009 13:51:35 +0800
Subject: VT-d: add device IOTLB invalidation support

Support device IOTLB invalidation to flush the translation cached
in the Endpoint.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/dmar.c          | 77 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/intel-iommu.h | 14 ++++++++-
 2 files changed, 82 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 6d7f9619b8a9..7b287cb38b7a 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -699,7 +699,8 @@ void free_iommu(struct intel_iommu *iommu)
  */
 static inline void reclaim_free_desc(struct q_inval *qi)
 {
-	while (qi->desc_status[qi->free_tail] == QI_DONE) {
+	while (qi->desc_status[qi->free_tail] == QI_DONE ||
+	       qi->desc_status[qi->free_tail] == QI_ABORT) {
 		qi->desc_status[qi->free_tail] = QI_FREE;
 		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
 		qi->free_cnt++;
@@ -709,10 +710,13 @@ static inline void reclaim_free_desc(struct q_inval *qi)
 static int qi_check_fault(struct intel_iommu *iommu, int index)
 {
 	u32 fault;
-	int head;
+	int head, tail;
 	struct q_inval *qi = iommu->qi;
 	int wait_index = (index + 1) % QI_LENGTH;
 
+	if (qi->desc_status[wait_index] == QI_ABORT)
+		return -EAGAIN;
+
 	fault = readl(iommu->reg + DMAR_FSTS_REG);
 
 	/*
@@ -722,7 +726,11 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
 	 */
 	if (fault & DMA_FSTS_IQE) {
 		head = readl(iommu->reg + DMAR_IQH_REG);
-		if ((head >> 4) == index) {
+		if ((head >> DMAR_IQ_SHIFT) == index) {
+			printk(KERN_ERR "VT-d detected invalid descriptor: "
+				"low=%llx, high=%llx\n",
+				(unsigned long long)qi->desc[index].low,
+				(unsigned long long)qi->desc[index].high);
 			memcpy(&qi->desc[index], &qi->desc[wait_index],
 					sizeof(struct qi_desc));
 			__iommu_flush_cache(iommu, &qi->desc[index],
@@ -732,6 +740,32 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
 		}
 	}
 
+	/*
+	 * If ITE happens, all pending wait_desc commands are aborted.
+	 * No new descriptors are fetched until the ITE is cleared.
+	 */
+	if (fault & DMA_FSTS_ITE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+		head |= 1;
+		tail = readl(iommu->reg + DMAR_IQT_REG);
+		tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+
+		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+
+		do {
+			if (qi->desc_status[head] == QI_IN_USE)
+				qi->desc_status[head] = QI_ABORT;
+			head = (head - 2 + QI_LENGTH) % QI_LENGTH;
+		} while (head != tail);
+
+		if (qi->desc_status[wait_index] == QI_ABORT)
+			return -EAGAIN;
+	}
+
+	if (fault & DMA_FSTS_ICE)
+		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+
 	return 0;
 }
 
@@ -741,7 +775,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
  */
 int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 {
-	int rc = 0;
+	int rc;
 	struct q_inval *qi = iommu->qi;
 	struct qi_desc *hw, wait_desc;
 	int wait_index, index;
@@ -752,6 +786,9 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 
 	hw = qi->desc;
 
+restart:
+	rc = 0;
+
 	spin_lock_irqsave(&qi->q_lock, flags);
 	while (qi->free_cnt < 3) {
 		spin_unlock_irqrestore(&qi->q_lock, flags);
@@ -782,7 +819,7 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 	 * update the HW tail register indicating the presence of
 	 * new descriptors.
 	 */
-	writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG);
+	writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG);
 
 	while (qi->desc_status[wait_index] != QI_DONE) {
 		/*
@@ -794,18 +831,21 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 		 */
 		rc = qi_check_fault(iommu, index);
 		if (rc)
-			goto out;
+			break;
 
 		spin_unlock(&qi->q_lock);
 		cpu_relax();
 		spin_lock(&qi->q_lock);
 	}
-out:
-	qi->desc_status[index] = qi->desc_status[wait_index] = QI_DONE;
+
+	qi->desc_status[index] = QI_DONE;
 
 	reclaim_free_desc(qi);
 	spin_unlock_irqrestore(&qi->q_lock, flags);
 
+	if (rc == -EAGAIN)
+		goto restart;
+
 	return rc;
 }
 
@@ -857,6 +897,27 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 	qi_submit_sync(&desc, iommu);
 }
 
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
+			u64 addr, unsigned mask)
+{
+	struct qi_desc desc;
+
+	if (mask) {
+		BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
+		addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+	} else
+		desc.high = QI_DEV_IOTLB_ADDR(addr);
+
+	if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+		qdep = 0;
+
+	desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+		   QI_DIOTLB_TYPE;
+
+	qi_submit_sync(&desc, iommu);
+}
+
 /*
  * Disable Queued Invalidation interface.
  */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 0a1939f200fc..40561b224a17 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -53,6 +53,7 @@
 #define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
 #define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
 #define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
 #define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
 #define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
 #define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
@@ -198,6 +199,8 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define DMA_FSTS_PPF ((u32)2)
 #define DMA_FSTS_PFO ((u32)1)
 #define DMA_FSTS_IQE (1 << 4)
+#define DMA_FSTS_ICE (1 << 5)
+#define DMA_FSTS_ITE (1 << 6)
 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
 
 /* FRCD_REG, 32 bits access */
@@ -226,7 +229,8 @@ do {									\
 enum {
 	QI_FREE,
 	QI_IN_USE,
-	QI_DONE
+	QI_DONE,
+	QI_ABORT
 };
 
 #define QI_CC_TYPE		0x1
@@ -255,6 +259,12 @@ enum {
 #define QI_CC_DID(did)		(((u64)did) << 16)
 #define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
 
+#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
+#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
+#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_SIZE	1
+#define QI_DEV_IOTLB_MAX_INVS	32
+
 struct qi_desc {
 	u64 low, high;
 };
@@ -344,6 +354,8 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
 			     u8 fm, u64 type);
 extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 			  unsigned int size_order, u64 type);
+extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
+			       u64 addr, unsigned mask);
 
 extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-- 
cgit v1.2.3-71-gd317


From 93a23a7271dfb811b3adb72779054c3a24433112 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Mon, 18 May 2009 13:51:37 +0800
Subject: VT-d: support the device IOTLB

Enable the device IOTLB (i.e. ATS) for both the bare metal and KVM
environments.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 109 ++++++++++++++++++++++++++++++++++++++----
 include/linux/dma_remapping.h |   1 +
 include/linux/intel-iommu.h   |   1 +
 3 files changed, 102 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 6d7cb84c63ea..c3cdfc90c13a 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -252,6 +252,7 @@ struct device_domain_info {
 	u8 bus;			/* PCI bus number */
 	u8 devfn;		/* PCI devfn number */
 	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+	struct intel_iommu *iommu; /* IOMMU used by this device */
 	struct dmar_domain *domain; /* pointer to domain */
 };
 
@@ -945,6 +946,77 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 			(unsigned long long)DMA_TLB_IAIG(val));
 }
 
+static struct device_domain_info *iommu_support_dev_iotlb(
+	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
+{
+	int found = 0;
+	unsigned long flags;
+	struct device_domain_info *info;
+	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
+
+	if (!ecap_dev_iotlb_support(iommu->ecap))
+		return NULL;
+
+	if (!iommu->qi)
+		return NULL;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link)
+		if (info->bus == bus && info->devfn == devfn) {
+			found = 1;
+			break;
+		}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	if (!found || !info->dev)
+		return NULL;
+
+	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
+		return NULL;
+
+	if (!dmar_find_matched_atsr_unit(info->dev))
+		return NULL;
+
+	info->iommu = iommu;
+
+	return info;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+	if (!info)
+		return;
+
+	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+	if (!info->dev || !pci_ats_enabled(info->dev))
+		return;
+
+	pci_disable_ats(info->dev);
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+				  u64 addr, unsigned mask)
+{
+	u16 sid, qdep;
+	unsigned long flags;
+	struct device_domain_info *info;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link) {
+		if (!info->dev || !pci_ats_enabled(info->dev))
+			continue;
+
+		sid = info->bus << 8 | info->devfn;
+		qdep = pci_ats_queue_depth(info->dev);
+		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 				  u64 addr, unsigned int pages)
 {
@@ -965,6 +1037,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	else
 		iommu->flush.flush_iotlb(iommu, did, addr, mask,
 						DMA_TLB_PSI_FLUSH);
+	if (did)
+		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
 }
 
 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1305,6 +1379,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 	unsigned long ndomains;
 	int id;
 	int agaw;
+	struct device_domain_info *info = NULL;
 
 	pr_debug("Set context mapping for %02x:%02x.%d\n",
 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1372,15 +1447,21 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 
 	context_set_domain_id(context, id);
 
+	if (translation != CONTEXT_TT_PASS_THROUGH) {
+		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
+		translation = info ? CONTEXT_TT_DEV_IOTLB :
+				     CONTEXT_TT_MULTI_LEVEL;
+	}
 	/*
 	 * In pass through mode, AW must be programmed to indicate the largest
 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
 	 */
-	if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
-		context_set_address_width(context, iommu->agaw);
-		context_set_address_root(context, virt_to_phys(pgd));
-	} else
+	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
 		context_set_address_width(context, iommu->msagaw);
+	else {
+		context_set_address_root(context, virt_to_phys(pgd));
+		context_set_address_width(context, iommu->agaw);
+	}
 
 	context_set_translation_type(context, translation);
 	context_set_fault_enable(context);
@@ -1402,6 +1483,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 	} else {
 		iommu_flush_write_buffer(iommu);
 	}
+	iommu_enable_dev_iotlb(info);
 	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	spin_lock_irqsave(&domain->iommu_lock, flags);
@@ -1552,6 +1634,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
 			info->dev->dev.archdata.iommu = NULL;
 		spin_unlock_irqrestore(&device_domain_lock, flags);
 
+		iommu_disable_dev_iotlb(info);
 		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
 		iommu_detach_dev(iommu, info->bus, info->devfn);
 		free_devinfo_mem(info);
@@ -2259,10 +2342,16 @@ static void flush_unmaps(void)
 			continue;
 
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-					 DMA_TLB_GLOBAL_FLUSH, 0);
+					 DMA_TLB_GLOBAL_FLUSH);
 		for (j = 0; j < deferred_flush[i].next; j++) {
-			__free_iova(&deferred_flush[i].domain[j]->iovad,
-					deferred_flush[i].iova[j]);
+			unsigned long mask;
+			struct iova *iova = deferred_flush[i].iova[j];
+
+			mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT;
+			mask = ilog2(mask >> VTD_PAGE_SHIFT);
+			iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
+					iova->pfn_lo << PAGE_SHIFT, mask);
+			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
 		}
 		deferred_flush[i].next = 0;
 	}
@@ -2943,6 +3032,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
 				info->dev->dev.archdata.iommu = NULL;
 			spin_unlock_irqrestore(&device_domain_lock, flags);
 
+			iommu_disable_dev_iotlb(info);
 			iommu_detach_dev(iommu, info->bus, info->devfn);
 			iommu_detach_dependent_devices(iommu, pdev);
 			free_devinfo_mem(info);
@@ -2993,6 +3083,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
 
 		spin_unlock_irqrestore(&device_domain_lock, flags1);
 
+		iommu_disable_dev_iotlb(info);
 		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
 		iommu_detach_dev(iommu, info->bus, info->devfn);
 		iommu_detach_dependent_devices(iommu, info->dev);
@@ -3197,11 +3288,11 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
 		return -EFAULT;
 	}
 
-	ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
+	ret = vm_domain_add_dev_info(dmar_domain, pdev);
 	if (ret)
 		return ret;
 
-	ret = vm_domain_add_dev_info(dmar_domain, pdev);
+	ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
 	return ret;
 }
 
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index e0a03aff63d9..5619f8522738 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -14,6 +14,7 @@
 #define DMA_PTE_SNP (1 << 11)
 
 #define CONTEXT_TT_MULTI_LEVEL	0
+#define CONTEXT_TT_DEV_IOTLB	1
 #define CONTEXT_TT_PASS_THROUGH 2
 
 struct intel_iommu;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 40561b224a17..482dc91fd53a 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -124,6 +124,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define ecap_pass_through(e)	((e >> 6) & 0x1)
 #define ecap_eim_support(e)	((e >> 4) & 0x1)
 #define ecap_ir_support(e)	((e >> 3) & 0x1)
+#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
 #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
 #define ecap_sc_support(e)	((e >> 7) & 0x1) /* Snooping Control */
 
-- 
cgit v1.2.3-71-gd317


From b90cf6681f4f6263920616e7ca2fd09130e4143a Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 5 Apr 2009 08:23:44 -0700
Subject: [MTD] Remove option for add_mtd_partitions() to not register
 partitions.

This breaks the dilnetpc map driver, but it could be fixed not to use
that option. We want to simplify the partition handling, and this is a
step towards that.

Remove superfluous 'index' field from private struct mtd_part too, while
we're at it.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/mtdpart.c          | 18 ++++--------------
 include/linux/mtd/partitions.h |  1 -
 2 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 63d1cd2c17be..349fcbe5cc0f 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -27,9 +27,7 @@ struct mtd_part {
 	struct mtd_info mtd;
 	struct mtd_info *master;
 	uint64_t offset;
-	int index;
 	struct list_head list;
-	int registered;
 };
 
 /*
@@ -321,8 +319,7 @@ int del_mtd_partitions(struct mtd_info *master)
 	list_for_each_entry_safe(slave, next, &mtd_partitions, list)
 		if (slave->master == master) {
 			list_del(&slave->list);
-			if (slave->registered)
-				del_mtd_device(&slave->mtd);
+			del_mtd_device(&slave->mtd);
 			kfree(slave);
 		}
 
@@ -412,7 +409,6 @@ static struct mtd_part *add_one_partition(struct mtd_info *master,
 	slave->mtd.erase = part_erase;
 	slave->master = master;
 	slave->offset = part->offset;
-	slave->index = partno;
 
 	if (slave->offset == MTDPART_OFS_APPEND)
 		slave->offset = cur_offset;
@@ -500,15 +496,9 @@ static struct mtd_part *add_one_partition(struct mtd_info *master,
 	}
 
 out_register:
-	if (part->mtdp) {
-		/* store the object pointer (caller may or may not register it*/
-		*part->mtdp = &slave->mtd;
-		slave->registered = 0;
-	} else {
-		/* register our partition */
-		add_mtd_device(&slave->mtd);
-		slave->registered = 1;
-	}
+	/* register our partition */
+	add_mtd_device(&slave->mtd);
+
 	return slave;
 }
 
diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index 7535a74083b9..af6dcb992bc3 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -40,7 +40,6 @@ struct mtd_partition {
 	uint64_t offset;		/* offset within the master MTD space */
 	uint32_t mask_flags;		/* master MTD flags to mask out for this partition */
 	struct nand_ecclayout *ecclayout;	/* out of band layout for this partition (NAND only)*/
-	struct mtd_info **mtdp;		/* pointer to store the MTD object */
 };
 
 #define MTDPART_OFS_NXTBLK	(-2)
-- 
cgit v1.2.3-71-gd317


From fca4217c5bab31019b5247e977673c9fcc385f6b Mon Sep 17 00:00:00 2001
From: Greg Banks <gnb@sgi.com>
Date: Wed, 1 Apr 2009 07:28:13 +1100
Subject: knfsd: reply cache cleanups

Make REQHASH() an inline function.  Rename hash_list to cache_hash.
Fix an obsolete comment.

Signed-off-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfscache.c         | 29 +++++++++++++++++++----------
 include/linux/nfsd/cache.h |  3 +--
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5bfc2ac60d54..6f0aa4989c61 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,15 +29,24 @@
  */
 #define CACHESIZE		1024
 #define HASHSIZE		64
-#define REQHASH(xid)		(((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
 
-static struct hlist_head *	hash_list;
+static struct hlist_head *	cache_hash;
 static struct list_head 	lru_head;
 static int			cache_disabled = 1;
 
+/*
+ * Calculate the hash index from an XID.
+ */
+static inline u32 request_hash(u32 xid)
+{
+	u32 h = xid;
+	h ^= (xid >> 24);
+	return h & (HASHSIZE-1);
+}
+
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 
-/* 
+/*
  * locking for the reply cache:
  * A cache entry is "single use" if c_state == RC_INPROG
  * Otherwise, it when accessing _prev or _next, the lock must be held.
@@ -62,8 +71,8 @@ int nfsd_reply_cache_init(void)
 		i--;
 	}
 
-	hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
-	if (!hash_list)
+	cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+	if (!cache_hash)
 		goto out_nomem;
 
 	cache_disabled = 0;
@@ -88,8 +97,8 @@ void nfsd_reply_cache_shutdown(void)
 
 	cache_disabled = 1;
 
-	kfree (hash_list);
-	hash_list = NULL;
+	kfree (cache_hash);
+	cache_hash = NULL;
 }
 
 /*
@@ -108,7 +117,7 @@ static void
 hash_refile(struct svc_cacherep *rp)
 {
 	hlist_del_init(&rp->c_hash);
-	hlist_add_head(&rp->c_hash, hash_list + REQHASH(rp->c_xid));
+	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
 }
 
 /*
@@ -138,7 +147,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
 	spin_lock(&cache_lock);
 	rtn = RC_DOIT;
 
-	rh = &hash_list[REQHASH(xid)];
+	rh = &cache_hash[request_hash(xid)];
 	hlist_for_each_entry(rp, hn, rh, c_hash) {
 		if (rp->c_state != RC_UNUSED &&
 		    xid == rp->c_xid && proc == rp->c_proc &&
@@ -264,7 +273,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 
 	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
 	len >>= 2;
-	
+
 	/* Don't cache excessive amounts of data and XDR failures */
 	if (!statp || len > (256 >> 2)) {
 		rp->c_state = RC_UNUSED;
diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h
index 5bccaab81056..3a3f58934f5e 100644
--- a/include/linux/nfsd/cache.h
+++ b/include/linux/nfsd/cache.h
@@ -14,8 +14,7 @@
 #include <linux/uio.h>
 
 /*
- * Representation of a reply cache entry. The first two members *must*
- * be hash_next and hash_prev.
+ * Representation of a reply cache entry.
  */
 struct svc_cacherep {
 	struct hlist_node	c_hash;
-- 
cgit v1.2.3-71-gd317


From ee8f37688966ab1438d0cf42e0cb7c6595d9592c Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 5 May 2009 11:04:19 +0300
Subject: mtd: OneNAND: add support for OneNAND manufactured by Numonyx

In addition to adding the Numonyx manufacturer code, this patch
also ensures 'sync. write' is disabled when reading identification
data. The Numonyx chip objects to 'sync. write' being left enabled
there, whereas the Samsung chip seems to ignore it.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/onenand/onenand_base.c | 3 ++-
 include/linux/mtd/onenand.h        | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 30d6999e5f9f..2346857a275d 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -2576,6 +2576,7 @@ static void onenand_print_device_info(int device, int version)
 
 static const struct onenand_manufacturers onenand_manuf_ids[] = {
         {ONENAND_MFR_SAMSUNG, "Samsung"},
+	{ONENAND_MFR_NUMONYX, "Numonyx"},
 };
 
 /**
@@ -2621,7 +2622,7 @@ static int onenand_probe(struct mtd_info *mtd)
 	/* Save system configuration 1 */
 	syscfg = this->read_word(this->base + ONENAND_REG_SYS_CFG1);
 	/* Clear Sync. Burst Read mode to read BootRAM */
-	this->write_word((syscfg & ~ONENAND_SYS_CFG1_SYNC_READ), this->base + ONENAND_REG_SYS_CFG1);
+	this->write_word((syscfg & ~ONENAND_SYS_CFG1_SYNC_READ & ~ONENAND_SYS_CFG1_SYNC_WRITE), this->base + ONENAND_REG_SYS_CFG1);
 
 	/* Send the command for reading device ID from BootRAM */
 	this->write_word(ONENAND_CMD_READID, this->base + ONENAND_BOOTRAM);
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 9aa2a9149b58..0fa3ac4ad576 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -176,6 +176,7 @@ struct onenand_chip {
  * OneNAND Flash Manufacturer ID Codes
  */
 #define ONENAND_MFR_SAMSUNG	0xec
+#define ONENAND_MFR_NUMONYX	0x20
 
 /**
  * struct onenand_manufacturers - NAND Flash Manufacturer ID Structure
-- 
cgit v1.2.3-71-gd317


From d6fed9e9fc5eefae5be0ecf222bac7e7496e8e74 Mon Sep 17 00:00:00 2001
From: Alexander Clouter <alex@digriz.org.uk>
Date: Mon, 11 May 2009 19:28:01 +0100
Subject: mtd: extend plat_nand for (read|write)_buf

This patch adds (write|read)_buf callbacks to plat_nand.

The NAND on the TS-7800 provisioned by the FPGA allows readw() and
readl() to be used which gives a 2.5x speed up.  To be able to use this
from the plat_nand driver a hook for read_buf (and also write_buf whilst
we are in there) needs to be made available.  This patch adds the hook.

Signed-off-by: Alexander Clouter <alex@digriz.org.uk>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/plat_nand.c | 2 ++
 include/linux/mtd/nand.h     | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/plat_nand.c b/drivers/mtd/nand/plat_nand.c
index 28ffd4e8bb2f..47a2105b9671 100644
--- a/drivers/mtd/nand/plat_nand.c
+++ b/drivers/mtd/nand/plat_nand.c
@@ -61,6 +61,8 @@ static int __devinit plat_nand_probe(struct platform_device *pdev)
 	data->chip.cmd_ctrl = pdata->ctrl.cmd_ctrl;
 	data->chip.dev_ready = pdata->ctrl.dev_ready;
 	data->chip.select_chip = pdata->ctrl.select_chip;
+	data->chip.write_buf = pdata->ctrl.write_buf;
+	data->chip.read_buf = pdata->ctrl.read_buf;
 	data->chip.chip_delay = pdata->chip.chip_delay;
 	data->chip.options |= pdata->chip.options;
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 7efb9be34662..0e35375ea795 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -584,6 +584,8 @@ struct platform_nand_chip {
  * @select_chip:	platform specific chip select function
  * @cmd_ctrl:		platform specific function for controlling
  *			ALE/CLE/nCE. Also used to write command and address
+ * @write_buf:		platform specific function for write buffer
+ * @read_buf:		platform specific function for read buffer
  * @priv:		private data to transport driver specific settings
  *
  * All fields are optional and depend on the hardware driver requirements
@@ -594,6 +596,10 @@ struct platform_nand_ctrl {
 	void		(*select_chip)(struct mtd_info *mtd, int chip);
 	void		(*cmd_ctrl)(struct mtd_info *mtd, int dat,
 				    unsigned int ctrl);
+	void		(*write_buf)(struct mtd_info *mtd,
+				    const uint8_t *buf, int len);
+	void		(*read_buf)(struct mtd_info *mtd,
+				    uint8_t *buf, int len);
 	void		*priv;
 };
 
-- 
cgit v1.2.3-71-gd317


From bf95efd41b1a760128eb25402791b0a4941eb655 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Tue, 12 May 2009 13:46:58 -0700
Subject: mtd: plat_nand: add platform probe/remove callbacks

Add optional probe and remove callbacks to the plat_nand driver.

Some platforms may require additional setup, such as configuring the
memory controller, before the nand device can be accessed.  This patch
provides an optional callback to handle this setup as well as a callback
to tear down the setup.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Tested-by: Alexander Clouter <alex@digriz.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/plat_nand.c | 13 +++++++++++--
 include/linux/mtd/nand.h     |  7 +++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/plat_nand.c b/drivers/mtd/nand/plat_nand.c
index 47a2105b9671..22e0ce788419 100644
--- a/drivers/mtd/nand/plat_nand.c
+++ b/drivers/mtd/nand/plat_nand.c
@@ -72,6 +72,13 @@ static int __devinit plat_nand_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, data);
 
+	/* Handle any platform specific setup */
+	if (pdata->ctrl.probe) {
+		res = pdata->ctrl.probe(pdev);
+		if (res)
+			goto out;
+	}
+
 	/* Scan to find existance of the device */
 	if (nand_scan(&data->mtd, 1)) {
 		res = -ENXIO;
@@ -101,6 +108,8 @@ static int __devinit plat_nand_probe(struct platform_device *pdev)
 
 	nand_release(&data->mtd);
 out:
+	if (pdata->ctrl.remove)
+		pdata->ctrl.remove(pdev);
 	platform_set_drvdata(pdev, NULL);
 	iounmap(data->io_base);
 	kfree(data);
@@ -113,15 +122,15 @@ out:
 static int __devexit plat_nand_remove(struct platform_device *pdev)
 {
 	struct plat_nand_data *data = platform_get_drvdata(pdev);
-#ifdef CONFIG_MTD_PARTITIONS
 	struct platform_nand_data *pdata = pdev->dev.platform_data;
-#endif
 
 	nand_release(&data->mtd);
 #ifdef CONFIG_MTD_PARTITIONS
 	if (data->parts && data->parts != pdata->chip.partitions)
 		kfree(data->parts);
 #endif
+	if (pdata->ctrl.remove)
+		pdata->ctrl.remove(pdev);
 	iounmap(data->io_base);
 	kfree(data);
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 0e35375ea795..7f2d69356554 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -577,8 +577,13 @@ struct platform_nand_chip {
 	void			*priv;
 };
 
+/* Keep gcc happy */
+struct platform_device;
+
 /**
  * struct platform_nand_ctrl - controller level device structure
+ * @probe:		platform specific function to probe/setup hardware
+ * @remove:		platform specific function to remove/teardown hardware
  * @hwcontrol:		platform specific hardware control structure
  * @dev_ready:		platform specific function to read ready/busy pin
  * @select_chip:	platform specific chip select function
@@ -591,6 +596,8 @@ struct platform_nand_chip {
  * All fields are optional and depend on the hardware driver requirements
  */
 struct platform_nand_ctrl {
+	int		(*probe)(struct platform_device *pdev);
+	void		(*remove)(struct platform_device *pdev);
 	void		(*hwcontrol)(struct mtd_info *mtd, int cmd);
 	int		(*dev_ready)(struct mtd_info *mtd);
 	void		(*select_chip)(struct mtd_info *mtd, int chip);
-- 
cgit v1.2.3-71-gd317


From f36e20c01ad0104688f2eaebdf2213e749929c97 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Tue, 12 May 2009 13:46:59 -0700
Subject: mtd: plat_nand: allow platform to set partitions

Add optional callback to allow platform to initialize partitions.

Static partitions on a nand device could vary depending on the size of the
device.  This patch allows an optional platform callback to be used to
set up this partition information at runtime.

Scan order is:
	1) chip.part_probe_types
	2) chip.set_parts
	3) chip.partitions
	4) full mtd device (fallback for no partitions)

Some of the existing nand drivers could possibly be replaced by the
plat_nand driver by using this patch.  These include autcpu12.c and
ts7250.c drivers.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/plat_nand.c | 2 ++
 include/linux/mtd/nand.h     | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/plat_nand.c b/drivers/mtd/nand/plat_nand.c
index 22e0ce788419..4e16c6f5bdd5 100644
--- a/drivers/mtd/nand/plat_nand.c
+++ b/drivers/mtd/nand/plat_nand.c
@@ -95,6 +95,8 @@ static int __devinit plat_nand_probe(struct platform_device *pdev)
 			return 0;
 		}
 	}
+	if (pdata->chip.set_parts)
+		pdata->chip.set_parts(data->mtd.size, &pdata->chip);
 	if (pdata->chip.partitions) {
 		data->parts = pdata->chip.partitions;
 		res = add_mtd_partitions(&data->mtd, data->parts,
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 7f2d69356554..4030ebada49e 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -563,6 +563,7 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
  * @options:		Option flags, e.g. 16bit buswidth
  * @ecclayout:		ecc layout info structure
  * @part_probe_types:	NULL-terminated array of probe types
+ * @set_parts:		platform specific function to set partitions
  * @priv:		hardware controller specific settings
  */
 struct platform_nand_chip {
@@ -574,6 +575,8 @@ struct platform_nand_chip {
 	int			chip_delay;
 	unsigned int		options;
 	const char		**part_probe_types;
+	void			(*set_parts)(uint64_t size,
+					struct platform_nand_chip *chip);
 	void			*priv;
 };
 
-- 
cgit v1.2.3-71-gd317


From 5988af2319781bc8e0ce418affec4e09cfa77907 Mon Sep 17 00:00:00 2001
From: Rohit Hagargundgi <h.rohit@samsung.com>
Date: Tue, 12 May 2009 13:46:57 -0700
Subject: mtd: Flex-OneNAND support

Add support for Samsung Flex-OneNAND devices.

Flex-OneNAND combines SLC and MLC technologies into a single device.
SLC area provides increased reliability and speed, suitable for storing
code such as bootloader, kernel and root file system.  MLC area
provides high density and is suitable for storing user data.

SLC and MLC regions can be configured through kernel parameter.

[akpm@linux-foundation.org: export flexonenand_region and onenand_addr]
Signed-off-by: Rohit Hagargundgi <h.rohit@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Vishak G <vishak.g@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 Documentation/kernel-parameters.txt |  10 +
 drivers/mtd/onenand/onenand_base.c  | 857 ++++++++++++++++++++++++++++++++----
 drivers/mtd/onenand/onenand_bbt.c   |  14 +-
 drivers/mtd/onenand/onenand_sim.c   |  81 +++-
 include/linux/mtd/onenand.h         |  18 +
 include/linux/mtd/onenand_regs.h    |  20 +-
 6 files changed, 913 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbfbcc75..12df135f8af9 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1380,6 +1380,16 @@ and is between 256 and 4096 characters. It is defined in the file
 	mtdparts=	[MTD]
 			See drivers/mtd/cmdlinepart.c.
 
+	onenand.bdry=	[HW,MTD] Flex-OneNAND Boundary Configuration
+
+			Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock]
+
+			boundary - index of last SLC block on Flex-OneNAND.
+				   The remaining blocks are configured as MLC blocks.
+			lock	 - Configure if Flex-OneNAND boundary should be locked.
+				   Once locked, the boundary cannot be changed.
+				   1 indicates lock status, 0 indicates unlock status.
+
 	mtdset=		[ARM]
 			ARM/S3C2412 JIVE boot control
 
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 2346857a275d..8d4c9c253732 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -9,6 +9,10 @@
  *	auto-placement support, read-while load support, various fixes
  *	Copyright (C) Nokia Corporation, 2007
  *
+ *	Vishak G <vishak.g at samsung.com>, Rohit Hagargundgi <h.rohit at samsung.com>
+ *	Flex-OneNAND support
+ *	Copyright (C) Samsung Electronics, 2008
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -27,6 +31,30 @@
 
 #include <asm/io.h>
 
+/* Default Flex-OneNAND boundary and lock respectively */
+static int flex_bdry[MAX_DIES * 2] = { -1, 0, -1, 0 };
+
+/**
+ *  onenand_oob_128 - oob info for Flex-Onenand with 4KB page
+ *  For now, we expose only 64 out of 80 ecc bytes
+ */
+static struct nand_ecclayout onenand_oob_128 = {
+	.eccbytes	= 64,
+	.eccpos		= {
+		6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+		22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+		38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+		54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+		70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+		86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+		102, 103, 104, 105
+		},
+	.oobfree	= {
+		{2, 4}, {18, 4}, {34, 4}, {50, 4},
+		{66, 4}, {82, 4}, {98, 4}, {114, 4}
+	}
+};
+
 /**
  * onenand_oob_64 - oob info for large (2KB) page
  */
@@ -65,6 +93,14 @@ static const unsigned char ffchars[] = {
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* 48 */
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* 64 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* 80 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* 96 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* 112 */
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* 128 */
 };
 
 /**
@@ -170,6 +206,70 @@ static int onenand_buffer_address(int dataram1, int sectors, int count)
 	return ((bsa << ONENAND_BSA_SHIFT) | bsc);
 }
 
+/**
+ * flexonenand_block- For given address return block number
+ * @param this         - OneNAND device structure
+ * @param addr		- Address for which block number is needed
+ */
+static unsigned flexonenand_block(struct onenand_chip *this, loff_t addr)
+{
+	unsigned boundary, blk, die = 0;
+
+	if (ONENAND_IS_DDP(this) && addr >= this->diesize[0]) {
+		die = 1;
+		addr -= this->diesize[0];
+	}
+
+	boundary = this->boundary[die];
+
+	blk = addr >> (this->erase_shift - 1);
+	if (blk > boundary)
+		blk = (blk + boundary + 1) >> 1;
+
+	blk += die ? this->density_mask : 0;
+	return blk;
+}
+
+inline unsigned onenand_block(struct onenand_chip *this, loff_t addr)
+{
+	if (!FLEXONENAND(this))
+		return addr >> this->erase_shift;
+	return flexonenand_block(this, addr);
+}
+
+/**
+ * flexonenand_addr - Return address of the block
+ * @this:		OneNAND device structure
+ * @block:		Block number on Flex-OneNAND
+ *
+ * Return address of the block
+ */
+static loff_t flexonenand_addr(struct onenand_chip *this, int block)
+{
+	loff_t ofs = 0;
+	int die = 0, boundary;
+
+	if (ONENAND_IS_DDP(this) && block >= this->density_mask) {
+		block -= this->density_mask;
+		die = 1;
+		ofs = this->diesize[0];
+	}
+
+	boundary = this->boundary[die];
+	ofs += (loff_t)block << (this->erase_shift - 1);
+	if (block > (boundary + 1))
+		ofs += (loff_t)(block - boundary - 1) << (this->erase_shift - 1);
+	return ofs;
+}
+
+loff_t onenand_addr(struct onenand_chip *this, int block)
+{
+	if (!FLEXONENAND(this))
+		return (loff_t)block << this->erase_shift;
+	return flexonenand_addr(this, block);
+}
+EXPORT_SYMBOL(onenand_addr);
+
 /**
  * onenand_get_density - [DEFAULT] Get OneNAND density
  * @param dev_id	OneNAND device ID
@@ -182,6 +282,22 @@ static inline int onenand_get_density(int dev_id)
 	return (density & ONENAND_DEVICE_DENSITY_MASK);
 }
 
+/**
+ * flexonenand_region - [Flex-OneNAND] Return erase region of addr
+ * @param mtd		MTD device structure
+ * @param addr		address whose erase region needs to be identified
+ */
+int flexonenand_region(struct mtd_info *mtd, loff_t addr)
+{
+	int i;
+
+	for (i = 0; i < mtd->numeraseregions; i++)
+		if (addr < mtd->eraseregions[i].offset)
+			break;
+	return i - 1;
+}
+EXPORT_SYMBOL(flexonenand_region);
+
 /**
  * onenand_command - [DEFAULT] Send command to OneNAND device
  * @param mtd		MTD device structure
@@ -207,16 +323,28 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le
 		page = -1;
 		break;
 
+	case FLEXONENAND_CMD_PI_ACCESS:
+		/* addr contains die index */
+		block = addr * this->density_mask;
+		page = -1;
+		break;
+
 	case ONENAND_CMD_ERASE:
 	case ONENAND_CMD_BUFFERRAM:
 	case ONENAND_CMD_OTP_ACCESS:
-		block = (int) (addr >> this->erase_shift);
+		block = onenand_block(this, addr);
 		page = -1;
 		break;
 
+	case FLEXONENAND_CMD_READ_PI:
+		cmd = ONENAND_CMD_READ;
+		block = addr * this->density_mask;
+		page = 0;
+		break;
+
 	default:
-		block = (int) (addr >> this->erase_shift);
-		page = (int) (addr >> this->page_shift);
+		block = onenand_block(this, addr);
+		page = (int) (addr - onenand_addr(this, block)) >> this->page_shift;
 
 		if (ONENAND_IS_2PLANE(this)) {
 			/* Make the even block number */
@@ -236,7 +364,7 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le
 		value = onenand_bufferram_address(this, block);
 		this->write_word(value, this->base + ONENAND_REG_START_ADDRESS2);
 
-		if (ONENAND_IS_2PLANE(this))
+		if (ONENAND_IS_MLC(this) || ONENAND_IS_2PLANE(this))
 			/* It is always BufferRAM0 */
 			ONENAND_SET_BUFFERRAM0(this);
 		else
@@ -258,13 +386,18 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le
 
 	if (page != -1) {
 		/* Now we use page size operation */
-		int sectors = 4, count = 4;
+		int sectors = 0, count = 0;
 		int dataram;
 
 		switch (cmd) {
+		case FLEXONENAND_CMD_RECOVER_LSB:
 		case ONENAND_CMD_READ:
 		case ONENAND_CMD_READOOB:
-			dataram = ONENAND_SET_NEXT_BUFFERRAM(this);
+			if (ONENAND_IS_MLC(this))
+				/* It is always BufferRAM0 */
+				dataram = ONENAND_SET_BUFFERRAM0(this);
+			else
+				dataram = ONENAND_SET_NEXT_BUFFERRAM(this);
 			break;
 
 		default:
@@ -292,6 +425,30 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le
 	return 0;
 }
 
+/**
+ * onenand_read_ecc - return ecc status (Flex-OneNAND: as ONENAND_ECC_*_ALL)
+ * @param this		onenand chip structure
+ */
+static inline int onenand_read_ecc(struct onenand_chip *this)
+{
+	int ecc, i, result = 0;
+
+	if (!FLEXONENAND(this))
+		return this->read_word(this->base + ONENAND_REG_ECC_STATUS);
+
+	for (i = 0; i < 4; i++) {
+		/* Four 16-bit status registers: step two bytes per register */
+		ecc = this->read_word(this->base + ONENAND_REG_ECC_STATUS + i * 2);
+		if (likely(!ecc))
+			continue;
+		if (ecc & FLEXONENAND_UNCORRECTABLE_ERROR)
+			return ONENAND_ECC_2BIT_ALL;
+		result = ONENAND_ECC_1BIT_ALL;
+	}
+
+	return result;
+}
+
 /**
  * onenand_wait - [DEFAULT] wait until the command is done
  * @param mtd		MTD device structure
@@ -331,14 +488,14 @@ static int onenand_wait(struct mtd_info *mtd, int state)
 	 * power off recovery (POR) test, it should read ECC status first
 	 */
 	if (interrupt & ONENAND_INT_READ) {
-		int ecc = this->read_word(this->base + ONENAND_REG_ECC_STATUS);
+		int ecc = onenand_read_ecc(this);
 		if (ecc) {
 			if (ecc & ONENAND_ECC_2BIT_ALL) {
 				printk(KERN_ERR "onenand_wait: ECC error = 0x%04x\n", ecc);
 				mtd->ecc_stats.failed++;
 				return -EBADMSG;
 			} else if (ecc & ONENAND_ECC_1BIT_ALL) {
-				printk(KERN_INFO "onenand_wait: correctable ECC error = 0x%04x\n", ecc);
+				printk(KERN_DEBUG "onenand_wait: correctable ECC error = 0x%04x\n", ecc);
 				mtd->ecc_stats.corrected++;
 			}
 		}
@@ -656,7 +813,7 @@ static int onenand_check_bufferram(struct mtd_info *mtd, loff_t addr)
 
 	if (found && ONENAND_IS_DDP(this)) {
 		/* Select DataRAM for DDP */
-		int block = (int) (addr >> this->erase_shift);
+		int block = onenand_block(this, addr);
 		int value = onenand_bufferram_address(this, block);
 		this->write_word(value, this->base + ONENAND_REG_START_ADDRESS2);
 	}
@@ -815,6 +972,149 @@ static int onenand_transfer_auto_oob(struct mtd_info *mtd, uint8_t *buf, int col
 	return 0;
 }
 
+/**
+ * onenand_recover_lsb - [Flex-OneNAND] Recover LSB page data
+ * @param mtd		MTD device structure
+ * @param addr		address to recover
+ * @param status	return value from onenand_wait / onenand_bbt_wait
+ *
+ * MLC NAND Flash cell has paired pages - LSB page (lower page address) and
+ * MSB page (higher page address). If power off occurs during MSB page program,
+ * the paired LSB page data can become corrupt. LSB page recovery read is a way
+ * to read the LSB page even though its data are corrupted; issue it when an
+ * LSB page read after power up gives an uncorrectable error.
+ * Returns status as-is if no recovery is attempted, else the re-read result.
+ */
+static int onenand_recover_lsb(struct mtd_info *mtd, loff_t addr, int status)
+{
+	struct onenand_chip *this = mtd->priv;
+	int i;
+
+	/* Recovery is only for Flex-OneNAND */
+	if (!FLEXONENAND(this))
+		return status;
+
+	/* check if we failed due to uncorrectable error */
+	if (status != -EBADMSG && status != ONENAND_BBT_READ_ECC_ERROR)
+		return status;
+
+	/* regions with less than the full (1 << erase_shift) size are skipped */
+	i = flexonenand_region(mtd, addr);
+	if (mtd->eraseregions[i].erasesize < (1 << this->erase_shift))
+		return status;
+
+	/* We are attempting to reread, so decrement stats.failed which was
+	 * incremented by onenand_wait due to read failure.
+	 * NOTE(review): confirm the onenand_bbt_wait path incremented it too. */
+	printk(KERN_INFO "onenand_recover_lsb: Attempting to recover from uncorrectable read\n");
+	mtd->ecc_stats.failed--;
+
+	/* Issue the LSB page recovery command */
+	this->command(mtd, FLEXONENAND_CMD_RECOVER_LSB, addr, this->writesize);
+	return this->wait(mtd, FL_READING);
+}
+
+/**
+ * onenand_mlc_read_ops_nolock - MLC OneNAND read main and/or out-of-band
+ * @param mtd		MTD device structure
+ * @param from		offset to read from
+ * @param ops:		oob operation description structure
+ *
+ * MLC OneNAND / Flex-OneNAND has 4KB page size and 4KB dataram.
+ * So, read-while-load is not present.
+ */
+static int onenand_mlc_read_ops_nolock(struct mtd_info *mtd, loff_t from,
+				struct mtd_oob_ops *ops)
+{
+	struct onenand_chip *this = mtd->priv;
+	struct mtd_ecc_stats stats;
+	size_t len = ops->len;
+	size_t ooblen = ops->ooblen;
+	u_char *buf = ops->datbuf;
+	u_char *oobbuf = ops->oobbuf;
+	int read = 0, column, thislen;
+	int oobread = 0, oobcolumn, thisooblen, oobsize;
+	int ret = 0, writesize = this->writesize;
+
+	DEBUG(MTD_DEBUG_LEVEL3, "onenand_mlc_read_ops_nolock: from = 0x%08x, len = %i\n", (unsigned int) from, (int) len);
+
+	if (ops->mode == MTD_OOB_AUTO)
+		oobsize = this->ecclayout->oobavail;
+	else
+		oobsize = mtd->oobsize;
+
+	oobcolumn = from & (mtd->oobsize - 1);
+
+	/* Do not allow reads past end of device */
+	if (from + len > mtd->size) {
+		printk(KERN_ERR "onenand_mlc_read_ops_nolock: Attempt read beyond end of device\n");
+		ops->retlen = 0;
+		ops->oobretlen = 0;
+		return -EINVAL;
+	}
+
+	stats = mtd->ecc_stats;
+
+	while (read < len) {
+		cond_resched();
+
+		thislen = min_t(int, writesize, len - read);
+
+		column = from & (writesize - 1);
+		if (column + thislen > writesize)
+			thislen = writesize - column;
+
+		if (!onenand_check_bufferram(mtd, from)) {
+			this->command(mtd, ONENAND_CMD_READ, from, writesize);
+
+			ret = this->wait(mtd, FL_READING);
+			if (unlikely(ret))
+				ret = onenand_recover_lsb(mtd, from, ret);
+			onenand_update_bufferram(mtd, from, !ret);
+			if (ret == -EBADMSG)
+				ret = 0;
+			if (ret)
+				break;	/* hard error: do not copy or overwrite ret */
+		}
+
+		this->read_bufferram(mtd, ONENAND_DATARAM, buf, column, thislen);
+		if (oobbuf) {
+			thisooblen = min_t(int, oobsize - oobcolumn, ooblen - oobread);
+
+			if (ops->mode == MTD_OOB_AUTO)
+				onenand_transfer_auto_oob(mtd, oobbuf, oobcolumn, thisooblen);
+			else
+				this->read_bufferram(mtd, ONENAND_SPARERAM, oobbuf, oobcolumn, thisooblen);
+			oobread += thisooblen;
+			oobbuf += thisooblen;
+			oobcolumn = 0;
+		}
+
+		read += thislen;
+		if (read == len)
+			break;
+
+		from += thislen;
+		buf += thislen;
+	}
+
+	/*
+	 * Return success, if no ECC failures, else -EBADMSG
+	 * fs driver will take care of that, because
+	 * retlen == desired len and result == -EBADMSG
+	 */
+	ops->retlen = read;
+	ops->oobretlen = oobread;
+
+	if (ret)
+		return ret;
+
+	if (mtd->ecc_stats.failed - stats.failed)
+		return -EBADMSG;
+
+	return mtd->ecc_stats.corrected - stats.corrected ? -EUCLEAN : 0;
+}
+
 /**
  * onenand_read_ops_nolock - [OneNAND Interface] OneNAND read main and/or out-of-band
  * @param mtd		MTD device structure
@@ -962,7 +1262,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from,
 	size_t len = ops->ooblen;
 	mtd_oob_mode_t mode = ops->mode;
 	u_char *buf = ops->oobbuf;
-	int ret = 0;
+	int ret = 0, readcmd;
 
 	from += ops->ooboffs;
 
@@ -993,17 +1293,22 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from,
 
 	stats = mtd->ecc_stats;
 
+	readcmd = ONENAND_IS_MLC(this) ? ONENAND_CMD_READ : ONENAND_CMD_READOOB;
+
 	while (read < len) {
 		cond_resched();
 
 		thislen = oobsize - column;
 		thislen = min_t(int, thislen, len);
 
-		this->command(mtd, ONENAND_CMD_READOOB, from, mtd->oobsize);
+		this->command(mtd, readcmd, from, mtd->oobsize);
 
 		onenand_update_bufferram(mtd, from, 0);
 
 		ret = this->wait(mtd, FL_READING);
+		if (unlikely(ret))
+			ret = onenand_recover_lsb(mtd, from, ret);
+
 		if (ret && ret != -EBADMSG) {
 			printk(KERN_ERR "onenand_read_oob_nolock: read failed = 0x%x\n", ret);
 			break;
@@ -1053,6 +1358,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from,
 static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len,
 	size_t *retlen, u_char *buf)
 {
+	struct onenand_chip *this = mtd->priv;
 	struct mtd_oob_ops ops = {
 		.len	= len,
 		.ooblen	= 0,
@@ -1062,7 +1368,9 @@ static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len,
 	int ret;
 
 	onenand_get_device(mtd, FL_READING);
-	ret = onenand_read_ops_nolock(mtd, from, &ops);
+	ret = ONENAND_IS_MLC(this) ?
+		onenand_mlc_read_ops_nolock(mtd, from, &ops) :
+		onenand_read_ops_nolock(mtd, from, &ops);
 	onenand_release_device(mtd);
 
 	*retlen = ops.retlen;
@@ -1080,6 +1388,7 @@ static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len,
 static int onenand_read_oob(struct mtd_info *mtd, loff_t from,
 			    struct mtd_oob_ops *ops)
 {
+	struct onenand_chip *this = mtd->priv;
 	int ret;
 
 	switch (ops->mode) {
@@ -1094,7 +1403,9 @@ static int onenand_read_oob(struct mtd_info *mtd, loff_t from,
 
 	onenand_get_device(mtd, FL_READING);
 	if (ops->datbuf)
-		ret = onenand_read_ops_nolock(mtd, from, ops);
+		ret = ONENAND_IS_MLC(this) ?
+			onenand_mlc_read_ops_nolock(mtd, from, ops) :
+			onenand_read_ops_nolock(mtd, from, ops);
 	else
 		ret = onenand_read_oob_nolock(mtd, from, ops);
 	onenand_release_device(mtd);
@@ -1128,11 +1439,11 @@ static int onenand_bbt_wait(struct mtd_info *mtd, int state)
 	ctrl = this->read_word(this->base + ONENAND_REG_CTRL_STATUS);
 
 	if (interrupt & ONENAND_INT_READ) {
-		int ecc = this->read_word(this->base + ONENAND_REG_ECC_STATUS);
+		int ecc = onenand_read_ecc(this);
 		if (ecc & ONENAND_ECC_2BIT_ALL) {
 			printk(KERN_INFO "onenand_bbt_wait: ecc error = 0x%04x"
 				", controller error 0x%04x\n", ecc, ctrl);
-			return ONENAND_BBT_READ_ERROR;
+			return ONENAND_BBT_READ_ECC_ERROR;
 		}
 	} else {
 		printk(KERN_ERR "onenand_bbt_wait: read timeout!"
@@ -1163,7 +1474,7 @@ int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from,
 {
 	struct onenand_chip *this = mtd->priv;
 	int read = 0, thislen, column;
-	int ret = 0;
+	int ret = 0, readcmd;
 	size_t len = ops->ooblen;
 	u_char *buf = ops->oobbuf;
 
@@ -1183,17 +1494,22 @@ int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from,
 
 	column = from & (mtd->oobsize - 1);
 
+	readcmd = ONENAND_IS_MLC(this) ? ONENAND_CMD_READ : ONENAND_CMD_READOOB;
+
 	while (read < len) {
 		cond_resched();
 
 		thislen = mtd->oobsize - column;
 		thislen = min_t(int, thislen, len);
 
-		this->command(mtd, ONENAND_CMD_READOOB, from, mtd->oobsize);
+		this->command(mtd, readcmd, from, mtd->oobsize);
 
 		onenand_update_bufferram(mtd, from, 0);
 
 		ret = onenand_bbt_wait(mtd, FL_READING);
+		if (unlikely(ret))
+			ret = onenand_recover_lsb(mtd, from, ret);
+
 		if (ret)
 			break;
 
@@ -1230,9 +1546,11 @@ static int onenand_verify_oob(struct mtd_info *mtd, const u_char *buf, loff_t to
 {
 	struct onenand_chip *this = mtd->priv;
 	u_char *oob_buf = this->oob_buf;
-	int status, i;
+	int status, i, readcmd;
+
+	readcmd = ONENAND_IS_MLC(this) ? ONENAND_CMD_READ : ONENAND_CMD_READOOB;
 
-	this->command(mtd, ONENAND_CMD_READOOB, to, mtd->oobsize);
+	this->command(mtd, readcmd, to, mtd->oobsize);
 	onenand_update_bufferram(mtd, to, 0);
 	status = this->wait(mtd, FL_READING);
 	if (status)
@@ -1633,7 +1951,7 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to,
 {
 	struct onenand_chip *this = mtd->priv;
 	int column, ret = 0, oobsize;
-	int written = 0;
+	int written = 0, oobcmd;
 	u_char *oobbuf;
 	size_t len = ops->ooblen;
 	const u_char *buf = ops->oobbuf;
@@ -1675,6 +1993,8 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to,
 
 	oobbuf = this->oob_buf;
 
+	oobcmd = ONENAND_IS_MLC(this) ? ONENAND_CMD_PROG : ONENAND_CMD_PROGOOB;
+
 	/* Loop until all data write */
 	while (written < len) {
 		int thislen = min_t(int, oobsize, len - written);
@@ -1692,7 +2012,14 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to,
 			memcpy(oobbuf + column, buf, thislen);
 		this->write_bufferram(mtd, ONENAND_SPARERAM, oobbuf, 0, mtd->oobsize);
 
-		this->command(mtd, ONENAND_CMD_PROGOOB, to, mtd->oobsize);
+		if (ONENAND_IS_MLC(this)) {
+			/* Set main area of DataRAM to 0xff*/
+			memset(this->page_buf, 0xff, mtd->writesize);
+			this->write_bufferram(mtd, ONENAND_DATARAM,
+					 this->page_buf, 0, mtd->writesize);
+		}
+
+		this->command(mtd, oobcmd, to, mtd->oobsize);
 
 		onenand_update_bufferram(mtd, to, 0);
 		if (ONENAND_IS_2PLANE(this)) {
@@ -1815,29 +2142,48 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr)
 {
 	struct onenand_chip *this = mtd->priv;
 	unsigned int block_size;
-	loff_t addr;
-	int len;
-	int ret = 0;
+	loff_t addr = instr->addr;
+	loff_t len = instr->len;
+	int ret = 0, i;
+	struct mtd_erase_region_info *region = NULL;
+	loff_t region_end = 0;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%012llx, len = %llu\n", (unsigned long long) instr->addr, (unsigned long long) instr->len);
 
-	block_size = (1 << this->erase_shift);
-
-	/* Start address must align on block boundary */
-	if (unlikely(instr->addr & (block_size - 1))) {
-		printk(KERN_ERR "onenand_erase: Unaligned address\n");
+	/* Do not allow erase past end of device */
+	if (unlikely((len + addr) > mtd->size)) {
+		printk(KERN_ERR "onenand_erase: Erase past end of device\n");
 		return -EINVAL;
 	}
 
-	/* Length must align on block boundary */
-	if (unlikely(instr->len & (block_size - 1))) {
-		printk(KERN_ERR "onenand_erase: Length not block aligned\n");
-		return -EINVAL;
+	if (FLEXONENAND(this)) {
+		/* Find the eraseregion of this address */
+		i = flexonenand_region(mtd, addr);
+		region = &mtd->eraseregions[i];
+
+		block_size = region->erasesize;
+		region_end = region->offset + region->erasesize * region->numblocks;
+
+		/* Start address within region must align on block boundary.
+		 * Erase region's start offset is always block start address.
+		 */
+		if (unlikely((addr - region->offset) & (block_size - 1))) {
+			printk(KERN_ERR "onenand_erase: Unaligned address\n");
+			return -EINVAL;
+		}
+	} else {
+		block_size = 1 << this->erase_shift;
+
+		/* Start address must align on block boundary */
+		if (unlikely(addr & (block_size - 1))) {
+			printk(KERN_ERR "onenand_erase: Unaligned address\n");
+			return -EINVAL;
+		}
 	}
 
-	/* Do not allow erase past end of device */
-	if (unlikely((instr->len + instr->addr) > mtd->size)) {
-		printk(KERN_ERR "onenand_erase: Erase past end of device\n");
+	/* Length must align on block boundary */
+	if (unlikely(len & (block_size - 1))) {
+		printk(KERN_ERR "onenand_erase: Length not block aligned\n");
 		return -EINVAL;
 	}
 
@@ -1847,9 +2193,6 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr)
 	onenand_get_device(mtd, FL_ERASING);
 
 	/* Loop throught the pages */
-	len = instr->len;
-	addr = instr->addr;
-
 	instr->state = MTD_ERASING;
 
 	while (len) {
@@ -1869,7 +2212,8 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr)
 		ret = this->wait(mtd, FL_ERASING);
 		/* Check, if it is write protected */
 		if (ret) {
-			printk(KERN_ERR "onenand_erase: Failed erase, block %d\n", (unsigned) (addr >> this->erase_shift));
+			printk(KERN_ERR "onenand_erase: Failed erase, block %d\n",
+						 onenand_block(this, addr));
 			instr->state = MTD_ERASE_FAILED;
 			instr->fail_addr = addr;
 			goto erase_exit;
@@ -1877,6 +2221,22 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr)
 
 		len -= block_size;
 		addr += block_size;
+
+		if (addr == region_end) {
+			if (!len)
+				break;
+			region++;
+
+			block_size = region->erasesize;
+			region_end = region->offset + region->erasesize * region->numblocks;
+
+			if (len & (block_size - 1)) {
+				/* FIXME: This should be handled at MTD partitioning level. */
+				printk(KERN_ERR "onenand_erase: Unaligned address\n");
+				goto erase_exit;
+			}
+		}
+
 	}
 
 	instr->state = MTD_ERASE_DONE;
@@ -1955,13 +2315,17 @@ static int onenand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 	int block;
 
 	/* Get block number */
-	block = ((int) ofs) >> bbm->bbt_erase_shift;
+	block = onenand_block(this, ofs);
         if (bbm->bbt)
                 bbm->bbt[block >> 2] |= 0x01 << ((block & 0x03) << 1);
 
         /* We write two bytes, so we dont have to mess with 16 bit access */
         ofs += mtd->oobsize + (bbm->badblockpos & ~0x01);
-        return onenand_write_oob_nolock(mtd, ofs, &ops);
+	/* FIXME : What to do when marking SLC block in partition
+	 * 	   with MLC erasesize? For now, it is not advisable to
+	 *	   create partitions containing both SLC and MLC regions.
+	 */
+	return onenand_write_oob_nolock(mtd, ofs, &ops);
 }
 
 /**
@@ -2005,8 +2369,8 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int
 	int start, end, block, value, status;
 	int wp_status_mask;
 
-	start = ofs >> this->erase_shift;
-	end = len >> this->erase_shift;
+	start = onenand_block(this, ofs);
+	end = onenand_block(this, ofs + len) - 1;
 
 	if (cmd == ONENAND_CMD_LOCK)
 		wp_status_mask = ONENAND_WP_LS;
@@ -2018,7 +2382,7 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int
 		/* Set start block address */
 		this->write_word(start, this->base + ONENAND_REG_START_BLOCK_ADDRESS);
 		/* Set end block address */
-		this->write_word(start + end - 1, this->base + ONENAND_REG_END_BLOCK_ADDRESS);
+		this->write_word(end, this->base +  ONENAND_REG_END_BLOCK_ADDRESS);
 		/* Write lock command */
 		this->command(mtd, cmd, 0, 0);
 
@@ -2039,7 +2403,7 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int
 	}
 
 	/* Block lock scheme */
-	for (block = start; block < start + end; block++) {
+	for (block = start; block < end + 1; block++) {
 		/* Set block address */
 		value = onenand_block_address(this, block);
 		this->write_word(value, this->base + ONENAND_REG_START_ADDRESS1);
@@ -2147,7 +2511,7 @@ static void onenand_unlock_all(struct mtd_info *mtd)
 {
 	struct onenand_chip *this = mtd->priv;
 	loff_t ofs = 0;
-	size_t len = this->chipsize;
+	loff_t len = mtd->size;
 
 	if (this->options & ONENAND_HAS_UNLOCK_ALL) {
 		/* Set start block address */
@@ -2168,7 +2532,7 @@ static void onenand_unlock_all(struct mtd_info *mtd)
 			return;
 
 		/* Workaround for all block unlock in DDP */
-		if (ONENAND_IS_DDP(this)) {
+		if (ONENAND_IS_DDP(this) && !FLEXONENAND(this)) {
 			/* All blocks on another chip */
 			ofs = this->chipsize >> 1;
 			len = this->chipsize >> 1;
@@ -2210,7 +2574,9 @@ static int do_otp_read(struct mtd_info *mtd, loff_t from, size_t len,
 	this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0);
 	this->wait(mtd, FL_OTPING);
 
-	ret = onenand_read_ops_nolock(mtd, from, &ops);
+	ret = ONENAND_IS_MLC(this) ?
+		onenand_mlc_read_ops_nolock(mtd, from, &ops) :
+		onenand_read_ops_nolock(mtd, from, &ops);
 
 	/* Exit OTP access mode */
 	this->command(mtd, ONENAND_CMD_RESET, 0, 0);
@@ -2277,21 +2643,32 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len,
 		size_t *retlen, u_char *buf)
 {
 	struct onenand_chip *this = mtd->priv;
-	struct mtd_oob_ops ops = {
-		.mode = MTD_OOB_PLACE,
-		.ooblen = len,
-		.oobbuf = buf,
-		.ooboffs = 0,
-	};
+	struct mtd_oob_ops ops;
 	int ret;
 
 	/* Enter OTP access mode */
 	this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0);
 	this->wait(mtd, FL_OTPING);
 
-	ret = onenand_write_oob_nolock(mtd, from, &ops);
-
-	*retlen = ops.oobretlen;
+	if (FLEXONENAND(this)) {
+		/*
+		 * For Flex-OneNAND, we write lock mark to 1st word of sector 4 of
+		 * main area of page 49.
+		 */
+		ops.len = mtd->writesize;
+		ops.ooblen = 0;
+		ops.datbuf = buf;
+		ops.oobbuf = NULL;
+		ret = onenand_write_ops_nolock(mtd, mtd->writesize * 49, &ops);
+		*retlen = ops.retlen;
+	} else {
+		ops.mode = MTD_OOB_PLACE;
+		ops.ooblen = len;
+		ops.oobbuf = buf;
+		ops.ooboffs = 0;
+		ret = onenand_write_oob_nolock(mtd, from, &ops);
+		*retlen = ops.oobretlen;
+	}
 
 	/* Exit OTP access mode */
 	this->command(mtd, ONENAND_CMD_RESET, 0, 0);
@@ -2475,27 +2852,34 @@ static int onenand_lock_user_prot_reg(struct mtd_info *mtd, loff_t from,
 			size_t len)
 {
 	struct onenand_chip *this = mtd->priv;
-	u_char *oob_buf = this->oob_buf;
+	u_char *buf = FLEXONENAND(this) ? this->page_buf : this->oob_buf;
 	size_t retlen;
 	int ret;
 
-	memset(oob_buf, 0xff, mtd->oobsize);
+	memset(buf, 0xff, FLEXONENAND(this) ? this->writesize
+						 : mtd->oobsize);
 	/*
 	 * Note: OTP lock operation
 	 *       OTP block : 0xXXFC
 	 *       1st block : 0xXXF3 (If chip support)
 	 *       Both      : 0xXXF0 (If chip support)
 	 */
-	oob_buf[ONENAND_OTP_LOCK_OFFSET] = 0xFC;
+	if (FLEXONENAND(this))
+		buf[FLEXONENAND_OTP_LOCK_OFFSET] = 0xFC;
+	else
+		buf[ONENAND_OTP_LOCK_OFFSET] = 0xFC;
 
 	/*
 	 * Write lock mark to 8th word of sector0 of page0 of the spare0.
 	 * We write 16 bytes spare area instead of 2 bytes.
+	 * For Flex-OneNAND, we write lock mark to 1st word of sector 4 of
+	 * main area of page 49.
 	 */
+
 	from = 0;
-	len = 16;
+	len = FLEXONENAND(this) ? mtd->writesize : 16;
 
-	ret = onenand_otp_walk(mtd, from, len, &retlen, oob_buf, do_otp_lock, MTD_OTP_USER);
+	ret = onenand_otp_walk(mtd, from, len, &retlen, buf, do_otp_lock, MTD_OTP_USER);
 
 	return ret ? : retlen;
 }
@@ -2542,6 +2926,14 @@ static void onenand_check_features(struct mtd_info *mtd)
 		break;
 	}
 
+	if (ONENAND_IS_MLC(this))
+		this->options &= ~ONENAND_HAS_2PLANE;
+
+	if (FLEXONENAND(this)) {
+		this->options &= ~ONENAND_HAS_CONT_LOCK;
+		this->options |= ONENAND_HAS_UNLOCK_ALL;
+	}
+
 	if (this->options & ONENAND_HAS_CONT_LOCK)
 		printk(KERN_DEBUG "Lock scheme is Continuous Lock\n");
 	if (this->options & ONENAND_HAS_UNLOCK_ALL)
@@ -2559,14 +2951,16 @@ static void onenand_check_features(struct mtd_info *mtd)
  */
 static void onenand_print_device_info(int device, int version)
 {
-        int vcc, demuxed, ddp, density;
+	int vcc, demuxed, ddp, density, flexonenand;
 
         vcc = device & ONENAND_DEVICE_VCC_MASK;
         demuxed = device & ONENAND_DEVICE_IS_DEMUX;
         ddp = device & ONENAND_DEVICE_IS_DDP;
         density = onenand_get_density(device);
-        printk(KERN_INFO "%sOneNAND%s %dMB %sV 16-bit (0x%02x)\n",
-                demuxed ? "" : "Muxed ",
+	flexonenand = device & DEVICE_IS_FLEXONENAND;
+	printk(KERN_INFO "%s%sOneNAND%s %dMB %sV 16-bit (0x%02x)\n",
+		demuxed ? "" : "Muxed ",
+		flexonenand ? "Flex-" : "",
                 ddp ? "(DDP)" : "",
                 (16 << density),
                 vcc ? "2.65/3.3" : "1.8",
@@ -2605,6 +2999,280 @@ static int onenand_check_maf(int manuf)
 	return (i == size);
 }
 
+/**
+ * flexonenand_get_boundary	- Reads the SLC boundary of every die
+ * @param mtd			- MTD device structure
+ */
+static int flexonenand_get_boundary(struct mtd_info *mtd)
+{
+	struct onenand_chip *this = mtd->priv;
+	unsigned die, bdry;
+	int ret, syscfg, locked;
+
+	/* Disable ECC */
+	syscfg = this->read_word(this->base + ONENAND_REG_SYS_CFG1);
+	this->write_word((syscfg | 0x0100), this->base + ONENAND_REG_SYS_CFG1);
+
+	for (die = 0; die < this->dies; die++) {
+		this->command(mtd, FLEXONENAND_CMD_PI_ACCESS, die, 0);
+		this->wait(mtd, FL_SYNCING);
+
+		this->command(mtd, FLEXONENAND_CMD_READ_PI, die, 0);
+		ret = this->wait(mtd, FL_READING);	/* NOTE(review): ret is never checked */
+
+		bdry = this->read_word(this->base + ONENAND_DATARAM);
+		if ((bdry >> FLEXONENAND_PI_UNLOCK_SHIFT) == 3)
+			locked = 0;	/* unlock field == 3 */
+		else
+			locked = 1;
+		this->boundary[die] = bdry & FLEXONENAND_PI_MASK;
+
+		this->command(mtd, ONENAND_CMD_RESET, 0, 0);
+		ret = this->wait(mtd, FL_RESETING);
+
+		printk(KERN_INFO "Die %d boundary: %d%s\n", die,
+		       this->boundary[die], locked ? "(Locked)" : "(Unlocked)");
+	}
+
+	/* Write back the saved SYS_CFG1 value (ECC setting as it was on entry) */
+	this->write_word(syscfg, this->base + ONENAND_REG_SYS_CFG1);
+	return 0;
+}
+
+/**
+ * flexonenand_get_size - Fill up fields in onenand_chip and mtd_info
+ * 			  boundary[], diesize[], mtd->size, mtd->erasesize
+ * @param mtd		- MTD device structure
+ */
+static void flexonenand_get_size(struct mtd_info *mtd)
+{
+	struct onenand_chip *this = mtd->priv;
+	int die, i, eraseshift, density;
+	int blksperdie, maxbdry;
+	loff_t ofs;
+
+	density = onenand_get_density(this->device_id);
+	blksperdie = ((loff_t)(16 << density) << 20) >> (this->erase_shift);
+	blksperdie >>= ONENAND_IS_DDP(this) ? 1 : 0;
+	maxbdry = blksperdie - 1;
+	eraseshift = this->erase_shift - 1;
+
+	mtd->numeraseregions = this->dies << 1;
+
+	/* This fills up the device boundary */
+	flexonenand_get_boundary(mtd);
+	die = ofs = 0;
+	i = -1;
+	for (; die < this->dies; die++) {
+		if (!die || this->boundary[die-1] != maxbdry) {
+			i++;
+			mtd->eraseregions[i].offset = ofs;
+			mtd->eraseregions[i].erasesize = 1 << eraseshift;
+			mtd->eraseregions[i].numblocks =
+							this->boundary[die] + 1;
+			ofs += mtd->eraseregions[i].numblocks << eraseshift;
+			eraseshift++;	/* next region is double size */
+		} else {	/* previous die had no second region: extend region i */
+			mtd->numeraseregions -= 1;
+			mtd->eraseregions[i].numblocks +=
+							this->boundary[die] + 1;
+			ofs += (this->boundary[die] + 1) << (eraseshift - 1);
+		}
+		if (this->boundary[die] != maxbdry) {
+			i++;
+			mtd->eraseregions[i].offset = ofs;
+			mtd->eraseregions[i].erasesize = 1 << eraseshift;
+			mtd->eraseregions[i].numblocks = maxbdry ^
+							 this->boundary[die];
+			ofs += mtd->eraseregions[i].numblocks << eraseshift;
+			eraseshift--;	/* next die starts at half size again */
+		} else	/* boundary == maxbdry: this die has no second region */
+			mtd->numeraseregions -= 1;
+	}
+
+	/* Expose MLC erase size except when all blocks are SLC */
+	mtd->erasesize = 1 << this->erase_shift;
+	if (mtd->numeraseregions == 1)
+		mtd->erasesize >>= 1;
+
+	printk(KERN_INFO "Device has %d eraseregions\n", mtd->numeraseregions);
+	for (i = 0; i < mtd->numeraseregions; i++)
+		printk(KERN_INFO "[offset: 0x%08x, erasesize: 0x%05x,"
+			" numblocks: %04u]\n",
+			(unsigned int) mtd->eraseregions[i].offset,
+			mtd->eraseregions[i].erasesize,
+			mtd->eraseregions[i].numblocks);
+
+	for (die = 0, mtd->size = 0; die < this->dies; die++) {
+		this->diesize[die] = (loff_t)blksperdie << this->erase_shift;
+		this->diesize[die] -= (loff_t)(this->boundary[die] + 1)
+						 << (this->erase_shift - 1);
+		mtd->size += this->diesize[die];
+	}
+}
+
+/**
+ * flexonenand_check_blocks_erased - Check if blocks are erased
+ * @param mtd		- mtd info structure
+ * @param start		- first erase block to check
+ * @param end		- last erase block to check
+ *
+ * Converting an unerased block from MLC to SLC causes byte values to
+ * change. Since both data and its ECC have changed, reads on the block
+ * give uncorrectable error and the block might be detected as bad.
+ * Avoid this by ensuring that the blocks to be converted are erased.
+ *
+ * Returns 0 if every block not reported bad has all-0xff OOB, 1 if some
+ * block is not erased, or the error from onenand_read_oob_nolock().
+ */
+static int flexonenand_check_blocks_erased(struct mtd_info *mtd, int start, int end)
+{
+	struct onenand_chip *this = mtd->priv;
+	int i, ret;
+	int block;
+	struct mtd_oob_ops ops = {
+		.mode = MTD_OOB_PLACE,
+		.ooboffs = 0,
+		.ooblen	= mtd->oobsize,
+		.datbuf	= NULL,
+		.oobbuf	= this->oob_buf,
+	};
+	loff_t addr;
+
+	printk(KERN_DEBUG "Check blocks from %d to %d\n", start, end);
+
+	for (block = start; block <= end; block++) {
+		addr = flexonenand_addr(this, block);
+		if (onenand_block_isbad_nolock(mtd, addr, 0))
+			continue;
+
+		/*
+		 * Since main area write results in ECC write to spare,
+		 * it is sufficient to check the spare (OOB) bytes for change.
+		 */
+		ret = onenand_read_oob_nolock(mtd, addr, &ops);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < mtd->oobsize; i++)
+			if (this->oob_buf[i] != 0xff)
+				break;
+
+		if (i != mtd->oobsize) {
+			printk(KERN_WARNING "Block %d not erased.\n", block);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * flexonenand_set_boundary	- Writes the SLC boundary
+ * @param mtd			- mtd info structure
+ * @param die			- die index whose boundary is changed
+ * @param boundary		- new boundary value; negative means no change
+ * @param lock			- non-zero writes the boundary as locked
+ * Returns 0 when done or unchanged, else -EINVAL or another non-zero error.
+ */
+int flexonenand_set_boundary(struct mtd_info *mtd, int die,
+				    int boundary, int lock)
+{
+	struct onenand_chip *this = mtd->priv;
+	int ret, density, blksperdie, old, new, thisboundary;
+	loff_t addr;
+
+	/* Change only once for SDP Flex-OneNAND */
+	if (die && (!ONENAND_IS_DDP(this)))
+		return 0;
+	/* boundary value of -1 indicates no required change */
+	if (boundary < 0 || boundary == this->boundary[die])
+		return 0;
+
+	density = onenand_get_density(this->device_id);
+	blksperdie = ((loff_t)(16 << density) << 20) >> this->erase_shift;
+	blksperdie >>= ONENAND_IS_DDP(this) ? 1 : 0;
+
+	if (boundary >= blksperdie) {
+		printk(KERN_ERR "flexonenand_set_boundary: Invalid boundary value. "
+				"Boundary not changed.\n");
+		return -EINVAL;
+	}
+
+	/* Check if converting blocks are erased */
+	old = this->boundary[die] + (die * this->density_mask);
+	new = boundary + (die * this->density_mask);
+	ret = flexonenand_check_blocks_erased(mtd, min(old, new) + 1, max(old, new));
+	if (ret) {
+		printk(KERN_ERR "flexonenand_set_boundary: Please erase blocks before boundary change\n");
+		return ret;
+	}
+
+	this->command(mtd, FLEXONENAND_CMD_PI_ACCESS, die, 0);
+	this->wait(mtd, FL_SYNCING);
+
+	/* Check if boundary is locked */
+	this->command(mtd, FLEXONENAND_CMD_READ_PI, die, 0);
+	ret = this->wait(mtd, FL_READING);
+	thisboundary = this->read_word(this->base + ONENAND_DATARAM);
+	if ((thisboundary >> FLEXONENAND_PI_UNLOCK_SHIFT) != 3) {
+		printk(KERN_ERR "flexonenand_set_boundary: boundary locked\n");
+		ret = 1;
+		goto out;
+	}
+
+	printk(KERN_INFO "flexonenand_set_boundary: Changing die %d boundary: %d%s\n",
+			die, boundary, lock ? "(Locked)" : "(Unlocked)");
+	addr = die ? this->diesize[0] : 0;
+	boundary &= FLEXONENAND_PI_MASK;
+	boundary |= lock ? 0 : (3 << FLEXONENAND_PI_UNLOCK_SHIFT);
+
+	this->command(mtd, ONENAND_CMD_ERASE, addr, 0);
+	ret = this->wait(mtd, FL_ERASING);
+	if (ret) {
+		printk(KERN_ERR "flexonenand_set_boundary: Failed PI erase for Die %d\n", die);
+		goto out;
+	}
+
+	this->write_word(boundary, this->base + ONENAND_DATARAM);
+	this->command(mtd, ONENAND_CMD_PROG, addr, 0);
+	ret = this->wait(mtd, FL_WRITING);
+	if (ret) {
+		printk(KERN_ERR "flexonenand_set_boundary: Failed PI write for Die %d\n", die);
+		goto out;
+	}
+
+	this->command(mtd, FLEXONENAND_CMD_PI_UPDATE, die, 0);
+	ret = this->wait(mtd, FL_WRITING);
+out:
+	this->write_word(ONENAND_CMD_RESET, this->base + ONENAND_REG_COMMAND);
+	this->wait(mtd, FL_RESETING);
+	if (!ret)
+		/* Recalculate device size on boundary change */
+		flexonenand_get_size(mtd);
+
+	return ret;
+}
+
+/**
+ * flexonenand_setup -	capture Flex-OneNAND boundary and lock
+ * 			values passed as kernel parameters
+ * @param s	kernel parameter string
+ */
+static int flexonenand_setup(char *s)
+{
+	int ints[5], i;	/* ints[0] is used as the count, ints[1..] as values */
+
+	s = get_options(s, 5, ints);
+
+	for (i = 0; i < ints[0]; i++)
+		flex_bdry[i] = ints[i + 1];	/* assumes flex_bdry has >= 4 slots - TODO confirm */
+
+	return 1;
+}
+
+__setup("onenand.bdry=", flexonenand_setup);
+
 /**
  * onenand_probe - [OneNAND Interface] Probe the OneNAND device
  * @param mtd		MTD device structure
@@ -2647,6 +3315,7 @@ static int onenand_probe(struct mtd_info *mtd)
 	maf_id = this->read_word(this->base + ONENAND_REG_MANUFACTURER_ID);
 	dev_id = this->read_word(this->base + ONENAND_REG_DEVICE_ID);
 	ver_id = this->read_word(this->base + ONENAND_REG_VERSION_ID);
+	this->technology = this->read_word(this->base + ONENAND_REG_TECHNOLOGY);
 
 	/* Check OneNAND device */
 	if (maf_id != bram_maf_id || dev_id != bram_dev_id)
@@ -2658,29 +3327,55 @@ static int onenand_probe(struct mtd_info *mtd)
 	this->version_id = ver_id;
 
 	density = onenand_get_density(dev_id);
+	if (FLEXONENAND(this)) {
+		this->dies = ONENAND_IS_DDP(this) ? 2 : 1;
+		/* Maximum possible erase regions */
+		mtd->numeraseregions = this->dies << 1;
+		mtd->eraseregions = kzalloc(sizeof(struct mtd_erase_region_info)
+					* (this->dies << 1), GFP_KERNEL);
+		if (!mtd->eraseregions)
+			return -ENOMEM;
+	}
+
+	/*
+	 * For Flex-OneNAND, chipsize represents maximum possible device size.
+	 * mtd->size represents the actual device size.
+	 */
 	this->chipsize = (16 << density) << 20;
-	/* Set density mask. it is used for DDP */
-	if (ONENAND_IS_DDP(this))
-		this->density_mask = (1 << (density + 6));
-	else
-		this->density_mask = 0;
 
 	/* OneNAND page size & block size */
 	/* The data buffer size is equal to page size */
 	mtd->writesize = this->read_word(this->base + ONENAND_REG_DATA_BUFFER_SIZE);
+	/* We use the full BufferRAM */
+	if (ONENAND_IS_MLC(this))
+		mtd->writesize <<= 1;
+
 	mtd->oobsize = mtd->writesize >> 5;
 	/* Pages per a block are always 64 in OneNAND */
 	mtd->erasesize = mtd->writesize << 6;
+	/*
+	 * Flex-OneNAND SLC area has 64 pages per block.
+	 * Flex-OneNAND MLC area has 128 pages per block.
+	 * Expose MLC erase size to find erase_shift and page_mask.
+	 */
+	if (FLEXONENAND(this))
+		mtd->erasesize <<= 1;
 
 	this->erase_shift = ffs(mtd->erasesize) - 1;
 	this->page_shift = ffs(mtd->writesize) - 1;
 	this->page_mask = (1 << (this->erase_shift - this->page_shift)) - 1;
+	/* Set density mask. it is used for DDP */
+	if (ONENAND_IS_DDP(this))
+		this->density_mask = this->chipsize >> (this->erase_shift + 1);
 	/* It's real page size */
 	this->writesize = mtd->writesize;
 
 	/* REVIST: Multichip handling */
 
-	mtd->size = this->chipsize;
+	if (FLEXONENAND(this))
+		flexonenand_get_size(mtd);
+	else
+		mtd->size = this->chipsize;
 
 	/* Check OneNAND features */
 	onenand_check_features(mtd);
@@ -2735,7 +3430,7 @@ static void onenand_resume(struct mtd_info *mtd)
  */
 int onenand_scan(struct mtd_info *mtd, int maxchips)
 {
-	int i;
+	int i, ret;
 	struct onenand_chip *this = mtd->priv;
 
 	if (!this->read_word)
@@ -2797,6 +3492,10 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	 * Allow subpage writes up to oobsize.
 	 */
 	switch (mtd->oobsize) {
+	case 128:
+		this->ecclayout = &onenand_oob_128;
+		mtd->subpage_sft = 0;
+		break;
 	case 64:
 		this->ecclayout = &onenand_oob_64;
 		mtd->subpage_sft = 2;
@@ -2862,7 +3561,16 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	/* Unlock whole block */
 	onenand_unlock_all(mtd);
 
-	return this->scan_bbt(mtd);
+	ret = this->scan_bbt(mtd);
+	if ((!FLEXONENAND(this)) || ret)
+		return ret;
+
+	/* Change Flex-OneNAND boundaries if required */
+	for (i = 0; i < MAX_DIES; i++)
+		flexonenand_set_boundary(mtd, i, flex_bdry[2 * i],
+						 flex_bdry[(2 * i) + 1]);
+
+	return 0;
 }
 
 /**
@@ -2891,6 +3599,7 @@ void onenand_release(struct mtd_info *mtd)
 		kfree(this->page_buf);
 	if (this->options & ONENAND_OOBBUF_ALLOC)
 		kfree(this->oob_buf);
+	kfree(mtd->eraseregions);
 }
 
 EXPORT_SYMBOL_GPL(onenand_scan);
diff --git a/drivers/mtd/onenand/onenand_bbt.c b/drivers/mtd/onenand/onenand_bbt.c
index 2f53b51c6805..a91fcac1af01 100644
--- a/drivers/mtd/onenand/onenand_bbt.c
+++ b/drivers/mtd/onenand/onenand_bbt.c
@@ -63,6 +63,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 	loff_t from;
 	size_t readlen, ooblen;
 	struct mtd_oob_ops ops;
+	int rgn;
 
 	printk(KERN_INFO "Scanning device for bad blocks\n");
 
@@ -76,7 +77,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 	/* Note that numblocks is 2 * (real numblocks) here;
 	 * see i += 2 below as it makses shifting and masking less painful
 	 */
-	numblocks = mtd->size >> (bbm->bbt_erase_shift - 1);
+	numblocks = this->chipsize >> (bbm->bbt_erase_shift - 1);
 	startblock = 0;
 	from = 0;
 
@@ -106,7 +107,12 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr
 			}
 		}
 		i += 2;
-		from += (1 << bbm->bbt_erase_shift);
+
+		if (FLEXONENAND(this)) {
+			rgn = flexonenand_region(mtd, from);
+			from += mtd->eraseregions[rgn].erasesize;
+		} else
+			from += (1 << bbm->bbt_erase_shift);
 	}
 
 	return 0;
@@ -143,7 +149,7 @@ static int onenand_isbad_bbt(struct mtd_info *mtd, loff_t offs, int allowbbt)
 	uint8_t res;
 
 	/* Get block number * 2 */
-	block = (int) (offs >> (bbm->bbt_erase_shift - 1));
+	block = (int) (onenand_block(this, offs) << 1);
 	res = (bbm->bbt[block >> 3] >> (block & 0x06)) & 0x03;
 
 	DEBUG(MTD_DEBUG_LEVEL2, "onenand_isbad_bbt: bbt info for offs 0x%08x: (block %d) 0x%02x\n",
@@ -178,7 +184,7 @@ int onenand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd)
 	struct bbm_info *bbm = this->bbm;
 	int len, ret = 0;
 
-	len = mtd->size >> (this->erase_shift + 2);
+	len = this->chipsize >> (this->erase_shift + 2);
 	/* Allocate memory (2bit per block) and clear the memory bad block table */
 	bbm->bbt = kzalloc(len, GFP_KERNEL);
 	if (!bbm->bbt) {
diff --git a/drivers/mtd/onenand/onenand_sim.c b/drivers/mtd/onenand/onenand_sim.c
index d64200b7c94b..f6e3c8aebd3a 100644
--- a/drivers/mtd/onenand/onenand_sim.c
+++ b/drivers/mtd/onenand/onenand_sim.c
@@ -6,6 +6,10 @@
  *  Copyright © 2005-2007 Samsung Electronics
  *  Kyungmin Park <kyungmin.park@samsung.com>
  *
+ *  Vishak G <vishak.g at samsung.com>, Rohit Hagargundgi <h.rohit at samsung.com>
+ *  Flex-OneNAND simulator support
+ *  Copyright (C) Samsung Electronics, 2008
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -24,16 +28,38 @@
 #ifndef CONFIG_ONENAND_SIM_MANUFACTURER
 #define CONFIG_ONENAND_SIM_MANUFACTURER         0xec
 #endif
+
 #ifndef CONFIG_ONENAND_SIM_DEVICE_ID
 #define CONFIG_ONENAND_SIM_DEVICE_ID            0x04
 #endif
+
+#define CONFIG_FLEXONENAND ((CONFIG_ONENAND_SIM_DEVICE_ID >> 9) & 1)
+
 #ifndef CONFIG_ONENAND_SIM_VERSION_ID
 #define CONFIG_ONENAND_SIM_VERSION_ID           0x1e
 #endif
 
+#ifndef CONFIG_ONENAND_SIM_TECHNOLOGY_ID
+#define CONFIG_ONENAND_SIM_TECHNOLOGY_ID CONFIG_FLEXONENAND
+#endif
+
+/* Initial boundary values for Flex-OneNAND Simulator */
+#ifndef CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY
+#define CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY	0x01
+#endif
+
+#ifndef CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY
+#define CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY	0x01
+#endif
+
 static int manuf_id	= CONFIG_ONENAND_SIM_MANUFACTURER;
 static int device_id	= CONFIG_ONENAND_SIM_DEVICE_ID;
 static int version_id	= CONFIG_ONENAND_SIM_VERSION_ID;
+static int technology_id = CONFIG_ONENAND_SIM_TECHNOLOGY_ID;
+static int boundary[] = {
+	CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY,
+	CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY,
+};
 
 struct onenand_flash {
 	void __iomem *base;
@@ -57,12 +83,18 @@ struct onenand_flash {
 	(writew(v, this->base + ONENAND_REG_WP_STATUS))
 
 /* It has all 0xff chars */
-#define MAX_ONENAND_PAGESIZE		(2048 + 64)
+#define MAX_ONENAND_PAGESIZE		(4096 + 128)
 static unsigned char *ffchars;
 
+#if CONFIG_FLEXONENAND
+#define PARTITION_NAME "Flex-OneNAND simulator partition"
+#else
+#define PARTITION_NAME "OneNAND simulator partition"
+#endif
+
 static struct mtd_partition os_partitions[] = {
 	{
-		.name		= "OneNAND simulator partition",
+		.name		= PARTITION_NAME,
 		.offset		= 0,
 		.size		= MTDPART_SIZ_FULL,
 	},
@@ -104,6 +136,7 @@ static void onenand_lock_handle(struct onenand_chip *this, int cmd)
 
 	switch (cmd) {
 	case ONENAND_CMD_UNLOCK:
+	case ONENAND_CMD_UNLOCK_ALL:
 		if (block_lock_scheme)
 			ONENAND_SET_WP_STATUS(ONENAND_WP_US, this);
 		else
@@ -228,10 +261,12 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd,
 {
 	struct mtd_info *mtd = &info->mtd;
 	struct onenand_flash *flash = this->priv;
-	int main_offset, spare_offset;
+	int main_offset, spare_offset, die = 0;
 	void __iomem *src;
 	void __iomem *dest;
 	unsigned int i;
+	static int pi_operation;
+	int erasesize, rgn;
 
 	if (dataram) {
 		main_offset = mtd->writesize;
@@ -241,10 +276,27 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd,
 		spare_offset = 0;
 	}
 
+	if (pi_operation) {
+		die = readw(this->base + ONENAND_REG_START_ADDRESS2);
+		die >>= ONENAND_DDP_SHIFT;
+	}
+
 	switch (cmd) {
+	case FLEXONENAND_CMD_PI_ACCESS:
+		pi_operation = 1;
+		break;
+
+	case ONENAND_CMD_RESET:
+		pi_operation = 0;
+		break;
+
 	case ONENAND_CMD_READ:
 		src = ONENAND_CORE(flash) + offset;
 		dest = ONENAND_MAIN_AREA(this, main_offset);
+		if (pi_operation) {
+			writew(boundary[die], this->base + ONENAND_DATARAM);
+			break;
+		}
 		memcpy(dest, src, mtd->writesize);
 		/* Fall through */
 
@@ -257,6 +309,10 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd,
 	case ONENAND_CMD_PROG:
 		src = ONENAND_MAIN_AREA(this, main_offset);
 		dest = ONENAND_CORE(flash) + offset;
+		if (pi_operation) {
+			boundary[die] = readw(this->base + ONENAND_DATARAM);
+			break;
+		}
 		/* To handle partial write */
 		for (i = 0; i < (1 << mtd->subpage_sft); i++) {
 			int off = i * this->subpagesize;
@@ -284,9 +340,18 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd,
 		break;
 
 	case ONENAND_CMD_ERASE:
-		memset(ONENAND_CORE(flash) + offset, 0xff, mtd->erasesize);
+		if (pi_operation)
+			break;
+
+		if (FLEXONENAND(this)) {
+			rgn = flexonenand_region(mtd, offset);
+			erasesize = mtd->eraseregions[rgn].erasesize;
+		} else
+			erasesize = mtd->erasesize;
+
+		memset(ONENAND_CORE(flash) + offset, 0xff, erasesize);
 		memset(ONENAND_CORE_SPARE(flash, this, offset), 0xff,
-		       (mtd->erasesize >> 5));
+		       (erasesize >> 5));
 		break;
 
 	default:
@@ -339,7 +404,7 @@ static void onenand_command_handle(struct onenand_chip *this, int cmd)
 	}
 
 	if (block != -1)
-		offset += block << this->erase_shift;
+		offset = onenand_addr(this, block);
 
 	if (page != -1)
 		offset += page << this->page_shift;
@@ -390,6 +455,7 @@ static int __init flash_init(struct onenand_flash *flash)
 	}
 
 	density = device_id >> ONENAND_DEVICE_DENSITY_SHIFT;
+	density &= ONENAND_DEVICE_DENSITY_MASK;
 	size = ((16 << 20) << density);
 
 	ONENAND_CORE(flash) = vmalloc(size + (size >> 5));
@@ -405,8 +471,9 @@ static int __init flash_init(struct onenand_flash *flash)
 	writew(manuf_id, flash->base + ONENAND_REG_MANUFACTURER_ID);
 	writew(device_id, flash->base + ONENAND_REG_DEVICE_ID);
 	writew(version_id, flash->base + ONENAND_REG_VERSION_ID);
+	writew(technology_id, flash->base + ONENAND_REG_TECHNOLOGY);
 
-	if (density < 2)
+	if (density < 2 && (!CONFIG_FLEXONENAND))
 		buffer_size = 0x0400;	/* 1KiB page */
 	else
 		buffer_size = 0x0800;	/* 2KiB page */
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 0fa3ac4ad576..9aab82c1c743 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -17,6 +17,7 @@
 #include <linux/mtd/onenand_regs.h>
 #include <linux/mtd/bbm.h>
 
+#define MAX_DIES		2
 #define MAX_BUFFERRAM		2
 
 /* Scan and identify a OneNAND device */
@@ -51,7 +52,12 @@ struct onenand_bufferram {
 /**
  * struct onenand_chip - OneNAND Private Flash Chip Data
  * @base:		[BOARDSPECIFIC] address to access OneNAND
+ * @dies:		[INTERN][FLEX-ONENAND] number of dies on chip
+ * @boundary:		[INTERN][FLEX-ONENAND] Boundary of the dies
+ * @diesize:		[INTERN][FLEX-ONENAND] Size of the dies
  * @chipsize:		[INTERN] the size of one chip for multichip arrays
+ *			FIXME For Flex-OneNAND, chipsize holds maximum possible
+ *			device size ie when all blocks are considered MLC
  * @device_id:		[INTERN] device ID
  * @density_mask:	chip density, used for DDP devices
  * @verstion_id:	[INTERN] version ID
@@ -92,9 +98,13 @@ struct onenand_bufferram {
  */
 struct onenand_chip {
 	void __iomem		*base;
+	unsigned		dies;
+	unsigned		boundary[MAX_DIES];
+	loff_t			diesize[MAX_DIES];
 	unsigned int		chipsize;
 	unsigned int		device_id;
 	unsigned int		version_id;
+	unsigned int		technology;
 	unsigned int		density_mask;
 	unsigned int		options;
 
@@ -145,6 +155,8 @@ struct onenand_chip {
 #define ONENAND_SET_BUFFERRAM0(this)		(this->bufferram_index = 0)
 #define ONENAND_SET_BUFFERRAM1(this)		(this->bufferram_index = 1)
 
+#define FLEXONENAND(this)						\
+	(this->device_id & DEVICE_IS_FLEXONENAND)
 #define ONENAND_GET_SYS_CFG1(this)					\
 	(this->read_word(this->base + ONENAND_REG_SYS_CFG1))
 #define ONENAND_SET_SYS_CFG1(v, this)					\
@@ -153,6 +165,9 @@ struct onenand_chip {
 #define ONENAND_IS_DDP(this)						\
 	(this->device_id & ONENAND_DEVICE_IS_DDP)
 
+#define ONENAND_IS_MLC(this)						\
+	(this->technology & ONENAND_TECHNOLOGY_IS_MLC)
+
 #ifdef CONFIG_MTD_ONENAND_2X_PROGRAM
 #define ONENAND_IS_2PLANE(this)						\
 	(this->options & ONENAND_HAS_2PLANE)
@@ -190,5 +205,8 @@ struct onenand_manufacturers {
 
 int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from,
 			 struct mtd_oob_ops *ops);
+unsigned onenand_block(struct onenand_chip *this, loff_t addr);
+loff_t onenand_addr(struct onenand_chip *this, int block);
+int flexonenand_region(struct mtd_info *mtd, loff_t addr);
 
 #endif	/* __LINUX_MTD_ONENAND_H */
diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index 0c6bbe28f38c..86a6bbef6465 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -67,6 +67,9 @@
 /*
  * Device ID Register F001h (R)
  */
+#define DEVICE_IS_FLEXONENAND		(1 << 9)
+#define FLEXONENAND_PI_MASK		(0x3ff)
+#define FLEXONENAND_PI_UNLOCK_SHIFT	(14)
 #define ONENAND_DEVICE_DENSITY_MASK	(0xf)
 #define ONENAND_DEVICE_DENSITY_SHIFT	(4)
 #define ONENAND_DEVICE_IS_DDP		(1 << 3)
@@ -83,6 +86,11 @@
  */
 #define ONENAND_VERSION_PROCESS_SHIFT	(8)
 
+/*
+ * Technology Register F006h (R)
+ */
+#define ONENAND_TECHNOLOGY_IS_MLC	(1 << 0)
+
 /*
  * Start Address 1 F100h (R/W) & Start Address 2 F101h (R/W)
  */
@@ -93,7 +101,8 @@
 /*
  * Start Address 8 F107h (R/W)
  */
-#define ONENAND_FPA_MASK		(0x3f)
+/* Note: It's actually 0x3f in case of SLC */
+#define ONENAND_FPA_MASK		(0x7f)
 #define ONENAND_FPA_SHIFT		(2)
 #define ONENAND_FSA_MASK		(0x03)
 
@@ -105,7 +114,8 @@
 #define ONENAND_BSA_BOOTRAM		(0 << 2)
 #define ONENAND_BSA_DATARAM0		(2 << 2)
 #define ONENAND_BSA_DATARAM1		(3 << 2)
-#define ONENAND_BSC_MASK		(0x03)
+/* Note: It's actually 0x03 in case of SLC */
+#define ONENAND_BSC_MASK		(0x07)
 
 /*
  * Command Register F220h (R/W)
@@ -124,9 +134,13 @@
 #define ONENAND_CMD_RESET		(0xF0)
 #define ONENAND_CMD_OTP_ACCESS		(0x65)
 #define ONENAND_CMD_READID		(0x90)
+#define FLEXONENAND_CMD_PI_UPDATE	(0x05)
+#define FLEXONENAND_CMD_PI_ACCESS	(0x66)
+#define FLEXONENAND_CMD_RECOVER_LSB	(0x05)
 
 /* NOTE: Those are not *REAL* commands */
 #define ONENAND_CMD_BUFFERRAM		(0x1978)
+#define FLEXONENAND_CMD_READ_PI		(0x1985)
 
 /*
  * System Configuration 1 Register F221h (R, R/W)
@@ -192,10 +206,12 @@
 #define ONENAND_ECC_1BIT_ALL		(0x5555)
 #define ONENAND_ECC_2BIT		(1 << 1)
 #define ONENAND_ECC_2BIT_ALL		(0xAAAA)
+#define FLEXONENAND_UNCORRECTABLE_ERROR	(0x1010)
 
 /*
  * One-Time Programmable (OTP)
  */
+#define FLEXONENAND_OTP_LOCK_OFFSET		(2048)
 #define ONENAND_OTP_LOCK_OFFSET		(14)
 
 #endif	/* __ONENAND_REG_H */
-- 
cgit v1.2.3-71-gd317


From 31bb999ee73748068ddc271dd99b22dcc418efe3 Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kmpark@infradead.org>
Date: Tue, 12 May 2009 13:46:57 -0700
Subject: mtd: onenand: add bbt_wait & unlock_all as replaceable for some
 platform

Add bbt_wait & unlock_all as replaceable hooks for some platforms such as
s3c64xx. s3c64xx has its own OneNAND controller and another interface.

Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/onenand/onenand_base.c | 12 ++++++++++--
 include/linux/mtd/onenand.h        |  5 +++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 8d4c9c253732..864327ed7fb3 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -1506,7 +1506,7 @@ int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from,
 
 		onenand_update_bufferram(mtd, from, 0);
 
-		ret = onenand_bbt_wait(mtd, FL_READING);
+		ret = this->bbt_wait(mtd, FL_READING);
 		if (unlikely(ret))
 			ret = onenand_recover_lsb(mtd, from, ret);
 
@@ -2527,6 +2527,10 @@ static void onenand_unlock_all(struct mtd_info *mtd)
 		    & ONENAND_CTRL_ONGO)
 			continue;
 
+		/* Don't check lock status */
+		if (this->options & ONENAND_SKIP_UNLOCK_CHECK)
+			return;
+
 		/* Check lock status */
 		if (onenand_check_lock_status(this))
 			return;
@@ -3442,6 +3446,10 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 		this->command = onenand_command;
 	if (!this->wait)
 		onenand_setup_wait(mtd);
+	if (!this->bbt_wait)
+		this->bbt_wait = onenand_bbt_wait;
+	if (!this->unlock_all)
+		this->unlock_all = onenand_unlock_all;
 
 	if (!this->read_bufferram)
 		this->read_bufferram = onenand_read_bufferram;
@@ -3559,7 +3567,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	mtd->owner = THIS_MODULE;
 
 	/* Unlock whole block */
-	onenand_unlock_all(mtd);
+	this->unlock_all(mtd);
 
 	ret = this->scan_bbt(mtd);
 	if ((!FLEXONENAND(this)) || ret)
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index 9aab82c1c743..8ed873374381 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -74,6 +74,8 @@ struct onenand_bufferram {
  * @command:		[REPLACEABLE] hardware specific function for writing
  *			commands to the chip
  * @wait:		[REPLACEABLE] hardware specific function for wait on ready
+ * @bbt_wait:		[REPLACEABLE] hardware specific function for bbt wait on ready
+ * @unlock_all:		[REPLACEABLE] hardware specific function for unlock all
  * @read_bufferram:	[REPLACEABLE] hardware specific function for BufferRAM Area
  * @write_bufferram:	[REPLACEABLE] hardware specific function for BufferRAM Area
  * @read_word:		[REPLACEABLE] hardware specific function for read
@@ -118,6 +120,8 @@ struct onenand_chip {
 
 	int (*command)(struct mtd_info *mtd, int cmd, loff_t address, size_t len);
 	int (*wait)(struct mtd_info *mtd, int state);
+	int (*bbt_wait)(struct mtd_info *mtd, int state);
+	void (*unlock_all)(struct mtd_info *mtd);
 	int (*read_bufferram)(struct mtd_info *mtd, int area,
 			unsigned char *buffer, int offset, size_t count);
 	int (*write_bufferram)(struct mtd_info *mtd, int area,
@@ -184,6 +188,7 @@ struct onenand_chip {
 #define ONENAND_HAS_CONT_LOCK		(0x0001)
 #define ONENAND_HAS_UNLOCK_ALL		(0x0002)
 #define ONENAND_HAS_2PLANE		(0x0004)
+#define ONENAND_SKIP_UNLOCK_CHECK	(0x0100)
 #define ONENAND_PAGEBUF_ALLOC		(0x1000)
 #define ONENAND_OOBBUF_ALLOC		(0x2000)
 
-- 
cgit v1.2.3-71-gd317


From c6f4a42de60b981dd210de01cd3e575835e3158e Mon Sep 17 00:00:00 2001
From: Minkyu Kang <mk7.kang@samsung.com>
Date: Fri, 5 Jun 2009 15:33:04 +0900
Subject: Add MAX17040 Fuel Gauge driver

The MAX17040 is an I2C-interfaced fuel gauge system for lithium-ion
batteries. This patch adds support for the MAX17040 Fuel Gauge.

Signed-off-by: Minkyu Kang <mk7.kang@samsung.com>
Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
---
 drivers/power/Kconfig            |   8 +
 drivers/power/Makefile           |   3 +-
 drivers/power/max17040_battery.c | 309 +++++++++++++++++++++++++++++++++++++++
 include/linux/max17040_battery.h |  19 +++
 4 files changed, 338 insertions(+), 1 deletion(-)
 create mode 100644 drivers/power/max17040_battery.c
 create mode 100644 include/linux/max17040_battery.h

(limited to 'include/linux')

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index 33da1127992a..7eda34838bfe 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -82,6 +82,14 @@ config BATTERY_DA9030
 	  Say Y here to enable support for batteries charger integrated into
 	  DA9030 PMIC.
 
+config BATTERY_MAX17040
+	tristate "Maxim MAX17040 Fuel Gauge"
+	depends on I2C
+	help
+	  MAX17040 is a fuel-gauge system for lithium-ion (Li+) batteries
+	  in handheld and portable equipment. The MAX17040 is configured
+	  to operate with a single lithium cell.
+
 config CHARGER_PCF50633
 	tristate "NXP PCF50633 MBC"
 	depends on MFD_PCF50633
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index 2fcf41d13e5c..daf3179689aa 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -25,4 +25,5 @@ obj-$(CONFIG_BATTERY_TOSA)	+= tosa_battery.o
 obj-$(CONFIG_BATTERY_WM97XX)	+= wm97xx_battery.o
 obj-$(CONFIG_BATTERY_BQ27x00)	+= bq27x00_battery.o
 obj-$(CONFIG_BATTERY_DA9030)	+= da9030_battery.o
-obj-$(CONFIG_CHARGER_PCF50633)	+= pcf50633-charger.o
\ No newline at end of file
+obj-$(CONFIG_BATTERY_MAX17040)	+= max17040_battery.o
+obj-$(CONFIG_CHARGER_PCF50633)	+= pcf50633-charger.o
diff --git a/drivers/power/max17040_battery.c b/drivers/power/max17040_battery.c
new file mode 100644
index 000000000000..87b98bf27ae1
--- /dev/null
+++ b/drivers/power/max17040_battery.c
@@ -0,0 +1,309 @@
+/*
+ *  max17040_battery.c
+ *  fuel-gauge systems for lithium-ion (Li+) batteries
+ *
+ *  Copyright (C) 2009 Samsung Electronics
+ *  Minkyu Kang <mk7.kang@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/delay.h>
+#include <linux/power_supply.h>
+#include <linux/max17040_battery.h>
+
+#define MAX17040_VCELL_MSB	0x02
+#define MAX17040_VCELL_LSB	0x03
+#define MAX17040_SOC_MSB	0x04
+#define MAX17040_SOC_LSB	0x05
+#define MAX17040_MODE_MSB	0x06
+#define MAX17040_MODE_LSB	0x07
+#define MAX17040_VER_MSB	0x08
+#define MAX17040_VER_LSB	0x09
+#define MAX17040_RCOMP_MSB	0x0C
+#define MAX17040_RCOMP_LSB	0x0D
+#define MAX17040_CMD_MSB	0xFE
+#define MAX17040_CMD_LSB	0xFF
+
+#define MAX17040_DELAY		1000
+#define MAX17040_BATTERY_FULL	95
+
+struct max17040_chip {
+	struct i2c_client		*client;
+	struct delayed_work		work;
+	struct power_supply		battery;
+	struct max17040_platform_data	*pdata;
+
+	/* battery presence, reported as POWER_SUPPLY_PROP_ONLINE */
+	int online;
+	/* raw cell voltage assembled from the VCELL MSB/LSB registers */
+	int vcell;
+	/* state of charge: the SOC MSB register, reported as CAPACITY */
+	int soc;
+	/* charging status, one of POWER_SUPPLY_STATUS_* */
+	int status;
+};
+
+static int max17040_get_property(struct power_supply *psy,
+			    enum power_supply_property psp,
+			    union power_supply_propval *val)
+{
+	struct max17040_chip *chip = container_of(psy,
+				struct max17040_chip, battery);
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_STATUS:
+		val->intval = chip->status;
+		break;
+	case POWER_SUPPLY_PROP_ONLINE:
+		val->intval = chip->online;
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		val->intval = chip->vcell;
+		break;
+	case POWER_SUPPLY_PROP_CAPACITY:
+		val->intval = chip->soc;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int max17040_write_reg(struct i2c_client *client, int reg, u8 value)
+{
+	int ret;
+
+	ret = i2c_smbus_write_byte_data(client, reg, value);
+
+	if (ret < 0)
+		dev_err(&client->dev, "%s: err %d\n", __func__, ret);
+
+	return ret;
+}
+
+static int max17040_read_reg(struct i2c_client *client, int reg)
+{
+	int ret;
+
+	ret = i2c_smbus_read_byte_data(client, reg);
+
+	if (ret < 0)
+		dev_err(&client->dev, "%s: err %d\n", __func__, ret);
+
+	return ret;
+}
+
+static void max17040_reset(struct i2c_client *client)
+{
+	max17040_write_reg(client, MAX17040_CMD_MSB, 0x54);
+	max17040_write_reg(client, MAX17040_CMD_LSB, 0x00);
+}
+
+static void max17040_get_vcell(struct i2c_client *client)
+{
+	struct max17040_chip *chip = i2c_get_clientdata(client);
+	int msb = max17040_read_reg(client, MAX17040_VCELL_MSB);
+	int lsb = max17040_read_reg(client, MAX17040_VCELL_LSB);
+
+	/* A negative value is an I2C error code: keep the last good vcell */
+	if (msb < 0 || lsb < 0)
+		return;
+	chip->vcell = (msb << 4) + (lsb >> 4);
+}
+
+static void max17040_get_soc(struct i2c_client *client)
+{
+	struct max17040_chip *chip = i2c_get_clientdata(client);
+	int msb = max17040_read_reg(client, MAX17040_SOC_MSB);
+
+	/* The LSB is still read as before, but only the MSB is reported */
+	max17040_read_reg(client, MAX17040_SOC_LSB);
+	/* A negative value is an I2C error code: keep the last good soc */
+	if (msb >= 0)
+		chip->soc = msb;
+}
+
+static void max17040_get_version(struct i2c_client *client)
+{
+	int msb = max17040_read_reg(client, MAX17040_VER_MSB);
+	int lsb = max17040_read_reg(client, MAX17040_VER_LSB);
+
+	/* A negative value is an I2C error code: no version to report */
+	if (msb < 0 || lsb < 0)
+		return;
+	dev_info(&client->dev, "MAX17040 Fuel-Gauge Ver %d%d\n", msb, lsb);
+}
+
+static void max17040_get_online(struct i2c_client *client)
+{
+	struct max17040_chip *chip = i2c_get_clientdata(client);
+	struct max17040_platform_data *pdata = chip->pdata;
+
+	/* Platform data and its hook are optional: default to "present" */
+	chip->online = (pdata && pdata->battery_online) ?
+				pdata->battery_online() : 1;
+}
+
+static void max17040_get_status(struct i2c_client *client)
+{
+	struct max17040_chip *chip = i2c_get_clientdata(client);
+	struct max17040_platform_data *pdata = chip->pdata;
+
+	/* Without platform data or both charger hooks the state is unknown */
+	if (!pdata || !pdata->charger_online || !pdata->charger_enable) {
+		chip->status = POWER_SUPPLY_STATUS_UNKNOWN;
+		return;
+	}
+
+	if (!pdata->charger_online())
+		chip->status = POWER_SUPPLY_STATUS_DISCHARGING;
+	else if (pdata->charger_enable())
+		chip->status = POWER_SUPPLY_STATUS_CHARGING;
+	else
+		chip->status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+	/* A capacity above the "full" threshold overrides the charger state */
+	if (chip->soc > MAX17040_BATTERY_FULL)
+		chip->status = POWER_SUPPLY_STATUS_FULL;
+}
+
+static void max17040_work(struct work_struct *work)
+{
+	struct max17040_chip *chip;
+
+	chip = container_of(work, struct max17040_chip, work.work);
+
+	max17040_get_vcell(chip->client);
+	max17040_get_soc(chip->client);
+	max17040_get_online(chip->client);
+	max17040_get_status(chip->client);
+
+	schedule_delayed_work(&chip->work, MAX17040_DELAY);
+}
+
+static enum power_supply_property max17040_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_ONLINE,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+};
+
+static int __devinit max17040_probe(struct i2c_client *client,
+			const struct i2c_device_id *id)
+{
+	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
+	struct max17040_chip *chip;
+	int ret;
+
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE))
+		return -EIO;
+
+	chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	chip->client = client;
+	chip->pdata = client->dev.platform_data;
+
+	i2c_set_clientdata(client, chip);
+
+	chip->battery.name		= "battery";
+	chip->battery.type		= POWER_SUPPLY_TYPE_BATTERY;
+	chip->battery.get_property	= max17040_get_property;
+	chip->battery.properties	= max17040_battery_props;
+	chip->battery.num_properties	= ARRAY_SIZE(max17040_battery_props);
+
+	ret = power_supply_register(&client->dev, &chip->battery);
+	if (ret) {
+		dev_err(&client->dev, "failed: power supply register\n");
+		i2c_set_clientdata(client, NULL);
+		kfree(chip);
+		return ret;
+	}
+
+	max17040_reset(client);
+	max17040_get_version(client);
+
+	INIT_DELAYED_WORK_DEFERRABLE(&chip->work, max17040_work);
+	schedule_delayed_work(&chip->work, MAX17040_DELAY);
+
+	return 0;
+}
+
+static int __devexit max17040_remove(struct i2c_client *client)
+{
+	struct max17040_chip *chip = i2c_get_clientdata(client);
+
+	power_supply_unregister(&chip->battery);
+	cancel_delayed_work_sync(&chip->work);	/* work re-arms itself */
+	i2c_set_clientdata(client, NULL);
+	kfree(chip);
+	return 0;
+}
+
+#ifdef CONFIG_PM
+
+static int max17040_suspend(struct i2c_client *client,
+		pm_message_t state)
+{
+	struct max17040_chip *chip = i2c_get_clientdata(client);
+
+	cancel_delayed_work_sync(&chip->work);	/* work re-arms itself */
+	return 0;
+}
+
+static int max17040_resume(struct i2c_client *client)
+{
+	struct max17040_chip *chip = i2c_get_clientdata(client);
+
+	schedule_delayed_work(&chip->work, MAX17040_DELAY);
+	return 0;
+}
+
+#else
+
+#define max17040_suspend NULL
+#define max17040_resume NULL
+
+#endif /* CONFIG_PM */
+
+static const struct i2c_device_id max17040_id[] = {
+	{ "max17040", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, max17040_id);
+
+static struct i2c_driver max17040_i2c_driver = {
+	.driver	= {
+		.name	= "max17040",
+	},
+	.probe		= max17040_probe,
+	.remove		= __devexit_p(max17040_remove),
+	.suspend	= max17040_suspend,
+	.resume		= max17040_resume,
+	.id_table	= max17040_id,
+};
+
+static int __init max17040_init(void)
+{
+	return i2c_add_driver(&max17040_i2c_driver);
+}
+module_init(max17040_init);
+
+static void __exit max17040_exit(void)
+{
+	i2c_del_driver(&max17040_i2c_driver);
+}
+module_exit(max17040_exit);
+
+MODULE_AUTHOR("Minkyu Kang <mk7.kang@samsung.com>");
+MODULE_DESCRIPTION("MAX17040 Fuel Gauge");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/max17040_battery.h b/include/linux/max17040_battery.h
new file mode 100644
index 000000000000..ad97b06cf930
--- /dev/null
+++ b/include/linux/max17040_battery.h
@@ -0,0 +1,19 @@
+/*
+ *  Copyright (C) 2009 Samsung Electronics
+ *  Minkyu Kang <mk7.kang@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __MAX17040_BATTERY_H_
+#define __MAX17040_BATTERY_H_
+
+struct max17040_platform_data {
+	int (*battery_online)(void);
+	int (*charger_online)(void);
+	int (*charger_enable)(void);
+};
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 04846b5b8112e53b588038349b3e92b8485c1807 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Mon, 20 Apr 2009 10:54:52 +0900
Subject: PCI MSI: Remove unused/obsolete macros and definitions

Impact: cleanup, spec compliance

This patch does:

 - Remove unused msi/msix_enable/disable macros.
   User should use msi/msix_set_enable() functions instead.

 - Remove unused msix_mask/unmask/pending macros.
   These macros are useless because they are not based on any of
   the PCI Local Bus Specifications properly.
   It seems that they were written based on a draft of PCI spec,
   and that the draft was the MSI-X ECN that underwent membership
   review in September 2002.
   (* In the draft, the size of an entry in the MSI-X table was 64 bits,
      containing 32-bit message data and a DWORD-aligned lower address
      plus a pending bit and a mask bit (30+1+1 bits). The higher
      address was placed in the MSI-X capability structure and shared
      by all entries.)

 - Remove PCI_MSIX_FLAGS_BITMASK.
   This definition also comes from the draft ECN.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.h        | 8 +-------
 include/linux/pci_regs.h | 1 -
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 71f4df2ef654..4fed59261952 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -19,18 +19,12 @@
 	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
 #define msi_mask_bits_reg(base, is64bit) \
 	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
-#define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
 #define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
 #define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
 
 #define msix_table_offset_reg(base)	(base + 0x04)
 #define msix_pba_offset_reg(base)	(base + 0x08)
-#define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
-#define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
 #define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
-#define multi_msix_capable		msix_table_size
-#define msix_unmask(address)	 	(address & ~PCI_MSIX_FLAGS_BITMASK)
-#define msix_mask(address)		(address | PCI_MSIX_FLAGS_BITMASK)
-#define msix_is_pending(address) 	(address & PCI_MSIX_FLAGS_PENDMASK)
+#define multi_msix_capable(control)	msix_table_size((control))
 
 #endif /* MSI_H */
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index 616bf8b3c8b5..dcba7668e0cd 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -304,7 +304,6 @@
 #define  PCI_MSIX_FLAGS_ENABLE	(1 << 15)
 #define  PCI_MSIX_FLAGS_MASKALL	(1 << 14)
 #define PCI_MSIX_FLAGS_BIRMASK	(7 << 0)
-#define PCI_MSIX_FLAGS_BITMASK	(1 << 0)
 
 /* CompactPCI Hotswap Register */
 
-- 
cgit v1.2.3-71-gd317


From 67b5db6502ddd27d65dea43bf036abbd82d0dfc9 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Mon, 20 Apr 2009 10:54:59 +0900
Subject: PCI MSI: Define PCI_MSI_MASK_32/64

Impact: cleanup, improve readability

Define PCI_MSI_MASK_32/64 for 32/64bit devices, instead of using
implicit offset (-4), "PCI_MSI_MASK_BIT - 4" and "PCI_MSI_MASK_BIT".

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/msi.c        | 2 +-
 drivers/pci/msi.h        | 6 +++---
 include/linux/pci_regs.h | 3 ++-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 362773247fbf..7ffac27d5d4a 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -381,7 +381,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec)
 	entry->msi_attrib.default_irq = dev->irq;	/* Save IOAPIC IRQ */
 	entry->msi_attrib.pos = pos;
 
-	entry->mask_pos = msi_mask_bits_reg(pos, entry->msi_attrib.is_64);
+	entry->mask_pos = msi_mask_reg(pos, entry->msi_attrib.is_64);
 	/* All MSIs are unmasked by default, Mask them all */
 	if (entry->msi_attrib.maskbit)
 		pci_read_config_dword(dev, entry->mask_pos, &entry->masked);
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 4fed59261952..a0662842550b 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -16,9 +16,9 @@
 #define msi_lower_address_reg(base)	(base + PCI_MSI_ADDRESS_LO)
 #define msi_upper_address_reg(base)	(base + PCI_MSI_ADDRESS_HI)
 #define msi_data_reg(base, is64bit)	\
-	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
-#define msi_mask_bits_reg(base, is64bit) \
-	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
+	(base + ((is64bit == 1) ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32))
+#define msi_mask_reg(base, is64bit)	\
+	(base + ((is64bit == 1) ? PCI_MSI_MASK_64 : PCI_MSI_MASK_32))
 #define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
 #define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
 
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index dcba7668e0cd..83b02f5a25b2 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -295,8 +295,9 @@
 #define PCI_MSI_ADDRESS_LO	4	/* Lower 32 bits */
 #define PCI_MSI_ADDRESS_HI	8	/* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
 #define PCI_MSI_DATA_32		8	/* 16 bits of data for 32-bit devices */
+#define PCI_MSI_MASK_32		12	/* Mask bits register for 32-bit devices */
 #define PCI_MSI_DATA_64		12	/* 16 bits of data for 64-bit devices */
-#define PCI_MSI_MASK_BIT	16	/* Mask bits register */
+#define PCI_MSI_MASK_64		16	/* Mask bits register for 64-bit devices */
 
 /* MSI-X registers (these are at offset PCI_MSIX_FLAGS) */
 #define PCI_MSIX_FLAGS		2
-- 
cgit v1.2.3-71-gd317


From 1f82de10d6b1d845155363c895c552e61b36b51a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 23 Apr 2009 20:48:32 -0700
Subject: PCI/x86: don't assume prefetchable ranges are 64bit

We should not assign 64bit ranges to PCI devices that only take 32bit
prefetchable addresses.

Try to set IORESOURCE_MEM_64 in 64bit resource of pci_device/pci_bridge
and make the bus resource only have that bit set when all devices under
it support 64bit prefetchable memory.  Use that flag to allocate
resources from that range.

Reported-by: Yannick <yannick.roehlly@free.fr>
Reviewed-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/pci.h |  1 +
 drivers/pci/bus.c          |  7 ++++++-
 drivers/pci/probe.c        |  9 ++++++--
 drivers/pci/setup-bus.c    | 52 +++++++++++++++++++++++++++++++++++++---------
 include/linux/ioport.h     |  2 ++
 include/linux/pci.h        |  4 ++++
 6 files changed, 62 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b51a1e8b0baf..927958d13c19 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -130,6 +130,7 @@ extern void pci_iommu_alloc(void);
 
 /* generic pci stuff */
 #include <asm-generic/pci.h>
+#define PCIBIOS_MAX_MEM_32 0xffffffff
 
 #ifdef CONFIG_NUMA
 /* Returns the node based on pci bus */
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 97a8194063b5..40af27f31043 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -41,9 +41,14 @@ pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
 		void *alignf_data)
 {
 	int i, ret = -ENOMEM;
+	resource_size_t max = -1;
 
 	type_mask |= IORESOURCE_IO | IORESOURCE_MEM;
 
+	/* don't allocate too high if the pref mem doesn't support 64bit*/
+	if (!(res->flags & IORESOURCE_MEM_64))
+		max = PCIBIOS_MAX_MEM_32;
+
 	for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) {
 		struct resource *r = bus->resource[i];
 		if (!r)
@@ -62,7 +67,7 @@ pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
 		/* Ok, try it out.. */
 		ret = allocate_resource(r, res, size,
 					r->start ? : min,
-					-1, align,
+					max, align,
 					alignf, alignf_data);
 		if (ret == 0)
 			break;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index f1ae2475ffff..b962326e3d95 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -193,7 +193,7 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 		res->flags |= pci_calc_resource_flags(l) | IORESOURCE_SIZEALIGN;
 		if (type == pci_bar_io) {
 			l &= PCI_BASE_ADDRESS_IO_MASK;
-			mask = PCI_BASE_ADDRESS_IO_MASK & 0xffff;
+			mask = PCI_BASE_ADDRESS_IO_MASK & IO_SPACE_LIMIT;
 		} else {
 			l &= PCI_BASE_ADDRESS_MEM_MASK;
 			mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
@@ -237,6 +237,8 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 			dev_printk(KERN_DEBUG, &dev->dev,
 				"reg %x 64bit mmio: %pR\n", pos, res);
 		}
+
+		res->flags |= IORESOURCE_MEM_64;
 	} else {
 		sz = pci_size(l, sz, mask);
 
@@ -362,7 +364,10 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child)
 		}
 	}
 	if (base <= limit) {
-		res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		res->flags = (mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) |
+					 IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		if (res->flags & PCI_PREF_RANGE_TYPE_64)
+			res->flags |= IORESOURCE_MEM_64;
 		res->start = base;
 		res->end = limit + 0xfffff;
 		dev_printk(KERN_DEBUG, &dev->dev, "bridge %sbit mmio pref: %pR\n",
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a00f85471b6e..e1c360a5b0db 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -143,6 +143,7 @@ static void pci_setup_bridge(struct pci_bus *bus)
 	struct pci_dev *bridge = bus->self;
 	struct pci_bus_region region;
 	u32 l, bu, lu, io_upper16;
+	int pref_mem64;
 
 	if (pci_is_enabled(bridge))
 		return;
@@ -198,16 +199,22 @@ static void pci_setup_bridge(struct pci_bus *bus)
 	pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, 0);
 
 	/* Set up PREF base/limit. */
+	pref_mem64 = 0;
 	bu = lu = 0;
 	pcibios_resource_to_bus(bridge, &region, bus->resource[2]);
 	if (bus->resource[2]->flags & IORESOURCE_PREFETCH) {
+		int width = 8;
 		l = (region.start >> 16) & 0xfff0;
 		l |= region.end & 0xfff00000;
-		bu = upper_32_bits(region.start);
-		lu = upper_32_bits(region.end);
-		dev_info(&bridge->dev, "  PREFETCH window: %#016llx-%#016llx\n",
-		    (unsigned long long)region.start,
-		    (unsigned long long)region.end);
+		if (bus->resource[2]->flags & IORESOURCE_MEM_64) {
+			pref_mem64 = 1;
+			bu = upper_32_bits(region.start);
+			lu = upper_32_bits(region.end);
+			width = 16;
+		}
+		dev_info(&bridge->dev, "  PREFETCH window: %#0*llx-%#0*llx\n",
+				width, (unsigned long long)region.start,
+				width, (unsigned long long)region.end);
 	}
 	else {
 		l = 0x0000fff0;
@@ -215,9 +222,11 @@ static void pci_setup_bridge(struct pci_bus *bus)
 	}
 	pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, l);
 
-	/* Set the upper 32 bits of PREF base & limit. */
-	pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, bu);
-	pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, lu);
+	if (pref_mem64) {
+		/* Set the upper 32 bits of PREF base & limit. */
+		pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, bu);
+		pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, lu);
+	}
 
 	pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl);
 }
@@ -255,8 +264,25 @@ static void pci_bridge_check_ranges(struct pci_bus *bus)
 		pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem);
 		pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, 0x0);
 	}
-	if (pmem)
+	if (pmem) {
 		b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		if ((pmem & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64)
+			b_res[2].flags |= IORESOURCE_MEM_64;
+	}
+
+	/* double check if bridge does support 64 bit pref */
+	if (b_res[2].flags & IORESOURCE_MEM_64) {
+		u32 mem_base_hi, tmp;
+		pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32,
+					 &mem_base_hi);
+		pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32,
+					       0xffffffff);
+		pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, &tmp);
+		if (!tmp)
+			b_res[2].flags &= ~IORESOURCE_MEM_64;
+		pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32,
+				       mem_base_hi);
+	}
 }
 
 /* Helper function for sizing routines: find first available
@@ -336,6 +362,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long
 	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
 	int order, max_order;
 	struct resource *b_res = find_free_bus_resource(bus, type);
+	unsigned int mem64_mask = 0;
 
 	if (!b_res)
 		return 0;
@@ -344,9 +371,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long
 	max_order = 0;
 	size = 0;
 
+	mem64_mask = b_res->flags & IORESOURCE_MEM_64;
+	b_res->flags &= ~IORESOURCE_MEM_64;
+
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		int i;
-		
+
 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 			struct resource *r = &dev->resource[i];
 			resource_size_t r_size;
@@ -372,6 +402,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long
 				aligns[order] += align;
 			if (order > max_order)
 				max_order = order;
+			mem64_mask &= r->flags & IORESOURCE_MEM_64;
 		}
 	}
 
@@ -396,6 +427,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long
 	b_res->start = min_align;
 	b_res->end = size + min_align - 1;
 	b_res->flags |= IORESOURCE_STARTALIGN;
+	b_res->flags |= mem64_mask;
 	return 1;
 }
 
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 32e4b2f72294..786e7b8cece9 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -49,6 +49,8 @@ struct resource_list {
 #define IORESOURCE_SIZEALIGN	0x00020000	/* size indicates alignment */
 #define IORESOURCE_STARTALIGN	0x00040000	/* start field is alignment */
 
+#define IORESOURCE_MEM_64	0x00100000
+
 #define IORESOURCE_EXCLUSIVE	0x08000000	/* Userland may not map this resource */
 #define IORESOURCE_DISABLED	0x10000000
 #define IORESOURCE_UNSET	0x20000000
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 72698d89e767..6dfa47d25ba4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1097,6 +1097,10 @@ static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus,
 
 #include <asm/pci.h>
 
+#ifndef PCIBIOS_MAX_MEM_32
+#define PCIBIOS_MAX_MEM_32 (-1)
+#endif
+
 /* these helpers provide future and backwards compatibility
  * for accessing popular PCI BAR info */
 #define pci_resource_start(dev, bar)	((dev)->resource[(bar)].start)
-- 
cgit v1.2.3-71-gd317


From 3b073eda9557975a87a27b08a46a545fe8da66fb Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Tue, 31 Mar 2009 09:24:22 -0600
Subject: PCI: remove deprecated pci_find_slot() interface

The last in-tree caller of pci_find_slot has been converted, so
let's get rid of this deprecated interface.

Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/search.c | 30 ------------------------------
 include/linux/pci.h  |  8 --------
 2 files changed, 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index 710d4ea69568..650bc0a538dc 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -114,36 +114,6 @@ pci_find_next_bus(const struct pci_bus *from)
 }
 
 #ifdef CONFIG_PCI_LEGACY
-/**
- * pci_find_slot - locate PCI device from a given PCI slot
- * @bus: number of PCI bus on which desired PCI device resides
- * @devfn: encodes number of PCI slot in which the desired PCI
- * device resides and the logical device number within that slot
- * in case of multi-function devices.
- *
- * Given a PCI bus and slot/function number, the desired PCI device
- * is located in system global list of PCI devices.  If the device
- * is found, a pointer to its data structure is returned.  If no
- * device is found, %NULL is returned.
- *
- * NOTE: Do not use this function any more; use pci_get_slot() instead, as
- * the PCI device returned by this function can disappear at any moment in
- * time.
- */
-struct pci_dev *pci_find_slot(unsigned int bus, unsigned int devfn)
-{
-	struct pci_dev *dev = NULL;
-
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-		if (dev->bus->number == bus && dev->devfn == devfn) {
-			pci_dev_put(dev);
-			return dev;
-		}
-	}
-	return NULL;
-}
-EXPORT_SYMBOL(pci_find_slot);
-
 /**
  * pci_find_device - begin or continue searching for a PCI device by vendor/device id
  * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 6dfa47d25ba4..19ee92c53ef7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -599,8 +599,6 @@ extern void pci_sort_breadthfirst(void);
 struct pci_dev __deprecated *pci_find_device(unsigned int vendor,
 					     unsigned int device,
 					     struct pci_dev *from);
-struct pci_dev __deprecated *pci_find_slot(unsigned int bus,
-					   unsigned int devfn);
 #endif /* CONFIG_PCI_LEGACY */
 
 enum pci_lost_interrupt_reason {
@@ -936,12 +934,6 @@ static inline struct pci_dev *pci_find_device(unsigned int vendor,
 	return NULL;
 }
 
-static inline struct pci_dev *pci_find_slot(unsigned int bus,
-					    unsigned int devfn)
-{
-	return NULL;
-}
-
 static inline struct pci_dev *pci_get_device(unsigned int vendor,
 					     unsigned int device,
 					     struct pci_dev *from)
-- 
cgit v1.2.3-71-gd317


From 43c16408842b0eeb367c23a6fa540ce69f99e347 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Wed, 22 Apr 2009 16:52:09 -0600
Subject: PCI: Add support for turning PCIe ECRC on or off

Adds support for PCI Express transaction layer end-to-end CRC checking
(ECRC).  This patch will enable/disable ECRC checking by setting/clearing
the ECRC Check Enable and/or ECRC Generation Enable bits for devices that
support ECRC.

The ECRC setting is controlled by the "pci=ecrc=<policy>" command-line
option. If this option is not set or is set to "bios", the enable and
generation bits are left in whatever state that firmware/BIOS set them to.
The "off" setting turns them off, and the "on" option turns them on (if the
device supports it).

Turning ECRC on or off can be a data integrity versus performance
tradeoff.  In theory, turning it on will catch more data errors, turning
it off means possibly better performance since CRC does not need to be
calculated by the PCIe hardware and packet sizes are reduced.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/kernel-parameters.txt |   6 ++
 drivers/pci/pci.c                   |   2 +
 drivers/pci/pcie/aer/Kconfig        |  13 ++++
 drivers/pci/pcie/aer/Makefile       |   2 +
 drivers/pci/pcie/aer/aerdrv_core.c  |  16 +++--
 drivers/pci/pcie/aer/ecrc.c         | 131 ++++++++++++++++++++++++++++++++++++
 include/linux/pci.h                 |  11 +++
 7 files changed, 174 insertions(+), 7 deletions(-)
 create mode 100644 drivers/pci/pcie/aer/ecrc.c

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 7bdaf5080408..395d1a013ebb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1824,6 +1824,12 @@ and is between 256 and 4096 characters. It is defined in the file
 				PAGE_SIZE is used as alignment.
 				PCI-PCI bridge can be specified, if resource
 				windows need to be expanded.
+		ecrc=		Enable/disable PCIe ECRC (transaction layer
+				end-to-end CRC checking).
+				bios: Use BIOS/firmware settings. This is the
+				default.
+				off: Turn ECRC off
+				on: Turn ECRC on.
 
 	pcie_aspm=	[PCIE] Forcibly enable or disable PCIe Active State Power
 			Management.
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 761557688b18..56fb18d2cb52 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2588,6 +2588,8 @@ static int __init pci_setup(char *str)
 			} else if (!strncmp(str, "resource_alignment=", 19)) {
 				pci_set_resource_alignment_param(str + 19,
 							strlen(str + 19));
+			} else if (!strncmp(str, "ecrc=", 5)) {
+				pcie_ecrc_get_policy(str + 5);
 			} else {
 				printk(KERN_ERR "PCI: Unknown option `%s'\n",
 						str);
diff --git a/drivers/pci/pcie/aer/Kconfig b/drivers/pci/pcie/aer/Kconfig
index c3bde588aa13..db4cb950933a 100644
--- a/drivers/pci/pcie/aer/Kconfig
+++ b/drivers/pci/pcie/aer/Kconfig
@@ -10,3 +10,16 @@ config PCIEAER
 	  This enables PCI Express Root Port Advanced Error Reporting
 	  (AER) driver support. Error reporting messages sent to Root
 	  Port will be handled by PCI Express AER driver.
+
+
+#
+# PCI Express ECRC
+#
+config PCIE_ECRC
+	bool "PCI Express ECRC settings control"
+	depends on PCIEAER
+	help
+	  Used to override firmware/bios settings for PCI Express ECRC
+	  (transaction layer end-to-end CRC checking).
+
+	  When in doubt, say N.
diff --git a/drivers/pci/pcie/aer/Makefile b/drivers/pci/pcie/aer/Makefile
index 8da3bd8455a8..7f93411c56e5 100644
--- a/drivers/pci/pcie/aer/Makefile
+++ b/drivers/pci/pcie/aer/Makefile
@@ -4,6 +4,8 @@
 
 obj-$(CONFIG_PCIEAER) += aerdriver.o
 
+obj-$(CONFIG_PCIE_ECRC)	+= ecrc.o
+
 aerdriver-objs := aerdrv_errprint.o aerdrv_core.o aerdrv.o
 aerdriver-$(CONFIG_ACPI) += aerdrv_acpi.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 307452f30035..dd3829e68e3f 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -113,15 +113,17 @@ static void set_device_error_reporting(struct pci_dev *dev, void *data)
 {
 	bool enable = *((bool *)data);
 
-	if (dev->pcie_type != PCIE_RC_PORT &&
-	    dev->pcie_type != PCIE_SW_UPSTREAM_PORT &&
-	    dev->pcie_type != PCIE_SW_DOWNSTREAM_PORT)
-		return;
+	if (dev->pcie_type == PCIE_RC_PORT ||
+	    dev->pcie_type == PCIE_SW_UPSTREAM_PORT ||
+	    dev->pcie_type == PCIE_SW_DOWNSTREAM_PORT) {
+		if (enable)
+			pci_enable_pcie_error_reporting(dev);
+		else
+			pci_disable_pcie_error_reporting(dev);
+	}
 
 	if (enable)
-		pci_enable_pcie_error_reporting(dev);
-	else
-		pci_disable_pcie_error_reporting(dev);
+		pcie_set_ecrc_checking(dev);
 }
 
 /**
diff --git a/drivers/pci/pcie/aer/ecrc.c b/drivers/pci/pcie/aer/ecrc.c
new file mode 100644
index 000000000000..ece97df4df6d
--- /dev/null
+++ b/drivers/pci/pcie/aer/ecrc.c
@@ -0,0 +1,131 @@
+/*
+ *    Enables/disables PCIe ECRC checking.
+ *
+ *    (C) Copyright 2009 Hewlett-Packard Development Company, L.P.
+ *    Andrew Patterson <andrew.patterson@hp.com>
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; version 2 of the License.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ *    General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ *    02111-1307, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/errno.h>
+#include "../../pci.h"
+
+#define ECRC_POLICY_DEFAULT 0		/* ECRC set by BIOS */
+#define ECRC_POLICY_OFF     1		/* ECRC off for performance */
+#define ECRC_POLICY_ON      2		/* ECRC on for data integrity */
+
+static int ecrc_policy = ECRC_POLICY_DEFAULT;
+
+static const char *ecrc_policy_str[] = {
+	[ECRC_POLICY_DEFAULT] = "bios",
+	[ECRC_POLICY_OFF] = "off",
+	[ECRC_POLICY_ON] = "on"
+};
+
+/**
+ * enable_ecrc_checking - enable PCIe ECRC checking for a device
+ * @dev: the PCI device
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+static int enable_ecrc_checking(struct pci_dev *dev)
+{
+	int pos;
+	u32 reg32;
+
+	if (!dev->is_pcie)
+		return -ENODEV;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+	if (!pos)
+		return -ENODEV;
+
+	pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
+	if (reg32 & PCI_ERR_CAP_ECRC_GENC)
+		reg32 |= PCI_ERR_CAP_ECRC_GENE;
+	if (reg32 & PCI_ERR_CAP_ECRC_CHKC)
+		reg32 |= PCI_ERR_CAP_ECRC_CHKE;
+	pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
+
+	return 0;
+}
+
+/**
+ * disable_ecrc_checking - disables PCIe ECRC checking for a device
+ * @dev: the PCI device
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+static int disable_ecrc_checking(struct pci_dev *dev)
+{
+	int pos;
+	u32 reg32;
+
+	if (!dev->is_pcie)
+		return -ENODEV;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+	if (!pos)
+		return -ENODEV;
+
+	pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
+	reg32 &= ~(PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+	pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
+
+	return 0;
+}
+
+/**
+ * pcie_set_ecrc_checking - set/unset PCIe ECRC checking for a device based on global policy
+ * @dev: the PCI device
+ */
+void pcie_set_ecrc_checking(struct pci_dev *dev)
+{
+	switch (ecrc_policy) {
+	case ECRC_POLICY_DEFAULT:
+		return;
+	case ECRC_POLICY_OFF:
+		disable_ecrc_checking(dev);
+		break;
+	case ECRC_POLICY_ON:
+		enable_ecrc_checking(dev);;
+		break;
+	default:
+		return;
+	}
+}
+
+/**
+ * pcie_ecrc_get_policy - parse kernel command-line ecrc option
+ */
+void pcie_ecrc_get_policy(char *str)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ecrc_policy_str); i++)
+		if (!strncmp(str, ecrc_policy_str[i],
+			     strlen(ecrc_policy_str[i])))
+			break;
+	if (i >= ARRAY_SIZE(ecrc_policy_str))
+		return;
+
+	ecrc_policy = i;
+}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 19ee92c53ef7..ec03b90d3510 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -878,6 +878,17 @@ static inline int pcie_aspm_enabled(void)
 extern int pcie_aspm_enabled(void);
 #endif
 
+#ifndef CONFIG_PCIE_ECRC
+static inline void pcie_set_ecrc_checking(struct pci_dev *dev)
+{
+	return;
+}
+static inline void pcie_ecrc_get_policy(char *str) {};
+#else
+extern void pcie_set_ecrc_checking(struct pci_dev *dev);
+extern void pcie_ecrc_get_policy(char *str);
+#endif
+
 #define pci_enable_msi(pdev)	pci_enable_msi_block(pdev, 1)
 
 #ifdef CONFIG_HT_IRQ
-- 
cgit v1.2.3-71-gd317


From c4bf2f372db09ef8d16a25a60d523bfa1c50f7b5 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 11 Jun 2009 23:53:55 -0400
Subject: ACPI, PCI, x86: move MCFG parsing routine from ACPI to PCI file

Move
arch/x86/kernel/acpi/boot.c: acpi_parse_mcfg()
to
arch/x86/pci/mmconfig-shared.c: pci_parse_mcfg()
where it is used, and make it static.

Move associated globals and helper routine with it.

No functional change.

This code move is in preparation for SFI support,
which will allow the PCI code to find the MCFG table
on systems which do not support ACPI.

Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/pci_x86.h |  3 ++
 arch/x86/kernel/acpi/boot.c    | 66 ------------------------------------------
 arch/x86/pci/mmconfig-shared.c | 65 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/acpi.h           |  3 --
 4 files changed, 67 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e60fd3e14bdf..b399988eee3a 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -121,6 +121,9 @@ extern int __init pcibios_init(void);
 extern int __init pci_mmcfg_arch_init(void);
 extern void __init pci_mmcfg_arch_free(void);
 
+extern struct acpi_mcfg_allocation *pci_mmcfg_config;
+extern int pci_mmcfg_config_num;
+
 /*
  * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
  * on their northbrige except through the * %eax register. As such, you MUST
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 817d6a5e115d..f54e0e557cd2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -117,72 +117,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
 	early_iounmap(map, size);
 }
 
-#ifdef CONFIG_PCI_MMCONFIG
-
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
-/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
-struct acpi_mcfg_allocation *pci_mmcfg_config;
-int pci_mmcfg_config_num;
-
-static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
-{
-	if (!strcmp(mcfg->header.oem_id, "SGI"))
-		acpi_mcfg_64bit_base_addr = TRUE;
-
-	return 0;
-}
-
-int __init acpi_parse_mcfg(struct acpi_table_header *header)
-{
-	struct acpi_table_mcfg *mcfg;
-	unsigned long i;
-	int config_size;
-
-	if (!header)
-		return -EINVAL;
-
-	mcfg = (struct acpi_table_mcfg *)header;
-
-	/* how many config structures do we have */
-	pci_mmcfg_config_num = 0;
-	i = header->length - sizeof(struct acpi_table_mcfg);
-	while (i >= sizeof(struct acpi_mcfg_allocation)) {
-		++pci_mmcfg_config_num;
-		i -= sizeof(struct acpi_mcfg_allocation);
-	};
-	if (pci_mmcfg_config_num == 0) {
-		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
-		return -ENODEV;
-	}
-
-	config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
-	pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
-	if (!pci_mmcfg_config) {
-		printk(KERN_WARNING PREFIX
-		       "No memory for MCFG config tables\n");
-		return -ENOMEM;
-	}
-
-	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
-
-	acpi_mcfg_oem_check(mcfg);
-
-	for (i = 0; i < pci_mmcfg_config_num; ++i) {
-		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
-		    !acpi_mcfg_64bit_base_addr) {
-			printk(KERN_ERR PREFIX
-			       "MMCONFIG not in low 4GB of memory\n");
-			kfree(pci_mmcfg_config);
-			pci_mmcfg_config_num = 0;
-			return -ENODEV;
-		}
-	}
-
-	return 0;
-}
-#endif				/* CONFIG_PCI_MMCONFIG */
-
 #ifdef CONFIG_X86_LOCAL_APIC
 static int __init acpi_parse_madt(struct acpi_table_header *table)
 {
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 8766b0e216c5..712443ec6d43 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -523,6 +523,69 @@ reject:
 
 static int __initdata known_bridge;
 
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
+/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
+struct acpi_mcfg_allocation *pci_mmcfg_config;
+int pci_mmcfg_config_num;
+
+static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
+{
+	if (!strcmp(mcfg->header.oem_id, "SGI"))
+		acpi_mcfg_64bit_base_addr = TRUE;
+
+	return 0;
+}
+
+static int __init pci_parse_mcfg(struct acpi_table_header *header)
+{
+	struct acpi_table_mcfg *mcfg;
+	unsigned long i;
+	int config_size;
+
+	if (!header)
+		return -EINVAL;
+
+	mcfg = (struct acpi_table_mcfg *)header;
+
+	/* how many config structures do we have */
+	pci_mmcfg_config_num = 0;
+	i = header->length - sizeof(struct acpi_table_mcfg);
+	while (i >= sizeof(struct acpi_mcfg_allocation)) {
+		++pci_mmcfg_config_num;
+		i -= sizeof(struct acpi_mcfg_allocation);
+	};
+	if (pci_mmcfg_config_num == 0) {
+		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
+		return -ENODEV;
+	}
+
+	config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
+	pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
+	if (!pci_mmcfg_config) {
+		printk(KERN_WARNING PREFIX
+		       "No memory for MCFG config tables\n");
+		return -ENOMEM;
+	}
+
+	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
+
+	acpi_mcfg_oem_check(mcfg);
+
+	for (i = 0; i < pci_mmcfg_config_num; ++i) {
+		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
+		    !acpi_mcfg_64bit_base_addr) {
+			printk(KERN_ERR PREFIX
+			       "MMCONFIG not in low 4GB of memory\n");
+			kfree(pci_mmcfg_config);
+			pci_mmcfg_config_num = 0;
+			return -ENODEV;
+		}
+	}
+
+	return 0;
+}
+
 static void __init __pci_mmcfg_init(int early)
 {
 	/* MMCONFIG disabled */
@@ -543,7 +606,7 @@ static void __init __pci_mmcfg_init(int early)
 	}
 
 	if (!known_bridge)
-		acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+		acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
 
 	pci_mmcfg_reject_broken(early);
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 88be890ee3c7..73cb141150df 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -113,9 +113,6 @@ void acpi_irq_stats_init(void);
 extern u32 acpi_irq_handled;
 extern u32 acpi_irq_not_handled;
 
-extern struct acpi_mcfg_allocation *pci_mmcfg_config;
-extern int pci_mmcfg_config_num;
-
 extern int sbf_port;
 extern unsigned long acpi_realmode_flags;
 
-- 
cgit v1.2.3-71-gd317


From 4a7a16dc061e4c57bf288150f51bd4c2ace33723 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 12 Jun 2009 20:42:08 -0400
Subject: ACPI: move declaration acpi_early_init() to acpi.h

Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/acpi.h | 3 +++
 init/main.c          | 6 +-----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 73cb141150df..bf17681cb06f 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -290,7 +290,10 @@ void __init acpi_s4_no_nvs(void);
 				OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL)
 
 extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags);
+extern void acpi_early_init(void);
+
 #else	/* CONFIG_ACPI */
+static inline void acpi_early_init(void) { }
 
 static inline int early_acpi_boot_init(void)
 {
diff --git a/init/main.c b/init/main.c
index d721dad05dd7..f1b9f0fdb1b4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -24,6 +24,7 @@
 #include <linux/smp_lock.h>
 #include <linux/initrd.h>
 #include <linux/bootmem.h>
+#include <linux/acpi.h>
 #include <linux/tty.h>
 #include <linux/gfp.h>
 #include <linux/percpu.h>
@@ -86,11 +87,6 @@ extern void sbus_init(void);
 extern void prio_tree_init(void);
 extern void radix_tree_init(void);
 extern void free_initmem(void);
-#ifdef	CONFIG_ACPI
-extern void acpi_early_init(void);
-#else
-static inline void acpi_early_init(void) { }
-#endif
 #ifndef CONFIG_DEBUG_RODATA
 static inline void mark_rodata_ro(void) { }
 #endif
-- 
cgit v1.2.3-71-gd317


From c76acec6d55107b652a37c90b36c00bc8b04dabb Mon Sep 17 00:00:00 2001
From: Jay Fenlason <fenlason@redhat.com>
Date: Mon, 18 May 2009 13:08:06 -0400
Subject: firewire: add IPv4 support

Implement IPv4 over IEEE 1394 as per RFC 2734 for the newer firewire
stack.  This feature has only been present in the older ieee1394 stack
via the eth1394 driver.

Still to do:
  - fix ipv4_priv and ipv4_node lifetime logic
  - fix determination of speeds and max payloads
  - fix bus reset handling
  - fix unaligned memory accesses
  - fix coding style
  - further testing/ improvement of fragment reassembly
  - perhaps multicast support

Signed-off-by: Jay Fenlason <fenlason@redhat.com>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de> (rebased, copyright note, changelog)
---
 drivers/firewire/Makefile    |    2 +
 drivers/firewire/core-card.c |    4 +
 drivers/firewire/core-iso.c  |    7 +
 drivers/firewire/core.h      |   87 --
 drivers/firewire/fw-ipv4.c   | 1819 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/firewire.h     |   94 +++
 6 files changed, 1926 insertions(+), 87 deletions(-)
 create mode 100644 drivers/firewire/fw-ipv4.c

(limited to 'include/linux')

diff --git a/drivers/firewire/Makefile b/drivers/firewire/Makefile
index bc3b9bf822bf..31edf30c558d 100644
--- a/drivers/firewire/Makefile
+++ b/drivers/firewire/Makefile
@@ -6,7 +6,9 @@ firewire-core-y += core-card.o core-cdev.o core-device.o \
                    core-iso.o core-topology.o core-transaction.o
 firewire-ohci-y += ohci.o
 firewire-sbp2-y += sbp2.o
+firewire-ipv4-y += fw-ipv4.o
 
 obj-$(CONFIG_FIREWIRE) += firewire-core.o
 obj-$(CONFIG_FIREWIRE_OHCI) += firewire-ohci.o
 obj-$(CONFIG_FIREWIRE_SBP2) += firewire-sbp2.o
+obj-$(CONFIG_FIREWIRE_IPV4) += firewire-ipv4.o
diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 4c1be64fdddd..cdab32b20675 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -176,6 +176,7 @@ int fw_core_add_descriptor(struct fw_descriptor *desc)
 
 	return 0;
 }
+EXPORT_SYMBOL(fw_core_add_descriptor);
 
 void fw_core_remove_descriptor(struct fw_descriptor *desc)
 {
@@ -189,6 +190,7 @@ void fw_core_remove_descriptor(struct fw_descriptor *desc)
 
 	mutex_unlock(&card_mutex);
 }
+EXPORT_SYMBOL(fw_core_remove_descriptor);
 
 static void allocate_broadcast_channel(struct fw_card *card, int generation)
 {
@@ -427,6 +429,8 @@ void fw_card_initialize(struct fw_card *card,
 	card->local_node = NULL;
 
 	INIT_DELAYED_WORK(&card->work, fw_card_bm_work);
+	card->netdev = NULL;
+	INIT_LIST_HEAD(&card->ipv4_nodes);
 }
 EXPORT_SYMBOL(fw_card_initialize);
 
diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c
index 28076c892d7e..448ddd7d887b 100644
--- a/drivers/firewire/core-iso.c
+++ b/drivers/firewire/core-iso.c
@@ -80,6 +80,7 @@ int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card,
 
 	return -ENOMEM;
 }
+EXPORT_SYMBOL(fw_iso_buffer_init);
 
 int fw_iso_buffer_map(struct fw_iso_buffer *buffer, struct vm_area_struct *vma)
 {
@@ -114,6 +115,7 @@ void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer,
 	kfree(buffer->pages);
 	buffer->pages = NULL;
 }
+EXPORT_SYMBOL(fw_iso_buffer_destroy);
 
 struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
 		int type, int channel, int speed, size_t header_size,
@@ -136,6 +138,7 @@ struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
 
 	return ctx;
 }
+EXPORT_SYMBOL(fw_iso_context_create);
 
 void fw_iso_context_destroy(struct fw_iso_context *ctx)
 {
@@ -143,12 +146,14 @@ void fw_iso_context_destroy(struct fw_iso_context *ctx)
 
 	card->driver->free_iso_context(ctx);
 }
+EXPORT_SYMBOL(fw_iso_context_destroy);
 
 int fw_iso_context_start(struct fw_iso_context *ctx,
 			 int cycle, int sync, int tags)
 {
 	return ctx->card->driver->start_iso(ctx, cycle, sync, tags);
 }
+EXPORT_SYMBOL(fw_iso_context_start);
 
 int fw_iso_context_queue(struct fw_iso_context *ctx,
 			 struct fw_iso_packet *packet,
@@ -159,11 +164,13 @@ int fw_iso_context_queue(struct fw_iso_context *ctx,
 
 	return card->driver->queue_iso(ctx, packet, buffer, payload);
 }
+EXPORT_SYMBOL(fw_iso_context_queue);
 
 int fw_iso_context_stop(struct fw_iso_context *ctx)
 {
 	return ctx->card->driver->stop_iso(ctx);
 }
+EXPORT_SYMBOL(fw_iso_context_stop);
 
 /*
  * Isochronous bus resource management (channels, bandwidth), client side
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 0a25a7b38a80..c3cfc647e5e3 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -1,7 +1,6 @@
 #ifndef _FIREWIRE_CORE_H
 #define _FIREWIRE_CORE_H
 
-#include <linux/dma-mapping.h>
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/idr.h>
@@ -97,17 +96,6 @@ int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset);
 int fw_compute_block_crc(u32 *block);
 void fw_schedule_bm_work(struct fw_card *card, unsigned long delay);
 
-struct fw_descriptor {
-	struct list_head link;
-	size_t length;
-	u32 immediate;
-	u32 key;
-	const u32 *data;
-};
-
-int fw_core_add_descriptor(struct fw_descriptor *desc);
-void fw_core_remove_descriptor(struct fw_descriptor *desc);
-
 
 /* -cdev */
 
@@ -130,77 +118,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event);
 
 /* -iso */
 
-/*
- * The iso packet format allows for an immediate header/payload part
- * stored in 'header' immediately after the packet info plus an
- * indirect payload part that is pointer to by the 'payload' field.
- * Applications can use one or the other or both to implement simple
- * low-bandwidth streaming (e.g. audio) or more advanced
- * scatter-gather streaming (e.g. assembling video frame automatically).
- */
-struct fw_iso_packet {
-	u16 payload_length;	/* Length of indirect payload. */
-	u32 interrupt:1;	/* Generate interrupt on this packet */
-	u32 skip:1;		/* Set to not send packet at all. */
-	u32 tag:2;
-	u32 sy:4;
-	u32 header_length:8;	/* Length of immediate header. */
-	u32 header[0];
-};
-
-#define FW_ISO_CONTEXT_TRANSMIT	0
-#define FW_ISO_CONTEXT_RECEIVE	1
-
-#define FW_ISO_CONTEXT_MATCH_TAG0	 1
-#define FW_ISO_CONTEXT_MATCH_TAG1	 2
-#define FW_ISO_CONTEXT_MATCH_TAG2	 4
-#define FW_ISO_CONTEXT_MATCH_TAG3	 8
-#define FW_ISO_CONTEXT_MATCH_ALL_TAGS	15
-
-/*
- * An iso buffer is just a set of pages mapped for DMA in the
- * specified direction.  Since the pages are to be used for DMA, they
- * are not mapped into the kernel virtual address space.  We store the
- * DMA address in the page private. The helper function
- * fw_iso_buffer_map() will map the pages into a given vma.
- */
-struct fw_iso_buffer {
-	enum dma_data_direction direction;
-	struct page **pages;
-	int page_count;
-};
-
-typedef void (*fw_iso_callback_t)(struct fw_iso_context *context,
-				  u32 cycle, size_t header_length,
-				  void *header, void *data);
-
-struct fw_iso_context {
-	struct fw_card *card;
-	int type;
-	int channel;
-	int speed;
-	size_t header_size;
-	fw_iso_callback_t callback;
-	void *callback_data;
-};
-
-int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card,
-		       int page_count, enum dma_data_direction direction);
 int fw_iso_buffer_map(struct fw_iso_buffer *buffer, struct vm_area_struct *vma);
-void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card);
-
-struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
-		int type, int channel, int speed, size_t header_size,
-		fw_iso_callback_t callback, void *callback_data);
-int fw_iso_context_queue(struct fw_iso_context *ctx,
-			 struct fw_iso_packet *packet,
-			 struct fw_iso_buffer *buffer,
-			 unsigned long payload);
-int fw_iso_context_start(struct fw_iso_context *ctx,
-			 int cycle, int sync, int tags);
-int fw_iso_context_stop(struct fw_iso_context *ctx);
-void fw_iso_context_destroy(struct fw_iso_context *ctx);
-
 void fw_iso_resource_manage(struct fw_card *card, int generation,
 		u64 channels_mask, int *channel, int *bandwidth, bool allocate);
 
@@ -285,9 +203,4 @@ void fw_flush_transactions(struct fw_card *card);
 void fw_send_phy_config(struct fw_card *card,
 			int node_id, int generation, int gap_count);
 
-static inline int fw_stream_packet_destination_id(int tag, int channel, int sy)
-{
-	return tag << 14 | channel << 8 | sy;
-}
-
 #endif /* _FIREWIRE_CORE_H */
diff --git a/drivers/firewire/fw-ipv4.c b/drivers/firewire/fw-ipv4.c
new file mode 100644
index 000000000000..4de6dbb95f0c
--- /dev/null
+++ b/drivers/firewire/fw-ipv4.c
@@ -0,0 +1,1819 @@
+/*
+ * IPv4 over IEEE 1394, per RFC 2734
+ *
+ * Copyright (C) 2009 Jay Fenlason <fenlason@redhat.com>
+ *
+ * based on eth1394 by Ben Collins et al
+ */
+
+#include <linux/device.h>
+#include <linux/ethtool.h>
+#include <linux/firewire.h>
+#include <linux/firewire-constants.h>
+#include <linux/highmem.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include <asm/unaligned.h>
+#include <net/arp.h>
+
+/* Things to potentially make runtime configurable */
+/* must be at least as large as our maximum receive size */
+#define FIFO_SIZE 4096
+/* Network timeout in glibbles */
+#define IPV4_TIMEOUT       100000
+
+/* Runtime configurable parameters */
+static int ipv4_mpd = 25;
+static int ipv4_max_xmt = 0;
+/* 16k for receiving arp and broadcast packets.  Enough? */
+static int ipv4_iso_page_count = 4;
+
+MODULE_AUTHOR("Jay Fenlason (fenlason@redhat.com)");
+MODULE_DESCRIPTION("Firewire IPv4 Driver (IPv4-over-IEEE1394 as per RFC 2734)");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(ieee1394, ipv4_id_table);
+module_param_named(max_partial_datagrams, ipv4_mpd, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_partial_datagrams, "Maximum number of received"
+ " incomplete fragmented datagrams (default = 25).");
+
+/* Max xmt is useful for forcing fragmentation, which makes testing easier. */
+module_param_named(max_transmit, ipv4_max_xmt, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_transmit, "Maximum datagram size to transmit"
+ " (larger datagrams will be fragmented) (default = 0 (use hardware defaults).");
+
+/* iso page count controls how many pages will be used for receiving broadcast packets. */
+module_param_named(iso_pages, ipv4_iso_page_count, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(iso_pages, "Number of pages to use for receiving broadcast packets"
+ " (default = 4).");
+
+/* uncomment this line to do debugging */
+#define fw_debug(s, args...) printk(KERN_DEBUG KBUILD_MODNAME ": " s, ## args)
+
+/* comment out these lines to do debugging. */
+/* #undef fw_debug */
+/* #define fw_debug(s...) */
+/* #define print_hex_dump(l...) */
+
+/* Define a fake hardware header format for the networking core.  Note that
+ * header size cannot exceed 16 bytes as that is the size of the header cache.
+ * Also, we do not need the source address in the header so we omit it and
+ * keep the header to under 16 bytes */
+#define IPV4_ALEN (8)
+/* This must equal sizeof(struct ipv4_ether_hdr) */
+#define IPV4_HLEN (10)
+
+/* FIXME: what's a good size for this? */
+#define INVALID_FIFO_ADDR (u64)~0ULL
+
+/* Things specified by standards */
+#define BROADCAST_CHANNEL 31
+
+#define S100_BUFFER_SIZE 512
+#define MAX_BUFFER_SIZE 4096
+
+#define IPV4_GASP_SPECIFIER_ID	0x00005EU
+#define IPV4_GASP_VERSION	0x00000001U
+
+#define IPV4_GASP_OVERHEAD (2 * sizeof(u32)) /* for GASP header */
+
+#define IPV4_UNFRAG_HDR_SIZE	sizeof(u32)
+#define IPV4_FRAG_HDR_SIZE	(2 * sizeof(u32))
+#define IPV4_FRAG_OVERHEAD	sizeof(u32)
+
+#define ALL_NODES (0xffc0 | 0x003f)
+
+#define IPV4_HDR_UNFRAG		0	/* unfragmented		*/
+#define IPV4_HDR_FIRSTFRAG	1	/* first fragment	*/
+#define IPV4_HDR_LASTFRAG	2	/* last fragment	*/
+#define IPV4_HDR_INTFRAG	3	/* interior fragment	*/
+
+/* Our arp packet (ARPHRD_IEEE1394) */
+/* FIXME: note that this is probably bogus on weird-endian machines */
+struct ipv4_arp {
+	u16 hw_type;		/* 0x0018	*/
+	u16 proto_type;		/* 0x0806       */
+	u8 hw_addr_len;		/* 16		*/
+	u8 ip_addr_len;         /* 4		*/
+	u16 opcode;	        /* ARP Opcode	*/
+	/* Above is exactly the same format as struct arphdr */
+
+	u64 s_uniq_id;		/* Sender's 64bit EUI			*/
+	u8 max_rec;             /* Sender's max packet size		*/
+	u8 sspd;		/* Sender's max speed			*/
+	u16 fifo_hi;            /* hi 16bits of sender's FIFO addr	*/
+	u32 fifo_lo;            /* lo 32bits of sender's FIFO addr	*/
+	u32 sip;		/* Sender's IP Address			*/
+	u32 tip;		/* IP Address of requested hw addr	*/
+} __attribute__((packed));
+
+struct ipv4_ether_hdr {
+	unsigned char	h_dest[IPV4_ALEN];	/* destination address */
+	unsigned short  h_proto;                /* packet type ID field */
+}  __attribute__((packed));
+
+static inline struct ipv4_ether_hdr *ipv4_ether_hdr(const struct sk_buff *skb)
+{
+	return (struct ipv4_ether_hdr *)skb_mac_header(skb);
+}
+
+enum ipv4_tx_type {
+	IPV4_UNKNOWN = 0,
+	IPV4_GASP = 1,
+	IPV4_WRREQ = 2,
+};
+
+enum ipv4_broadcast_state {
+	IPV4_BROADCAST_ERROR,
+	IPV4_BROADCAST_RUNNING,
+	IPV4_BROADCAST_STOPPED,
+};
+
+#define ipv4_get_hdr_lf(h)		(((h)->w0&0xC0000000)>>30)
+#define ipv4_get_hdr_ether_type(h)	(((h)->w0&0x0000FFFF)    )
+#define ipv4_get_hdr_dg_size(h)		(((h)->w0&0x0FFF0000)>>16)
+#define ipv4_get_hdr_fg_off(h)		(((h)->w0&0x00000FFF)    )
+#define ipv4_get_hdr_dgl(h)		(((h)->w1&0xFFFF0000)>>16)
+
+#define ipv4_set_hdr_lf(lf)		(( lf)<<30)
+#define ipv4_set_hdr_ether_type(et)	(( et)    )
+#define ipv4_set_hdr_dg_size(dgs)	((dgs)<<16)
+#define ipv4_set_hdr_fg_off(fgo)	((fgo)    )
+
+#define ipv4_set_hdr_dgl(dgl)		((dgl)<<16)
+
+struct ipv4_hdr {
+	u32 w0;
+	u32 w1;
+};
+
+static inline void ipv4_make_uf_hdr( struct ipv4_hdr *hdr, unsigned ether_type) {
+	hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_UNFRAG)
+		   |ipv4_set_hdr_ether_type(ether_type);
+	fw_debug ( "Setting unfragmented header %p to %x\n", hdr, hdr->w0 );
+}
+
+static inline void ipv4_make_ff_hdr ( struct ipv4_hdr *hdr, unsigned ether_type, unsigned dg_size, unsigned dgl ) {
+	hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_FIRSTFRAG)
+		   |ipv4_set_hdr_dg_size(dg_size)
+		   |ipv4_set_hdr_ether_type(ether_type);
+	hdr->w1 = ipv4_set_hdr_dgl(dgl);
+	fw_debug ( "Setting fragmented header %p to first_frag %x,%x (et %x, dgs %x, dgl %x)\n", hdr, hdr->w0, hdr->w1,
+ ether_type, dg_size, dgl );
+}
+
+static inline void ipv4_make_sf_hdr ( struct ipv4_hdr *hdr, unsigned lf, unsigned dg_size, unsigned fg_off, unsigned dgl) {
+	hdr->w0 = ipv4_set_hdr_lf(lf)
+		 |ipv4_set_hdr_dg_size(dg_size)
+		 |ipv4_set_hdr_fg_off(fg_off);
+	hdr->w1 = ipv4_set_hdr_dgl(dgl);
+	fw_debug ( "Setting fragmented header %p to %x,%x (lf %x, dgs %x, fo %x dgl %x)\n",
+ hdr, hdr->w0, hdr->w1,
+ lf, dg_size, fg_off, dgl );
+}
+
+/* End of IP1394 headers */
+
+/* Fragment types */
+#define ETH1394_HDR_LF_UF	0	/* unfragmented		*/
+#define ETH1394_HDR_LF_FF	1	/* first fragment	*/
+#define ETH1394_HDR_LF_LF	2	/* last fragment	*/
+#define ETH1394_HDR_LF_IF	3	/* interior fragment	*/
+
+#define IP1394_HW_ADDR_LEN	16	/* As per RFC		*/
+
+/* This list keeps track of what parts of the datagram have been filled in */
+struct ipv4_fragment_info {
+        struct list_head fragment_info;
+	u16 offset;
+	u16 len;
+};
+
+struct ipv4_partial_datagram {
+	struct list_head pdg_list;
+	struct list_head fragment_info;
+	struct sk_buff *skb;
+	/* FIXME Why not use skb->data? */
+	char *pbuf;
+	u16 datagram_label;
+	u16 ether_type;
+	u16 datagram_size;
+};
+
+/*
+ * We keep one of these for each IPv4 capable device attached to a fw_card.
+ * The list of them is stored in the fw_card structure rather than in the
+ * ipv4_priv because the remote IPv4 nodes may be probed before the card is,
+ * so we need a place to store them before the ipv4_priv structure is
+ * allocated.
+ */
+struct ipv4_node {
+	struct list_head ipv4_nodes;
+	/* guid of the remote node */
+	u64 guid;
+	/* FIFO address to transmit datagrams to, or INVALID_FIFO_ADDR */
+	u64 fifo;
+
+	spinlock_t pdg_lock;	/* partial datagram lock		*/
+	/* List of partial datagrams received from this node */
+	struct list_head pdg_list;
+	/* Number of entries in pdg_list at the moment */
+	unsigned pdg_size;
+
+	/* max payload to transmit to this remote node */
+	/* This already includes the IPV4_FRAG_HDR_SIZE overhead */
+	u16 max_payload;
+	/* outgoing datagram label */
+	u16 datagram_label;
+	/* Current node_id of the remote node */
+	u16 nodeid;
+	/* current generation of the remote node */
+	u8 generation;
+	/* max speed that this node can receive at */
+	u8 xmt_speed;
+};
+
+struct ipv4_priv {
+	spinlock_t lock;
+
+	enum ipv4_broadcast_state broadcast_state;
+	struct fw_iso_context *broadcast_rcv_context;
+	struct fw_iso_buffer broadcast_rcv_buffer;
+	void **broadcast_rcv_buffer_ptrs;
+	unsigned broadcast_rcv_next_ptr;
+	unsigned num_broadcast_rcv_ptrs;
+	unsigned rcv_buffer_size;
+	/*
+	 * This value is the maximum unfragmented datagram size that can be
+	 * sent by the hardware.  It already has the GASP overhead and the
+	 * unfragmented datagram header overhead calculated into it.
+	 */
+	unsigned broadcast_xmt_max_payload;
+	u16 broadcast_xmt_datagramlabel;
+
+	/*
+	 * The csr address that remote nodes must send datagrams to for us to
+	 * receive them.
+	 */
+	struct fw_address_handler handler;
+	u64 local_fifo;
+
+	/* Wake up to xmt	 */
+        /* struct work_struct wake;*/
+	/* List of packets to be sent */
+	struct list_head packet_list;
+	/*
+	 * List of packets that were broadcasted.  When we get an ISO interrupt
+	 * one of them has been sent
+	 */
+	struct list_head broadcasted_list;
+	/* List of packets that have been sent but not yet acked */
+	struct list_head sent_list;
+
+	struct fw_card *card;
+};
+
+/* This is our task struct. It's used for the packet complete callback.  */
+struct ipv4_packet_task {
+	/*
+	 * ptask can actually be on priv->packet_list, priv->broadcasted_list,
+	 * or priv->sent_list depending on its current state.
+	 */
+	struct list_head packet_list;
+	struct fw_transaction transaction;
+	struct ipv4_hdr hdr;
+	struct sk_buff *skb;
+	struct ipv4_priv *priv;
+	enum ipv4_tx_type tx_type;
+	int outstanding_pkts;
+	unsigned max_payload;
+	u64 fifo_addr;
+	u16 dest_node;
+	u8 generation;
+	u8 speed;
+};
+
+static struct kmem_cache *ipv4_packet_task_cache;
+
+static const char ipv4_driver_name[] = "firewire-ipv4";
+
+static const struct ieee1394_device_id ipv4_id_table[] = {
+	{
+		.match_flags  = IEEE1394_MATCH_SPECIFIER_ID |
+				IEEE1394_MATCH_VERSION,
+		.specifier_id = IPV4_GASP_SPECIFIER_ID,
+		.version      = IPV4_GASP_VERSION,
+	},
+	{ }
+};
+
+static u32 ipv4_unit_directory_data[] = {
+	0x00040000,					/* unit directory */
+	0x12000000 | IPV4_GASP_SPECIFIER_ID,	/* specifier ID */
+	0x81000003,					/* text descriptor */
+	0x13000000 | IPV4_GASP_VERSION,		/* version */
+	0x81000005,					/* text descriptor */
+
+	0x00030000,					/* Three quadlets */
+	0x00000000,					/* Text */
+	0x00000000,					/* Language 0 */
+	0x49414e41,					/* I A N A */
+	0x00030000,					/* Three quadlets */
+	0x00000000,					/* Text */
+	0x00000000,					/* Language 0 */
+	0x49507634,					/* I P v 4 */
+};
+
+static struct fw_descriptor ipv4_unit_directory = {
+	.length = ARRAY_SIZE(ipv4_unit_directory_data),
+	.key = 0xd1000000,
+	.data = ipv4_unit_directory_data
+};
+
+static int ipv4_send_packet(struct ipv4_packet_task *ptask );
+
+/* ------------------------------------------------------------------ */
+/******************************************
+ * HW Header net device functions
+ ******************************************/
+  /* These functions have been adapted from net/ethernet/eth.c */
+
+/* Create a fake MAC header for an arbitrary protocol layer.
+ * saddr=NULL means use device source address
+ * daddr=NULL means leave destination address (eg unresolved arp). */
+
+static int ipv4_header ( struct sk_buff *skb, struct net_device *dev,
+		       unsigned short type, const void *daddr,
+		       const void *saddr, unsigned len) {
+	struct ipv4_ether_hdr *eth;
+
+	eth = (struct ipv4_ether_hdr *)skb_push(skb, sizeof(*eth));
+	eth->h_proto = htons(type);
+
+	if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) {
+		memset(eth->h_dest, 0, dev->addr_len);
+		return dev->hard_header_len;
+	}
+
+	if (daddr) {
+		memcpy(eth->h_dest, daddr, dev->addr_len);
+		return dev->hard_header_len;
+	}
+
+	return -dev->hard_header_len;
+}
+
+/* Rebuild the faked MAC header. This is called after an ARP
+ * (or in future other address resolution) has completed on this
+ * sk_buff. We now let ARP fill in the other fields.
+ *
+ * This routine CANNOT use cached dst->neigh!
+ * Really, it is used only when dst->neigh is wrong.
+ */
+
+static int ipv4_rebuild_header(struct sk_buff *skb)
+{
+	struct ipv4_ether_hdr *eth;
+
+	eth = (struct ipv4_ether_hdr *)skb->data;
+	if (eth->h_proto == htons(ETH_P_IP))
+		return arp_find((unsigned char *)&eth->h_dest, skb);
+
+	fw_notify ( "%s: unable to resolve type %04x addresses\n",
+		   skb->dev->name,ntohs(eth->h_proto) );
+	return 0;
+}
+
+static int ipv4_header_cache(const struct neighbour *neigh, struct hh_cache *hh) {
+	unsigned short type = hh->hh_type;
+	struct net_device *dev;
+	struct ipv4_ether_hdr *eth;
+
+	if (type == htons(ETH_P_802_3))
+		return -1;
+	dev = neigh->dev;
+	eth = (struct ipv4_ether_hdr *)((u8 *)hh->hh_data + 16 - sizeof(*eth));
+	eth->h_proto = type;
+	memcpy(eth->h_dest, neigh->ha, dev->addr_len);
+
+	hh->hh_len = IPV4_HLEN;
+	return 0;
+}
+
+/* Called by Address Resolution module to notify changes in address. */
+static void ipv4_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char * haddr ) {
+	memcpy((u8 *)hh->hh_data + 16 - IPV4_HLEN, haddr, dev->addr_len);
+}
+
+static int ipv4_header_parse(const struct sk_buff *skb, unsigned char *haddr) {
+	memcpy(haddr, skb->dev->dev_addr, IPV4_ALEN);
+	return IPV4_ALEN;
+}
+
+static const struct header_ops ipv4_header_ops = {
+	.create         = ipv4_header,
+	.rebuild        = ipv4_rebuild_header,
+	.cache		= ipv4_header_cache,
+	.cache_update	= ipv4_header_cache_update,
+	.parse          = ipv4_header_parse,
+};
+
+/* ------------------------------------------------------------------ */
+
+/* FIXME: is this correct for all cases? */
+static bool ipv4_frag_overlap(struct ipv4_partial_datagram *pd, unsigned offset, unsigned len)
+{
+        struct ipv4_fragment_info *fi;
+	unsigned end = offset + len;
+
+	list_for_each_entry(fi, &pd->fragment_info, fragment_info) {
+		if (offset < fi->offset + fi->len && end > fi->offset) {
+			fw_debug ( "frag_overlap pd %p fi %p (%x@%x) with %x@%x\n", pd, fi, fi->len, fi->offset, len, offset );
+			return true;
+		}
+	}
+	fw_debug ( "frag_overlap %p does not overlap with %x@%x\n", pd, len, offset );
+	return false;
+}
+
+/*
+ * Record that bytes [offset, offset + len) of pd have arrived.  Assumes the
+ * new fragment does not overlap any existing one.  The list is kept sorted
+ * by offset; returns the covering entry, or NULL if allocation fails.
+ */
+static struct ipv4_fragment_info *ipv4_frag_new ( struct ipv4_partial_datagram *pd, unsigned offset, unsigned len ) {
+	struct ipv4_fragment_info *fi, *fi2, *new;
+	struct list_head *list;
+
+	fw_debug ( "frag_new pd %p %x@%x\n", pd, len, offset );
+	list = &pd->fragment_info;
+	list_for_each_entry(fi, &pd->fragment_info, fragment_info) {
+		if (fi->offset + fi->len == offset) {
+			/* The new fragment can be tacked on to the end */
+			/* Did it plug the hole up to a real next entry (not the list head)? */
+			fi2 = list_entry(fi->fragment_info.next, struct ipv4_fragment_info, fragment_info);
+			if (fi->fragment_info.next != &pd->fragment_info && offset + len == fi2->offset) {
+				fw_debug ( "pd %p: hole filling %p (%x@%x) and %p(%x@%x): now %x@%x\n", pd, fi, fi->len, fi->offset, fi2, fi2->len, fi2->offset, fi->len + len + fi2->len, fi->offset );
+				/* glue fragments together */
+				fi->len += len + fi2->len;
+				list_del(&fi2->fragment_info);
+				kfree(fi2);
+			} else {
+				fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, fi->len+len, fi->offset );
+				fi->len += len;
+			}
+			return fi;
+		}
+		if (offset + len == fi->offset) {
+			/* The new fragment can be tacked on to the beginning */
+			/* Did it plug the hole after a real previous entry (not the list head)? */
+			fi2 = list_entry(fi->fragment_info.prev, struct ipv4_fragment_info, fragment_info);
+			if (fi->fragment_info.prev != &pd->fragment_info && fi2->offset + fi2->len == offset) {
+				/* glue fragments together */
+				fw_debug ( "pd %p: extending %p and merging with %p from %x@%x to %x@%x\n", pd, fi2, fi, fi2->len, fi2->offset, fi2->len + fi->len + len, fi2->offset );
+				fi2->len += fi->len + len;
+				list_del(&fi->fragment_info);
+				kfree(fi);
+				return fi2;
+			}
+			fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, fi->len + len, offset );
+			fi->offset = offset;
+			fi->len += len;
+			return fi;
+		}
+		if (offset + len < fi->offset) {
+			list = fi->fragment_info.prev;
+			break;
+		}
+		/* New fragment lies beyond fi: insert after it unless a later entry intervenes */
+		list = &fi->fragment_info;
+	}
+
+	new = kmalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new) {
+		fw_error ( "out of memory in fragment handling!\n" );
+		return NULL;
+	}
+
+	new->offset = offset;
+	new->len = len;
+	list_add(&new->fragment_info, list);
+	fw_debug ( "pd %p: new frag %p %x@%x\n", pd, new, new->len, new->offset );
+	list_for_each_entry( fi, &pd->fragment_info, fragment_info )
+		fw_debug ( "fi %p %x@%x\n", fi, fi->len, fi->offset );
+	return new;
+}
+
+/* ------------------------------------------------------------------ */
+
+static struct ipv4_partial_datagram *ipv4_pd_new(struct net_device *netdev,
+ struct ipv4_node *node, u16 datagram_label, unsigned dg_size, u32 *frag_buf,
+ unsigned frag_off, unsigned frag_len) {
+	struct ipv4_partial_datagram *new;
+	struct ipv4_fragment_info *fi;
+
+	new = kmalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new)
+		goto fail;
+	INIT_LIST_HEAD(&new->fragment_info);
+	fi = ipv4_frag_new ( new, frag_off, frag_len);
+	if ( fi == NULL )
+		goto fail_w_new;
+	new->datagram_label = datagram_label;
+	new->datagram_size = dg_size;
+	new->skb = dev_alloc_skb(dg_size + netdev->hard_header_len + 15);
+	if ( new->skb == NULL )
+		goto fail_w_fi;
+	skb_reserve(new->skb, (netdev->hard_header_len + 15) & ~15);
+	new->pbuf = skb_put(new->skb, dg_size);
+	memcpy(new->pbuf + frag_off, frag_buf, frag_len);
+	list_add_tail(&new->pdg_list, &node->pdg_list);
+	fw_debug ( "pd_new: new pd %p { dgl %u, dg_size %u, skb %p, pbuf %p } on node %p\n",
+ new, new->datagram_label, new->datagram_size, new->skb, new->pbuf, node );
+	return new;
+
+fail_w_fi:
+	kfree(fi);
+fail_w_new:
+	kfree(new);
+fail:
+	fw_error("ipv4_pd_new: no memory\n");
+	return NULL;
+}
+
+static struct ipv4_partial_datagram *ipv4_pd_find(struct ipv4_node *node, u16 datagram_label) {
+	struct ipv4_partial_datagram *pd;
+
+	list_for_each_entry(pd, &node->pdg_list, pdg_list) {
+	        if ( pd->datagram_label == datagram_label ) {
+			fw_debug ( "pd_find(node %p, label %u): pd %p\n", node, datagram_label, pd );
+			return pd;
+		}
+	}
+	fw_debug ( "pd_find(node %p, label %u) no entry\n", node, datagram_label );
+	return NULL;
+}
+
+
+static void ipv4_pd_delete ( struct ipv4_partial_datagram *old ) {
+	struct ipv4_fragment_info *fi, *n;
+
+	fw_debug ( "pd_delete %p\n", old );
+	list_for_each_entry_safe(fi, n, &old->fragment_info, fragment_info) {
+		fw_debug ( "Freeing fi %p\n", fi );
+		kfree(fi);
+	}
+	list_del(&old->pdg_list);
+	dev_kfree_skb_any(old->skb);
+	kfree(old);
+}
+
+static bool ipv4_pd_update ( struct ipv4_node *node, struct ipv4_partial_datagram *pd,
+ u32 *frag_buf, unsigned frag_off, unsigned frag_len) {
+	fw_debug ( "pd_update node %p, pd %p, frag_buf %p, %x@%x\n", node, pd, frag_buf, frag_len, frag_off );
+	if ( ipv4_frag_new ( pd, frag_off, frag_len ) == NULL)
+		return false;
+	memcpy(pd->pbuf + frag_off, frag_buf, frag_len);
+
+	/*
+	 * Move list entry to the end of the list so that the oldest partial
+	 * datagrams percolate to the beginning of the list
+	 */
+	list_move_tail(&pd->pdg_list, &node->pdg_list);
+	fw_debug ( "New pd list:\n" );
+	list_for_each_entry ( pd, &node->pdg_list, pdg_list ) {
+		fw_debug ( "pd %p\n", pd );
+	}
+	return true;
+}
+
+static bool ipv4_pd_is_complete ( struct ipv4_partial_datagram *pd ) {
+	struct ipv4_fragment_info *fi;
+	bool ret;
+
+	fi = list_entry(pd->fragment_info.next, struct ipv4_fragment_info, fragment_info);
+
+	ret = (fi->len == pd->datagram_size);
+	fw_debug ( "pd_is_complete (pd %p, dgs %x): fi %p (%x@%x) %s\n", pd, pd->datagram_size, fi, fi->len, fi->offset, ret ? "yes" : "no" );
+	return ret;
+}
+
+/* ------------------------------------------------------------------ */
+
+static int ipv4_node_new ( struct fw_card *card, struct fw_device *device ) {
+	struct ipv4_node *node;
+
+	node = kmalloc ( sizeof(*node), GFP_KERNEL );
+	if ( ! node ) {
+		fw_error ( "allocate new node failed\n" );
+		return -ENOMEM;
+	}
+	node->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+	node->fifo = INVALID_FIFO_ADDR;
+	INIT_LIST_HEAD(&node->pdg_list);
+	spin_lock_init(&node->pdg_lock);
+	node->pdg_size = 0;
+	node->generation = device->generation;
+	rmb();
+	node->nodeid = device->node_id;
+	 /* FIXME what should it really be? */
+	node->max_payload = S100_BUFFER_SIZE - IPV4_UNFRAG_HDR_SIZE;
+	node->datagram_label = 0U;
+	node->xmt_speed = device->max_speed;
+	list_add_tail ( &node->ipv4_nodes, &card->ipv4_nodes );
+	fw_debug ( "node_new: %p { guid %016llx, generation %u, nodeid %x, max_payload %x, xmt_speed %x } added\n",
+ node, (unsigned long long)node->guid, node->generation, node->nodeid, node->max_payload, node->xmt_speed );
+	return 0;
+}
+
+static struct ipv4_node *ipv4_node_find_by_guid(struct ipv4_priv *priv, u64 guid) {
+	struct ipv4_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes)
+		if (node->guid == guid) {
+			/* FIXME: lock the node first? */
+			spin_unlock_irqrestore ( &priv->lock, flags );
+			fw_debug ( "node_find_by_guid (%016llx) found %p\n", (unsigned long long)guid, node );
+			return node;
+		}
+
+	spin_unlock_irqrestore ( &priv->lock, flags );
+	fw_debug ( "node_find_by_guid (%016llx) not found\n", (unsigned long long)guid );
+	return NULL;
+}
+
+static struct ipv4_node *ipv4_node_find_by_nodeid(struct ipv4_priv *priv, u16 nodeid) {
+	struct ipv4_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes)
+		if (node->nodeid == nodeid) {
+			/* FIXME: lock the node first? */
+			spin_unlock_irqrestore ( &priv->lock, flags );
+			fw_debug ( "node_find_by_nodeid (%x) found %p\n", nodeid, node );
+			return node;
+		}
+	fw_debug ( "node_find_by_nodeid (%x) not found\n", nodeid );
+	spin_unlock_irqrestore ( &priv->lock, flags );
+	return NULL;
+}
+
+/* Unlink the node whose guid matches device and delete its partial datagrams; only complicated because we can't assume priv exists */
+static void ipv4_node_delete ( struct fw_card *card, struct fw_device *device ) {
+	struct net_device *netdev;
+	struct ipv4_priv *priv;
+	struct ipv4_node *node;
+	u64 guid;
+	unsigned long flags;
+	struct ipv4_partial_datagram *pd, *pd_next;
+
+	guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+	netdev = card->netdev;
+	if ( netdev )
+		priv = netdev_priv ( netdev );
+	else
+		priv = NULL;
+	if ( priv )
+		spin_lock_irqsave ( &priv->lock, flags );
+	list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) {
+		if ( node->guid == guid ) {
+			list_del ( &node->ipv4_nodes );
+			list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list )
+				ipv4_pd_delete ( pd );
+			break;	/* NOTE(review): node itself is not freed in this function -- confirm where it is released */
+		}
+	}
+	if ( priv )
+		spin_unlock_irqrestore ( &priv->lock, flags );
+}
+
+/* ------------------------------------------------------------------ */
+
+
+/*
+ * Finish reception of a complete datagram: learn the sender's parameters from
+ * ARP and convert it, add the link header, then netif_rx().  Consumes skb on
+ * every path and always returns 0.
+ */
+static int ipv4_finish_incoming_packet ( struct net_device *netdev,
+ struct sk_buff *skb, u16 source_node_id, bool is_broadcast, u16 ether_type ) {
+	struct ipv4_priv *priv;
+	static u64 broadcast_hw = ~0ULL;
+	int status;
+	u64 guid;
+	unsigned int len;
+
+	fw_debug ( "ipv4_finish_incoming_packet(%p, %p, %x, %s, %x\n", netdev, skb, source_node_id, is_broadcast ? "true" : "false", ether_type );
+	priv = netdev_priv(netdev);
+	/* Write metadata, and then pass to the receive level */
+	skb->dev = netdev;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;  /* don't check it */
+
+	/*
+	 * Parse the encapsulation header. This actually does the job of
+	 * converting to an ethernet frame header, as well as arp
+	 * conversion if needed. ARP conversion is easier in this
+	 * direction, since we are using ethernet as our backend.
+	 *
+	 * If this is an ARP packet, convert it. First, we want to make
+	 * use of some of the fields, since they tell us a little bit
+	 * about the sending machine.
+	 */
+	if (ether_type == ETH_P_ARP) {
+		struct ipv4_arp *arp1394;
+		struct arphdr *arp;
+		unsigned char *arp_ptr;
+		u64 fifo_addr;
+		u8 max_rec;
+		u8 sspd;
+		u16 max_payload;
+		struct ipv4_node *node;
+		static const u16 ipv4_speed_to_max_payload[] = {
+			/* S100, S200, S400, S800, S1600, S3200 */
+			    512, 1024, 2048, 4096,  4096,  4096
+		};
+
+		arp1394 = (struct ipv4_arp *)skb->data;
+		arp = (struct arphdr *)skb->data;
+		arp_ptr = (unsigned char *)(arp + 1);
+		fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32 | ntohl(arp1394->fifo_lo);
+		max_rec = priv->card->max_receive;
+		if ( arp1394->max_rec < max_rec )
+			max_rec = arp1394->max_rec;
+		sspd = arp1394->sspd;
+		/*
+		 * Sanity check. MacOSX seems to be sending us 131 in this
+		 * field (atleast on my Panther G5). Not sure why.
+		 */
+		if (sspd > 5 ) {
+			fw_notify ( "sspd %x out of range\n", sspd );
+			sspd = 0;
+		}
+
+		max_payload = min(ipv4_speed_to_max_payload[sspd], (u16)(1 << (max_rec + 1))) - IPV4_UNFRAG_HDR_SIZE;
+
+		guid = be64_to_cpu(get_unaligned(&arp1394->s_uniq_id));
+		node = ipv4_node_find_by_guid(priv, guid);
+		if (!node) {
+			fw_notify ( "No node for ARP packet from %llx\n", (unsigned long long)guid );
+			goto failed_proto;
+		}
+		if ( node->nodeid != source_node_id || node->generation != priv->card->generation ) {
+			fw_notify ( "Internal error: node->nodeid (%x) != soucre_node_id (%x) or node->generation (%x) != priv->card->generation(%x)\n",
+ node->nodeid, source_node_id, node->generation, priv->card->generation );
+			node->nodeid = source_node_id;
+			node->generation = priv->card->generation;
+		}
+
+		/* FIXME: for debugging */
+		if ( sspd > SCODE_400 )
+			sspd = SCODE_400;
+		/* Update our speed/payload/fifo_offset table */
+		/*
+		 * FIXME: this does not handle cases where two high-speed endpoints must use a slower speed because of
+		 * a lower speed hub between them.  We need to look at the actual topology map here.
+		 */
+		fw_debug ( "Setting node %p fifo %llx (was %llx), max_payload %x (was %x), speed %x (was %x)\n", node, fifo_addr, node->fifo, max_payload, node->max_payload, sspd, node->xmt_speed );
+		node->fifo = fifo_addr;
+		node->max_payload = max_payload;
+		/*
+		 * Only allow speeds to go down from their initial value.
+		 * Otherwise a local node that can only do S400 or slower may
+		 * be told to transmit at S800 to a faster remote node.
+		 */
+		if ( node->xmt_speed > sspd )
+			node->xmt_speed = sspd;
+
+		/*
+		 * Now that we're done with the 1394 specific stuff, we'll
+		 * need to alter some of the data.  Believe it or not, all
+		 * that needs to be done is sender_IP_address needs to be
+		 * moved, the destination hardware address get stuffed
+		 * in and the hardware address length set to 8.
+		 *
+		 * IMPORTANT: The code below overwrites 1394 specific data
+		 * needed above so keep the munging of the data for the
+		 * higher level IP stack last.
+		 */
+
+		arp->ar_hln = 8;
+		arp_ptr += arp->ar_hln;		/* skip over sender unique id */
+		*(u32 *)arp_ptr = arp1394->sip; /* move sender IP addr */
+		arp_ptr += arp->ar_pln;		/* skip over sender IP addr */
+
+		if (arp->ar_op == htons(ARPOP_REQUEST))
+			memset(arp_ptr, 0, sizeof(u64));
+		else
+			memcpy(arp_ptr, netdev->dev_addr, sizeof(u64));
+	}
+
+	/* Now add the ethernet header. */
+	guid = cpu_to_be64(priv->card->guid);
+	if (dev_hard_header(skb, netdev, ether_type, is_broadcast ? &broadcast_hw : &guid, NULL, skb->len) >= 0) {
+		struct ipv4_ether_hdr *eth;
+		u16 *rawp;
+		__be16 protocol;
+
+		skb_reset_mac_header(skb);
+		skb_pull(skb, sizeof(*eth));
+		eth = ipv4_ether_hdr(skb);
+		if (*eth->h_dest & 1) {
+			if (memcmp(eth->h_dest, netdev->broadcast, netdev->addr_len) == 0) {
+				fw_debug ( "Broadcast\n" );
+				skb->pkt_type = PACKET_BROADCAST;
+			}
+#if 0
+			else
+				skb->pkt_type = PACKET_MULTICAST;
+#endif
+		} else {
+			if (memcmp(eth->h_dest, netdev->dev_addr, netdev->addr_len)) {
+				u64 a1, a2;
+
+				memcpy ( &a1, eth->h_dest, sizeof(u64));
+				memcpy ( &a2, netdev->dev_addr, sizeof(u64));
+				fw_debug ( "Otherhost %llx %llx %x\n", a1, a2, netdev->addr_len );
+				skb->pkt_type = PACKET_OTHERHOST;
+			}
+		}
+		if (ntohs(eth->h_proto) >= 1536) {
+			fw_debug ( " proto %x %x\n", eth->h_proto, ntohs(eth->h_proto) );
+			protocol = eth->h_proto;
+		} else {
+			rawp = (u16 *)skb->data;
+			if (*rawp == 0xFFFF) {
+				fw_debug ( "proto 802_3\n" );
+				protocol = htons(ETH_P_802_3);
+			} else {
+				fw_debug ( "proto 802_2\n" );
+				protocol = htons(ETH_P_802_2);
+			}
+		}
+		skb->protocol = protocol;
+	}
+	len = skb->len;	/* netif_rx() takes ownership of skb; do not touch it afterwards */
+	status = netif_rx(skb);
+	if ( status == NET_RX_DROP) {
+		netdev->stats.rx_errors++;
+		netdev->stats.rx_dropped++;
+	} else {
+		netdev->stats.rx_packets++;
+		netdev->stats.rx_bytes += len;
+	}
+	if (netif_queue_stopped(netdev))
+		netif_wake_queue(netdev);
+	return 0;
+
+ failed_proto:
+	netdev->stats.rx_errors++;
+	netdev->stats.rx_dropped++;
+	dev_kfree_skb_any(skb);
+	if (netif_queue_stopped(netdev))
+		netif_wake_queue(netdev);
+	netdev->last_rx = jiffies;
+	return 0;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Process one received packet: an unfragmented datagram is copied into a new
+ * skb and delivered; a fragment is added to its sender's partial datagram,
+ * which is delivered once complete.  Returns -1 if that skb cannot be allocated, else 0.
+ */
+static int ipv4_incoming_packet ( struct ipv4_priv *priv, u32 *buf, int len, u16 source_node_id, bool is_broadcast ) {
+	struct sk_buff *skb;
+	struct net_device *netdev;
+	struct ipv4_hdr hdr;
+	unsigned lf;
+	unsigned long flags;
+	struct ipv4_node *node;
+	struct ipv4_partial_datagram *pd;
+	int fg_off;
+	int dg_size;
+	u16 datagram_label;
+	u16 ether_type;
+
+	fw_debug ( "ipv4_incoming_packet(%p, %p, %d, %x, %s)\n", priv, buf, len, source_node_id, is_broadcast ? "true" : "false" );
+	netdev = priv->card->netdev;
+
+	hdr.w0 = ntohl(buf[0]);
+	lf = ipv4_get_hdr_lf(&hdr);
+	if ( lf == IPV4_HDR_UNFRAG ) {
+		/*
+		 * An unfragmented datagram has been received by the ieee1394
+		 * bus. Build an skbuff around it so we can pass it to the
+		 * high level network layer.
+		 */
+		ether_type = ipv4_get_hdr_ether_type(&hdr);
+		fw_debug ( "header w0 = %x, lf = %x, ether_type = %x\n", hdr.w0, lf, ether_type );
+		buf++;
+		len -= IPV4_UNFRAG_HDR_SIZE;
+
+		skb = dev_alloc_skb(len + netdev->hard_header_len + 15);
+		if (unlikely(!skb)) {
+			fw_error ( "Out of memory for incoming packet\n");
+			netdev->stats.rx_dropped++;
+			return -1;
+		}
+		skb_reserve(skb, (netdev->hard_header_len + 15) & ~15);
+		memcpy(skb_put(skb, len), buf, len );
+		return ipv4_finish_incoming_packet(netdev, skb, source_node_id, is_broadcast, ether_type );
+	}
+	/* A datagram fragment has been received, now the fun begins. */
+	hdr.w1 = ntohl(buf[1]);
+	buf +=2;
+	len -= IPV4_FRAG_HDR_SIZE;
+	if ( lf ==IPV4_HDR_FIRSTFRAG ) {
+		ether_type = ipv4_get_hdr_ether_type(&hdr);
+		fg_off = 0;
+	} else {
+		fg_off = ipv4_get_hdr_fg_off(&hdr);
+		ether_type = 0; /* Shut up compiler! */
+	}
+	datagram_label = ipv4_get_hdr_dgl(&hdr);
+	dg_size = ipv4_get_hdr_dg_size(&hdr); /* ??? + 1 */
+	fw_debug ( "fragmented: %x.%x = lf %x, ether_type %x, fg_off %x, dgl %x, dg_size %x\n", hdr.w0, hdr.w1, lf, ether_type, fg_off, datagram_label, dg_size );
+	node = ipv4_node_find_by_nodeid ( priv, source_node_id);
+	if (unlikely(!node)) {
+		/* No entry for the sender, so there is nowhere to reassemble: drop. */
+		netdev->stats.rx_dropped++;
+		return 0;
+	}
+	spin_lock_irqsave(&node->pdg_lock, flags);
+	pd = ipv4_pd_find( node, datagram_label );
+	if (pd == NULL) {
+		while ( node->pdg_size >= ipv4_mpd ) {
+			/* remove the oldest */
+			ipv4_pd_delete ( list_first_entry(&node->pdg_list, struct ipv4_partial_datagram, pdg_list) );
+			node->pdg_size--;
+		}
+		pd = ipv4_pd_new ( netdev, node, datagram_label, dg_size, buf, fg_off, len);
+		if ( pd == NULL)
+			goto bad_proto;
+		node->pdg_size++;
+	} else {
+		if (ipv4_frag_overlap(pd, fg_off, len) || pd->datagram_size != dg_size) {
+			/*
+			 * Differing datagram sizes or overlapping fragments,
+			 * Either way the remote machine is playing silly buggers
+			 * with us: obliterate the old datagram and start a new one.
+			 */
+			ipv4_pd_delete ( pd );
+			pd = ipv4_pd_new ( netdev, node, datagram_label, dg_size, buf, fg_off, len);
+			if ( pd == NULL ) {
+				node->pdg_size--;
+				goto bad_proto;
+			}
+		} else {
+			if ( ! ipv4_pd_update ( node, pd, buf, fg_off, len ) ) {
+				/*
+				 * Couldn't save off fragment anyway
+				 * so might as well obliterate the
+				 * datagram now.
+				 */
+				ipv4_pd_delete ( pd );
+				node->pdg_size--;
+				goto bad_proto;
+			}
+		}
+	} /* new datagram or add to existing one */
+
+	if ( lf == IPV4_HDR_FIRSTFRAG )
+		pd->ether_type = ether_type;
+	if ( ipv4_pd_is_complete ( pd ) ) {
+		ether_type = pd->ether_type;
+		node->pdg_size--;
+		skb = skb_get(pd->skb);
+		ipv4_pd_delete ( pd );
+		spin_unlock_irqrestore(&node->pdg_lock, flags);
+		return ipv4_finish_incoming_packet ( netdev, skb, source_node_id, false, ether_type );
+	}
+	/*
+	 * Datagram is not complete, we're done for the
+	 * moment.
+	 */
+	spin_unlock_irqrestore(&node->pdg_lock, flags);
+	return 0;
+
+ bad_proto:
+	spin_unlock_irqrestore(&node->pdg_lock, flags);
+	if (netif_queue_stopped(netdev))
+		netif_wake_queue(netdev);
+	return 0;
+}
+
+/*
+ * Address-handler callback for the unicast FIFO: only a block write aimed at
+ * this node, in the card's current generation and at the handler's offset is
+ * passed to ipv4_incoming_packet(); anything else gets RCODE_CONFLICT_ERROR.
+ */
+static void ipv4_receive_packet ( struct fw_card *card, struct fw_request *r,
+ int tcode, int destination, int source, int generation, int speed,
+ unsigned long long offset, void *payload, size_t length, void *callback_data ) {
+	struct ipv4_priv *priv = callback_data;
+
+	fw_debug ( "ipv4_receive_packet(%p,%p,%x,%x,%x,%x,%x,%llx,%p,%lx,%p)\n",
+ card, r, tcode, destination, source, generation, speed, offset, payload,
+ (unsigned long)length, callback_data);
+	print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, payload, length, false);
+	if (!(tcode == TCODE_WRITE_BLOCK_REQUEST && destination == card->node_id
+	      && generation == card->generation && offset == priv->handler.offset)) {
+		fw_send_response(card, r, RCODE_CONFLICT_ERROR);
+		fw_debug("Conflict error card node_id=%x, card generation=%x, local offset %llx\n",
+ card->node_id, card->generation, (unsigned long long)priv->handler.offset );
+		return;
+	}
+	if (ipv4_incoming_packet(priv, payload, length, source, false) != 0) {
+		fw_error ( "Incoming packet failure\n" );
+		fw_send_response(card, r, RCODE_CONFLICT_ERROR);
+		return;
+	}
+	fw_send_response(card, r, RCODE_COMPLETE);
+}
+
+/* Iso receive callback for the broadcast channel: pass GASP-encapsulated IPv4
+ * packets to ipv4_incoming_packet(), then requeue the receive buffer slot. */
+static void ipv4_receive_broadcast(struct fw_iso_context *context, u32 cycle,
+ size_t header_length, void *header, void *data) {
+	struct ipv4_priv *priv;
+	struct fw_iso_packet packet;
+	u16 *hdr_ptr;
+	u32 *buf_ptr;
+	int retval;
+	u32 length;
+	u16 source_node_id;
+	u32 specifier_id;
+	u32 ver;
+	unsigned long offset;
+	unsigned long flags;
+
+	fw_debug ( "ipv4_receive_broadcast ( context=%p, cycle=%x, header_length=%lx, header=%p, data=%p )\n", context, cycle, (unsigned long)header_length, header, data );
+	print_hex_dump ( KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, header, header_length, false );
+	priv = data;
+	hdr_ptr = header;
+	length = ntohs(hdr_ptr[0]);
+	spin_lock_irqsave(&priv->lock,flags);
+	offset = priv->rcv_buffer_size * priv->broadcast_rcv_next_ptr;
+	buf_ptr = priv->broadcast_rcv_buffer_ptrs[priv->broadcast_rcv_next_ptr++];
+	if ( priv->broadcast_rcv_next_ptr == priv->num_broadcast_rcv_ptrs )
+		priv->broadcast_rcv_next_ptr = 0;
+	spin_unlock_irqrestore(&priv->lock,flags);
+	fw_debug ( "length %u at %p\n", length, buf_ptr );
+	print_hex_dump ( KERN_DEBUG, "buffer: ", DUMP_PREFIX_OFFSET, 32, 1, buf_ptr, length, false );
+
+	specifier_id =    (be32_to_cpu(buf_ptr[0]) & 0xffff) << 8
+			| (be32_to_cpu(buf_ptr[1]) & 0xff000000) >> 24;
+	ver = be32_to_cpu(buf_ptr[1]) & 0xFFFFFF;
+	source_node_id = be32_to_cpu(buf_ptr[0]) >> 16;
+	if ( specifier_id != IPV4_GASP_SPECIFIER_ID || ver != IPV4_GASP_VERSION ) {
+		fw_debug ( "Ignoring packet: not GASP\n" );
+	} else if ( length <= IPV4_GASP_OVERHEAD ) {	/* the subtraction below would wrap */
+		fw_debug ( "Ignoring packet: too short\n" );
+	} else {
+		ipv4_incoming_packet(priv, buf_ptr + 2, length - IPV4_GASP_OVERHEAD, source_node_id, true);
+	}
+	packet.payload_length = priv->rcv_buffer_size;
+	packet.interrupt = 1;
+	packet.skip = 0;
+	packet.tag = 3;
+	packet.sy = 0;
+	packet.header_length = IPV4_GASP_OVERHEAD;
+	spin_lock_irqsave(&priv->lock,flags);
+	retval = fw_iso_context_queue ( priv->broadcast_rcv_context, &packet,
+ &priv->broadcast_rcv_buffer, offset );
+	spin_unlock_irqrestore(&priv->lock,flags);
+	if ( retval < 0 )
+		fw_error ( "requeue failed\n" );
+}
+
+/* Debug aid: log the fields of a packet task, then hex-dump the data of its skb. */
+static void debug_ptask ( struct ipv4_packet_task *ptask ) {
+	static const char *tx_types[] = { "Unknown", "GASP", "Write" };
+
+	fw_debug("packet %p { hdr { w0 %x w1 %x }, skb %p, priv %p,"
+		 " tx_type %s, outstanding_pkts %d, max_payload %x, fifo %llx,"
+		 " speed %x, dest_node %x, generation %x }\n",
+		 ptask, ptask->hdr.w0, ptask->hdr.w1, ptask->skb, ptask->priv,
+		 ptask->tx_type <= IPV4_WRREQ ? tx_types[ptask->tx_type] : "Invalid",
+		 ptask->outstanding_pkts, ptask->max_payload, ptask->fifo_addr,
+		 ptask->speed, ptask->dest_node, ptask->generation);
+	print_hex_dump(KERN_DEBUG, "packet :", DUMP_PREFIX_OFFSET, 32, 1, ptask->skb->data, ptask->skb->len, false);
+}
+
+static void ipv4_transmit_packet_done ( struct ipv4_packet_task *ptask ) {
+	struct ipv4_priv *priv;
+	unsigned long flags;
+	/* One packet of ptask was sent: unlink ptask, then send its next fragment or free it. */
+	priv = ptask->priv;
+	spin_lock_irqsave ( &priv->lock, flags );
+	list_del ( &ptask->packet_list );
+	spin_unlock_irqrestore ( &priv->lock, flags );
+	ptask->outstanding_pkts--;
+	if ( ptask->outstanding_pkts > 0 ) {
+		u16 dg_size;
+		u16 fg_off;
+		u16 datagram_label;
+		u16 lf;
+		struct sk_buff *skb;
+
+		/* Update the ptask to point to the next fragment and send it */
+		lf = ipv4_get_hdr_lf(&ptask->hdr);
+		switch (lf) {
+		case IPV4_HDR_LASTFRAG:
+		case IPV4_HDR_UNFRAG:
+		default: /* none of these may still have packets outstanding */
+			fw_error ( "Outstanding packet %x lf %x, header %x,%x\n", ptask->outstanding_pkts, lf, ptask->hdr.w0, ptask->hdr.w1 );
+			BUG();
+
+		case IPV4_HDR_FIRSTFRAG:
+			/* Set frag type here for future interior fragments */
+			dg_size = ipv4_get_hdr_dg_size(&ptask->hdr);
+			fg_off = ptask->max_payload - IPV4_FRAG_HDR_SIZE;
+			datagram_label = ipv4_get_hdr_dgl(&ptask->hdr);
+			break;
+
+		case IPV4_HDR_INTFRAG:
+			dg_size = ipv4_get_hdr_dg_size(&ptask->hdr);
+			fg_off = ipv4_get_hdr_fg_off(&ptask->hdr) + ptask->max_payload - IPV4_FRAG_HDR_SIZE;
+			datagram_label = ipv4_get_hdr_dgl(&ptask->hdr);
+			break;
+		}
+		skb = ptask->skb;
+		skb_pull ( skb, ptask->max_payload ); /* drop the header and data of the fragment just sent */
+		if ( ptask->outstanding_pkts > 1 ) {
+			ipv4_make_sf_hdr ( &ptask->hdr,
+  IPV4_HDR_INTFRAG, dg_size, fg_off, datagram_label );
+		} else {
+			ipv4_make_sf_hdr ( &ptask->hdr,
+  IPV4_HDR_LASTFRAG, dg_size, fg_off, datagram_label );
+			ptask->max_payload = skb->len + IPV4_FRAG_HDR_SIZE; /* last fragment: whatever data is left */
+
+		}
+		ipv4_send_packet ( ptask );
+	} else {
+		dev_kfree_skb_any ( ptask->skb );
+		kmem_cache_free( ipv4_packet_task_cache, ptask );
+	}
+}
+
+/* Completion callback of the requests issued by ipv4_send_packet(); data is the ptask. */
+static void ipv4_write_complete ( struct fw_card *card, int rcode,
+ void *payload, size_t length, void *data ) {
+	struct ipv4_packet_task *ptask = data;
+
+	fw_debug ( "ipv4_write_complete ( %p, %x, %p, %lx, %p )\n",
+ card, rcode, payload, (unsigned long)length, data );
+	debug_ptask ( ptask );
+	if ( rcode != RCODE_COMPLETE ) {
+		fw_error ( "ipv4_write_complete: failed: %x\n", rcode );
+		/* Abandon the remaining fragments: with one packet outstanding the
+		 * call below unlinks and frees ptask and its skb instead of leaking them. */
+		ptask->outstanding_pkts = 1;
+	}
+	ipv4_transmit_packet_done ( ptask );
+}
+
+/*
+ * Prepend the encapsulation header described by ptask->hdr to the skb and send
+ * it: as a GASP stream packet for IPV4_GASP, else as a block write to the
+ * peer's FIFO.  ipv4_write_complete() is the completion callback.  Returns 0.
+ */
+static int ipv4_send_packet ( struct ipv4_packet_task *ptask ) {
+	struct ipv4_priv *priv;
+	unsigned tx_len;
+	struct ipv4_hdr *bufhdr;
+	unsigned long flags;
+	struct net_device *netdev;
+
+	fw_debug ( "ipv4_send_packet\n" );
+	debug_ptask ( ptask );
+	priv = ptask->priv;
+	tx_len = ptask->max_payload;
+	switch (ipv4_get_hdr_lf(&ptask->hdr)) {
+	case IPV4_HDR_UNFRAG:
+		bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_UNFRAG_HDR_SIZE);
+		bufhdr->w0 = htonl(ptask->hdr.w0);
+		break;
+
+	case IPV4_HDR_FIRSTFRAG:
+	case IPV4_HDR_INTFRAG:
+	case IPV4_HDR_LASTFRAG:
+		bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_FRAG_HDR_SIZE);
+		bufhdr->w0 = htonl(ptask->hdr.w0);
+		bufhdr->w1 = htonl(ptask->hdr.w1);
+		break;
+
+	default:
+		BUG();
+	}
+	if ( ptask->tx_type == IPV4_GASP ) {
+		u32 *packets;
+		int generation;
+		int nodeid;
+
+		/* ptask->generation may not have been set yet */
+		generation = priv->card->generation;
+		smp_rmb();
+		nodeid = priv->card->node_id;
+		packets = (u32 *)skb_push(ptask->skb, sizeof(u32)*2);
+		packets[0] = htonl(nodeid << 16 | (IPV4_GASP_SPECIFIER_ID>>8));
+		packets[1] = htonl((IPV4_GASP_SPECIFIER_ID & 0xFF) << 24 | IPV4_GASP_VERSION);
+		/* Link ptask before sending: the completion callback unlinks and may free it. */
+		spin_lock_irqsave(&priv->lock,flags);
+		list_add_tail ( &ptask->packet_list, &priv->broadcasted_list );
+		spin_unlock_irqrestore(&priv->lock,flags);
+		fw_send_request ( priv->card, &ptask->transaction, TCODE_STREAM_DATA,
+ fw_stream_packet_destination_id(3, BROADCAST_CHANNEL, 0),
+ generation, SCODE_100, 0ULL, ptask->skb->data, tx_len + 8, ipv4_write_complete, ptask );
+		return 0;
+	}
+	fw_debug("send_request (%p, %p, WRITE_BLOCK, %x, %x, %x, %llx, %p, %d, %p, %p\n",
+ priv->card, &ptask->transaction, ptask->dest_node, ptask->generation,
+ ptask->speed, (unsigned long long)ptask->fifo_addr, ptask->skb->data, tx_len,
+ ipv4_write_complete, ptask );
+	/* As above: ptask must be on the list before the request can complete. */
+	spin_lock_irqsave(&priv->lock,flags);
+	list_add_tail ( &ptask->packet_list, &priv->sent_list );
+	spin_unlock_irqrestore(&priv->lock,flags);
+	fw_send_request ( priv->card, &ptask->transaction,
+ TCODE_WRITE_BLOCK_REQUEST, ptask->dest_node, ptask->generation, ptask->speed,
+ ptask->fifo_addr, ptask->skb->data, tx_len, ipv4_write_complete, ptask );
+	netdev = priv->card->netdev;
+	netdev->trans_start = jiffies;
+	return 0;
+}
+
+static int ipv4_broadcast_start ( struct ipv4_priv *priv ) {
+	struct fw_iso_context *context;
+	int retval;
+	unsigned num_packets;
+	unsigned max_receive;
+	struct fw_iso_packet packet;
+	unsigned long offset;
+	unsigned u;
+	/* Set up the unicast FIFO handler and broadcast iso reception; returns 0 or the failing step's negative value. */
+
+#if 0 /* stefanr */
+	if ( priv->card->broadcast_channel != (BROADCAST_CHANNEL_VALID|BROADCAST_CHANNEL_INITIAL)) {
+		fw_notify ( "Invalid broadcast channel %x\n", priv->card->broadcast_channel );
+		/* FIXME: try again later? */
+		/* return -EINVAL; */
+	}
+#endif
+	if ( priv->local_fifo == INVALID_FIFO_ADDR ) {
+		struct fw_address_region region;
+
+		priv->handler.length = FIFO_SIZE;
+		priv->handler.address_callback = ipv4_receive_packet;
+		priv->handler.callback_data = priv;
+		/* FIXME: this is OHCI, but what about others? */
+		region.start = 0xffff00000000ULL;
+		region.end =   0xfffffffffffcULL;
+
+		retval = fw_core_add_address_handler ( &priv->handler, &region );
+		if ( retval < 0 )
+			goto failed_initial;
+		priv->local_fifo = priv->handler.offset;
+	}
+
+	/*
+	 * FIXME: rawiso limits us to PAGE_SIZE.  This only matters if we ever have
+	 * a machine with PAGE_SIZE < 4096
+	 */
+	max_receive = 1U << (priv->card->max_receive + 1); /* bytes per queued receive packet */
+	num_packets = ( ipv4_iso_page_count * PAGE_SIZE ) / max_receive; /* packets fitting in the iso buffer */
+	if ( ! priv->broadcast_rcv_context ) {
+		void **ptrptr;
+
+		context = fw_iso_context_create ( priv->card,
+ FW_ISO_CONTEXT_RECEIVE, BROADCAST_CHANNEL,
+ priv->card->link_speed, 8, ipv4_receive_broadcast, priv );
+		if (IS_ERR(context)) {
+			retval = PTR_ERR(context);
+			goto failed_context_create;
+		}
+		retval = fw_iso_buffer_init ( &priv->broadcast_rcv_buffer,
+ priv->card, ipv4_iso_page_count, DMA_FROM_DEVICE );
+		if ( retval < 0 )
+			goto failed_buffer_init;
+		ptrptr = kmalloc ( sizeof(void*)*num_packets, GFP_KERNEL );
+		if ( ! ptrptr ) {
+			retval = -ENOMEM;
+			goto failed_ptrs_alloc;
+		}
+		priv->broadcast_rcv_buffer_ptrs = ptrptr;
+		for ( u = 0; u < ipv4_iso_page_count; u++ ) {
+			void *ptr;
+			unsigned v;
+
+			ptr = kmap ( priv->broadcast_rcv_buffer.pages[u] );
+			for ( v = 0; v < num_packets / ipv4_iso_page_count; v++ ) /* one pointer per packet slot in this page */
+				*ptrptr++ = (void *)((char *)ptr + v * max_receive);
+		}
+		priv->broadcast_rcv_context = context;
+	} else
+		context = priv->broadcast_rcv_context;
+
+	packet.payload_length = max_receive;
+	packet.interrupt = 1;
+	packet.skip = 0;
+	packet.tag = 3;
+	packet.sy = 0;
+	packet.header_length = IPV4_GASP_OVERHEAD;
+	offset = 0;
+	for ( u = 0; u < num_packets; u++ ) {
+		retval = fw_iso_context_queue ( context, &packet,
+ &priv->broadcast_rcv_buffer, offset );
+		if ( retval < 0 )
+			goto failed_rcv_queue;
+		offset += max_receive;
+	}
+	priv->num_broadcast_rcv_ptrs = num_packets;
+	priv->rcv_buffer_size = max_receive;
+	priv->broadcast_rcv_next_ptr = 0U;
+	retval = fw_iso_context_start ( context, -1, 0, FW_ISO_CONTEXT_MATCH_ALL_TAGS ); /* ??? sync */
+	if ( retval < 0 )
+		goto failed_rcv_queue;
+	/* FIXME: adjust this when we know the max receive speeds of all other IP nodes on the bus. */
+	/* since we only xmt at S100 ??? */
+	priv->broadcast_xmt_max_payload = S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD - IPV4_UNFRAG_HDR_SIZE;
+	priv->broadcast_state = IPV4_BROADCAST_RUNNING;
+	return 0;
+
+ failed_rcv_queue: /* NOTE(review): the pages kmap()ed above are not kunmap()ed on this path - confirm */
+	kfree ( priv->broadcast_rcv_buffer_ptrs );
+	priv->broadcast_rcv_buffer_ptrs = NULL;
+ failed_ptrs_alloc:
+	fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card );
+ failed_buffer_init:
+	fw_iso_context_destroy ( context );
+	priv->broadcast_rcv_context = NULL;
+ failed_context_create:
+	fw_core_remove_address_handler ( &priv->handler );
+ failed_initial:
+	priv->local_fifo = INVALID_FIFO_ADDR; /* forces the handler to be registered again on the next attempt */
+	return retval;
+}
+
+/*
+ * ndo_open, called after an "ifup": start broadcast reception while the state
+ * is still IPV4_BROADCAST_ERROR, then enable the transmit queue.  Returns 0 or
+ * the error from ipv4_broadcast_start().
+ */
+static int ipv4_open(struct net_device *dev) {
+	struct ipv4_priv *priv = netdev_priv(dev);
+	int err = 0;
+
+	if (priv->broadcast_state == IPV4_BROADCAST_ERROR)
+		err = ipv4_broadcast_start(priv);
+	if (!err)
+		netif_start_queue(dev);
+	return err;
+}
+
+/*
+ * ndo_stop, called after an "ifdown": only stops the transmit queue.
+ * Flushing priv->wake with flush_scheduled_work() is not done (yet).
+ */
+static int ipv4_stop(struct net_device *netdev)
+{
+	netif_stop_queue(netdev);
+	return 0;
+}
+
+/* ndo_start_xmit: encapsulate skb and start sending it; on failure the packet is dropped. Always returns NETDEV_TX_OK. */
+static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct ipv4_ether_hdr hdr_buf;
+	struct ipv4_priv *priv = netdev_priv(netdev);
+	__be16 proto;
+	u16 dest_node;
+	enum ipv4_tx_type tx_type;
+	unsigned max_payload;
+	u16 dg_size;
+	u16 *datagram_label_ptr;
+	struct ipv4_packet_task *ptask;
+	struct ipv4_node *node = NULL;
+
+	ptask = kmem_cache_alloc(ipv4_packet_task_cache, GFP_ATOMIC);
+	if (ptask == NULL)
+		goto fail;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		goto fail;
+
+	/*
+	 * Get rid of the fake ipv4 header, but first make a copy.
+	 * We might need to rebuild the header on tx failure.
+	 */
+	memcpy(&hdr_buf, skb->data, sizeof(hdr_buf));
+	skb_pull(skb, sizeof(hdr_buf));
+
+	proto = hdr_buf.h_proto;
+	dg_size = skb->len; /* datagram size without the link-level header pulled above */
+
+	/*
+	 * Set the transmission type for the packet.  ARP packets and IP
+	 * broadcast packets are sent via GASP.
+	 */
+	if (   memcmp(hdr_buf.h_dest, netdev->broadcast, IPV4_ALEN) == 0
+	    || proto == htons(ETH_P_ARP)
+	    || (   proto == htons(ETH_P_IP)
+		&& IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)) ) ) {
+		/* fw_debug ( "transmitting arp or multicast packet\n" );*/
+		tx_type = IPV4_GASP;
+		dest_node = ALL_NODES;
+		max_payload = priv->broadcast_xmt_max_payload;
+		/* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD); */
+		datagram_label_ptr = &priv->broadcast_xmt_datagramlabel;
+		ptask->fifo_addr = INVALID_FIFO_ADDR;
+		ptask->generation = 0U; /* not used for GASP: ipv4_send_packet() reads the card's generation */
+		ptask->dest_node = 0U;
+		ptask->speed = 0;
+	} else {
+		__be64 guid = get_unaligned((u64 *)hdr_buf.h_dest);
+		u8 generation;
+
+		node = ipv4_node_find_by_guid(priv, be64_to_cpu(guid));
+		if (!node) {
+			fw_debug ( "Normal packet but no node\n" );
+			goto fail;
+		}
+
+		if (node->fifo == INVALID_FIFO_ADDR) {
+			fw_debug ( "Normal packet but no fifo addr\n" );
+			goto fail;
+		}
+
+		/* fw_debug ( "Transmitting normal packet to %x at %llxx\n", node->nodeid, node->fifo ); */
+		generation = node->generation; /* NOTE(review): local is u8 - confirm node->generation cannot exceed that */
+		dest_node = node->nodeid;
+		max_payload = node->max_payload;
+		/* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_FRAG_HDR_SIZE); */
+
+		datagram_label_ptr = &node->datagram_label;
+		tx_type = IPV4_WRREQ;
+		ptask->fifo_addr = node->fifo;
+		ptask->generation = generation;
+		ptask->dest_node = dest_node;
+		ptask->speed = node->xmt_speed;
+	}
+
+	/* If this is an ARP packet, convert it */
+	if (proto == htons(ETH_P_ARP)) {
+		/* Convert a standard ARP packet to 1394 ARP. The first 8 bytes (the entire
+		 * arphdr) is the same format as the ip1394 header, so they overlap.  The rest
+		 * needs to be munged a bit.  The remainder of the arphdr is formatted based
+		 * on hwaddr len and ipaddr len.  We know what they'll be, so it's easy to
+		 * judge.
+		 *
+		 * Now that the EUI is used for the hardware address all we need to do to make
+		 * this work for 1394 is to insert 2 quadlets that contain max_rec size,
+		 * speed, and unicast FIFO address information between the sender_unique_id
+		 * and the IP addresses.
+		 */
+		struct arphdr *arp = (struct arphdr *)skb->data;
+		unsigned char *arp_ptr = (unsigned char *)(arp + 1);
+		struct ipv4_arp *arp1394 = (struct ipv4_arp *)skb->data;
+		u32 ipaddr;
+
+		ipaddr = *(u32*)(arp_ptr + IPV4_ALEN);
+		arp1394->hw_addr_len    = 16;
+		arp1394->max_rec        = priv->card->max_receive;
+		arp1394->sspd		= priv->card->link_speed;
+		arp1394->fifo_hi	= htons(priv->local_fifo >> 32);
+		arp1394->fifo_lo        = htonl(priv->local_fifo & 0xFFFFFFFF);
+		arp1394->sip		= ipaddr;
+	}
+	if ( ipv4_max_xmt && max_payload > ipv4_max_xmt )
+		max_payload = ipv4_max_xmt;
+
+	ptask->hdr.w0 = 0;
+	ptask->hdr.w1 = 0;
+	ptask->skb = skb;
+	ptask->priv = priv;
+        ptask->tx_type = tx_type;
+	/* Does it all fit in one packet? */
+	if ( dg_size <= max_payload ) {
+		ipv4_make_uf_hdr(&ptask->hdr, be16_to_cpu(proto));
+		ptask->outstanding_pkts = 1;
+		max_payload = dg_size + IPV4_UNFRAG_HDR_SIZE;
+	} else {
+		u16 datagram_label;
+
+		max_payload -= IPV4_FRAG_OVERHEAD;
+		datagram_label = (*datagram_label_ptr)++;
+		ipv4_make_ff_hdr(&ptask->hdr, be16_to_cpu(proto), dg_size, datagram_label );
+		ptask->outstanding_pkts = DIV_ROUND_UP(dg_size, max_payload); /* number of fragments to send */
+		max_payload += IPV4_FRAG_HDR_SIZE;
+	}
+	ptask->max_payload = max_payload;
+	ipv4_send_packet ( ptask );
+	return NETDEV_TX_OK;
+
+ fail:
+	if (ptask)
+		kmem_cache_free(ipv4_packet_task_cache, ptask);
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	netdev->stats.tx_dropped++;
+	netdev->stats.tx_errors++;
+
+	/*
+	 * FIXME: According to a patch from 2003-02-26, "returning non-zero
+	 * causes serious problems" here, allegedly.  Before that patch,
+	 * -ERRNO was returned which is not appropriate under Linux 2.6.
+	 * Perhaps more needs to be done?  Stop the queue in serious
+	 * conditions and restart it elsewhere?
+	 */
+	return NETDEV_TX_OK;
+}
+
+/*
+ * ndo_tx_timeout: log the timeout.
+ * FIXME: a host (bus) reset is probably in order, but the call below is
+ * compiled out for now.  Should we increment the stat counters too?
+ */
+static void ipv4_tx_timeout(struct net_device *dev) {
+	struct ipv4_priv *priv = netdev_priv(dev);
+
+	fw_error("%s: Timeout, resetting host\n", dev->name);
+#if 0 /* stefanr */
+	fw_core_initiate_bus_reset(priv->card, 1);
+#endif
+}
+
+/*
+ * ndo_change_mtu: accept any MTU of at least 68.  The max_receive-based upper
+ * bound is compiled out: it is not actually true, because we can fragment packets at the firewire layer.
+ */
+static int ipv4_change_mtu ( struct net_device *dev, int new_mtu ) {
+#if 0
+	struct ipv4_priv *priv = netdev_priv(dev);
+	int max_mtu = (1 << (priv->card->max_receive + 1))
+		      - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD;
+#endif
+
+	if (new_mtu < 68)
+		return -EINVAL;
+#if 0
+	if (new_mtu > max_mtu) {
+		fw_notify ( "%s: Local node constrains MTU to %d\n", dev->name, max_mtu);
+		return -ERANGE;
+	}
+#endif
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/* ethtool get_drvinfo: report driver name and bus type with copies bounded by the field sizes. */
+static void ipv4_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) {
+	strlcpy(info->driver, ipv4_driver_name, sizeof(info->driver));
+	strlcpy(info->bus_info, "ieee1394", sizeof(info->bus_info)); /* FIXME provide more detail? */
+}
+
+static const struct ethtool_ops ipv4_ethtool_ops = {	/* constant table, like ipv4_netdev_ops */
+	.get_drvinfo = ipv4_get_drvinfo,
+};
+
+static const struct net_device_ops ipv4_netdev_ops = {	/* installed by ipv4_init_dev() */
+	.ndo_open	= ipv4_open,		/* after "ifup" */
+	.ndo_stop	= ipv4_stop,		/* after "ifdown" */
+	.ndo_start_xmit	= ipv4_tx,
+	.ndo_change_mtu	= ipv4_change_mtu,
+	.ndo_tx_timeout	= ipv4_tx_timeout,
+};
+
+/* alloc_netdev() setup callback: fill in the link-layer constants and the ops tables. */
+static void ipv4_init_dev ( struct net_device *dev ) {
+	dev->type = ARPHRD_IEEE1394;
+	dev->addr_len = IPV4_ALEN;
+	dev->hard_header_len = IPV4_HLEN;
+	dev->flags = IFF_BROADCAST | IFF_MULTICAST;
+	dev->features = NETIF_F_HIGHDMA;
+	dev->watchdog_timeo = IPV4_TIMEOUT;
+	/* FIXME: This value was copied from ether_setup(). Is it too much? */
+	dev->tx_queue_len = 1000;
+
+	dev->header_ops = &ipv4_header_ops;
+	dev->netdev_ops = &ipv4_netdev_ops;
+	SET_ETHTOOL_OPS(dev, &ipv4_ethtool_ops);
+}
+
+/*
+ * Driver probe.  For a remote unit only a node entry is added; for the local
+ * unit a net_device is allocated, initialised, published in card->netdev and
+ * registered.  Returns 0 or a negative error code.
+ */
+static int ipv4_probe ( struct device *dev ) {
+	struct fw_unit * unit;
+	struct fw_device *device;
+	struct fw_card *card;
+	struct net_device *netdev;
+	struct ipv4_priv *priv;
+	unsigned max_mtu;
+	__be64 guid;
+	int ret;
+
+	fw_debug("ipv4 Probing\n" );
+	unit = fw_unit ( dev );
+	device = fw_device ( unit->device.parent );
+	card = device->card;
+
+	if ( ! device->is_local ) {
+		fw_debug ( "Non-local, adding remote node entry\n" );
+		return ipv4_node_new ( card, device );
+	}
+	fw_debug("ipv4 Local: adding netdev\n" );
+	netdev = alloc_netdev ( sizeof(*priv), "fw-ipv4-%d", ipv4_init_dev );
+	if ( netdev == NULL) {
+		fw_error( "Out of memory\n");
+		return -ENOMEM;
+	}
+
+	SET_NETDEV_DEV(netdev, card->device);
+	priv = netdev_priv(netdev);
+
+	spin_lock_init(&priv->lock);
+	priv->broadcast_state = IPV4_BROADCAST_ERROR;
+	priv->broadcast_rcv_context = NULL;
+	priv->broadcast_xmt_max_payload = 0;
+	priv->broadcast_xmt_datagramlabel = 0;
+	priv->local_fifo = INVALID_FIFO_ADDR;
+	INIT_LIST_HEAD(&priv->packet_list);
+	INIT_LIST_HEAD(&priv->broadcasted_list);
+	INIT_LIST_HEAD(&priv->sent_list );
+	priv->card = card;
+
+	/*
+	 * Use the RFC 2734 default 1500 octets or the maximum payload
+	 * as initial MTU
+	 */
+	max_mtu = (1 << (card->max_receive + 1))
+		  - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD;
+	netdev->mtu = min(1500U, max_mtu);
+
+	/* Set our hardware address while we're at it */
+	guid = cpu_to_be64(card->guid);
+	memcpy(netdev->dev_addr, &guid, sizeof(u64));
+	memset(netdev->broadcast, 0xff, sizeof(u64));
+	/*
+	 * Publish the netdev before registering it: once registered the
+	 * interface can be opened, and the receive path reads card->netdev.
+	 */
+	card->netdev = netdev;
+	ret = register_netdev ( netdev );
+	if ( ret ) {
+		fw_error ( "Cannot register the driver\n");
+		card->netdev = NULL;
+		free_netdev ( netdev );
+		return ret;
+	}
+	fw_notify ( "%s: IPv4 over Firewire on device %016llx\n",
+ netdev->name, (unsigned long long)card->guid );
+	return 0;
+}
+
+
+/*
+ * Driver remove callback (installed in ipv4_driver).
+ *
+ * For a unit on a remote device, only the remote node entry is deleted.
+ * For the local device, the netdev stored in card->netdev (if any) is
+ * unregistered and the resources reachable from its private data are
+ * released before the netdev itself is freed.  Always returns 0.
+ */
+static int ipv4_remove ( struct device *dev ) {
+	struct fw_unit * unit;
+	struct fw_device *device;
+	struct fw_card *card;
+	struct net_device *netdev;
+	struct ipv4_priv *priv;
+	struct ipv4_node *node;
+	struct ipv4_partial_datagram *pd, *pd_next;
+	struct ipv4_packet_task *ptask, *pt_next;
+
+	fw_debug("ipv4 Removing\n" );
+	unit = fw_unit ( dev );
+	device = fw_device ( unit->device.parent );
+	card = device->card;
+
+	if ( ! device->is_local ) {
+		fw_debug ( "Node %x is non-local, removing remote node entry\n", device->node_id );
+		ipv4_node_delete ( card, device );
+		return 0;
+	}
+	netdev = card->netdev;
+	if ( netdev ) {
+		fw_debug ( "Node %x is local: deleting netdev\n", device->node_id );
+		priv = netdev_priv ( netdev );
+		unregister_netdev ( netdev );
+		fw_debug ( "unregistered\n" );
+		/* The address handler is only removed when a FIFO address is recorded. */
+		if ( priv->local_fifo != INVALID_FIFO_ADDR )
+			fw_core_remove_address_handler ( &priv->handler );
+		fw_debug ( "address handler gone\n" );
+		/* Stop the broadcast receive context, then release its buffer and the context. */
+		if ( priv->broadcast_rcv_context ) {
+			fw_iso_context_stop ( priv->broadcast_rcv_context );
+			fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card );
+			fw_iso_context_destroy ( priv->broadcast_rcv_context );
+			fw_debug ( "rcv stopped\n" );
+		}
+		/*
+		 * Free every packet task (and its skb) still sitting on the three
+		 * queues.  Entries are not unlinked; the list heads are members of
+		 * priv, which is released together with the netdev below.
+		 */
+		list_for_each_entry_safe( ptask, pt_next, &priv->packet_list, packet_list ) {
+			dev_kfree_skb_any ( ptask->skb );
+			kmem_cache_free( ipv4_packet_task_cache, ptask );
+		}
+		list_for_each_entry_safe( ptask, pt_next, &priv->broadcasted_list, packet_list ) {
+			dev_kfree_skb_any ( ptask->skb );
+			kmem_cache_free( ipv4_packet_task_cache, ptask );
+		}
+		list_for_each_entry_safe( ptask, pt_next, &priv->sent_list, packet_list ) {
+			dev_kfree_skb_any ( ptask->skb );
+			kmem_cache_free( ipv4_packet_task_cache, ptask );
+		}
+		fw_debug ( "lists emptied\n" );
+		/*
+		 * The remote node entries hang off the card, not the netdev, so they
+		 * are kept; only their partial datagrams are dropped and their FIFO
+		 * address is marked invalid.
+		 */
+		list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) {
+			if ( node->pdg_size ) {
+				list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list )
+					ipv4_pd_delete ( pd );
+				node->pdg_size = 0;
+			}
+			node->fifo = INVALID_FIFO_ADDR;
+		}
+		fw_debug ( "nodes cleaned up\n" );
+		free_netdev ( netdev );
+		card->netdev = NULL;
+		fw_debug ( "done\n" );
+	}
+	return 0;
+}
+
+/*
+ * fw_driver update callback (installed as ipv4_driver.update).
+ *
+ * For a unit on a remote device, looks up the matching remote node entry by
+ * GUID and refreshes its cached generation and node ID from the fw_device.
+ * Nothing is done for the local device.
+ */
+static void ipv4_update ( struct fw_unit *unit ) {
+	struct fw_device *device;
+	struct fw_card *card;
+
+	fw_debug ( "ipv4_update unit %p\n", unit );
+	device = fw_device ( unit->device.parent );
+	card = device->card;
+	if ( ! device->is_local ) {
+		struct ipv4_node *node;
+		u64 guid;
+		struct net_device *netdev;
+		struct ipv4_priv *priv;
+
+		netdev = card->netdev;
+		if ( netdev ) {
+			priv = netdev_priv ( netdev );
+			/* GUID is assembled from config ROM quadlets 3 (high half) and 4 (low half). */
+			guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+			node = ipv4_node_find_by_guid ( priv, guid );
+			if ( ! node ) {
+				fw_error ( "ipv4_update: no node for device %llx\n", guid );
+				return;
+			}
+			fw_debug ( "Non-local, updating remote node entry for guid %llx old generation %x, old nodeid %x\n", guid, node->generation, node->nodeid );
+			node->generation = device->generation;
+			/*
+			 * NOTE(review): rmb() is a read barrier, yet it sits between
+			 * two stores; a write barrier may have been intended.  Confirm
+			 * against the code that reads node->generation and node->nodeid.
+			 */
+			rmb();
+			node->nodeid = device->node_id;
+			fw_debug ( "New generation %x, new nodeid %x\n", node->generation, node->nodeid );
+		} else
+			fw_error ( "nonlocal, but no netdev?  How can that be?\n" );
+	} else {
+		/* FIXME: What do we need to do on bus reset? */
+		fw_debug ( "Local, doing nothing\n" );
+	}
+}
+
+/*
+ * Driver description registered by ipv4_init(): binds ipv4_probe() and
+ * ipv4_remove() on the FireWire bus for units listed in ipv4_id_table,
+ * and ipv4_update() as the update hook.
+ */
+static struct fw_driver ipv4_driver = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = ipv4_driver_name,
+		.bus = &fw_bus_type,
+		.probe = ipv4_probe,
+		.remove = ipv4_remove,
+	},
+	.update = ipv4_update,
+	.id_table = ipv4_id_table,
+};
+
+/*
+ * Module init: adds the unit directory descriptor, creates the slab cache
+ * for packet tasks and registers the driver.
+ *
+ * Each step is checked; on failure the steps already completed are undone
+ * in reverse order and the error is returned, so a failed load leaves
+ * nothing behind.  Returns 0 on success or a negative errno.
+ */
+static int __init ipv4_init(void)
+{
+	int err;
+
+	err = fw_core_add_descriptor(&ipv4_unit_directory);
+	if (err < 0) {
+		fw_error("Failed to add descriptor\n");
+		return err;
+	}
+
+	ipv4_packet_task_cache = kmem_cache_create("packet_task",
+			sizeof(struct ipv4_packet_task), 0, 0, NULL);
+	if (!ipv4_packet_task_cache) {
+		/* Packet tasks are allocated from this cache; refuse to load without it. */
+		err = -ENOMEM;
+		goto out_descriptor;
+	}
+
+	fw_debug("Adding ipv4 module\n" );
+	err = driver_register(&ipv4_driver.driver);
+	if (err)
+		goto out_cache;
+
+	return 0;
+
+out_cache:
+	kmem_cache_destroy(ipv4_packet_task_cache);
+out_descriptor:
+	fw_core_remove_descriptor(&ipv4_unit_directory);
+	return err;
+}
+
+/*
+ * Module exit: removes the unit directory descriptor, unregisters the
+ * driver and releases the packet task slab cache created in ipv4_init().
+ */
+static void __exit ipv4_cleanup(void)
+{
+	fw_core_remove_descriptor(&ipv4_unit_directory);
+	fw_debug("Removing ipv4 module\n" );
+	driver_unregister(&ipv4_driver.driver);
+	/*
+	 * Destroy the cache only after the driver is gone: ipv4_remove()
+	 * still returns packet tasks to it.  The pointer is checked because
+	 * cache creation may have failed at init time.
+	 */
+	if (ipv4_packet_task_cache)
+		kmem_cache_destroy(ipv4_packet_task_cache);
+}
+
+module_init(ipv4_init);
+module_exit(ipv4_cleanup);
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index e584b7215e8b..d44f47d3b2d9 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -3,6 +3,7 @@
 
 #include <linux/completion.h>
 #include <linux/device.h>
+#include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/kref.h>
 #include <linux/list.h>
@@ -130,6 +131,13 @@ struct fw_card {
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
 	u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
+	/* Only non-NULL if firewire-ipv4 is active on this card. */
+	void *netdev;
+	/*
+	 * The nodes get probed before the card, so we need a place to store
+	 * them independent of card->netdev
+	 */
+	struct list_head ipv4_nodes;
 };
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
@@ -355,4 +363,90 @@ int fw_run_transaction(struct fw_card *card, int tcode, int destination_id,
 		       int generation, int speed, unsigned long long offset,
 		       void *payload, size_t length);
 
+/*
+ * Pack the three stream packet fields into a single int:
+ * tag shifted left by 14, channel shifted left by 8, sy in the low bits.
+ */
+static inline int fw_stream_packet_destination_id(int tag, int channel, int sy)
+{
+	int id = sy;
+
+	id |= channel << 8;
+	id |= tag << 14;
+
+	return id;
+}
+
+/*
+ * A block of config ROM data that a driver hands to the core with
+ * fw_core_add_descriptor() and withdraws with fw_core_remove_descriptor().
+ */
+struct fw_descriptor {
+	struct list_head link;	/* list linkage; presumably owned by the core — TODO confirm */
+	size_t length;		/* number of u32 entries in @data (firewire net sets it with ARRAY_SIZE) */
+	u32 immediate;		/* NOTE(review): meaning not visible here — confirm in core-card.c */
+	u32 key;
+	const u32 *data;	/* the descriptor's quadlets; not modified through this pointer */
+};
+
+int fw_core_add_descriptor(struct fw_descriptor *desc);
+void fw_core_remove_descriptor(struct fw_descriptor *desc);
+
+/*
+ * The iso packet format allows for an immediate header/payload part
+ * stored in 'header' immediately after the packet info plus an
+ * indirect payload part that is pointed to by the 'payload' field.
+ * Applications can use one or the other or both to implement simple
+ * low-bandwidth streaming (e.g. audio) or more advanced
+ * scatter-gather streaming (e.g. assembling video frame automatically).
+ */
+struct fw_iso_packet {
+	u16 payload_length;	/* Length of indirect payload. */
+	u32 interrupt:1;	/* Generate interrupt on this packet */
+	u32 skip:1;		/* Set to not send packet at all. */
+	u32 tag:2;
+	u32 sy:4;
+	u32 header_length:8;	/* Length of immediate header. */
+	u32 header[0];		/* Immediate header data; zero-length trailing array */
+};
+
+/* Iso context kinds; presumably the 'type' of a struct fw_iso_context — TODO confirm. */
+#define FW_ISO_CONTEXT_TRANSMIT	0
+#define FW_ISO_CONTEXT_RECEIVE	1
+
+/*
+ * Tag match flags: one bit per tag value 0..3, with MATCH_ALL_TAGS being
+ * the OR of all four.  Presumably passed as the 'tags' argument of
+ * fw_iso_context_start() — TODO confirm.
+ */
+#define FW_ISO_CONTEXT_MATCH_TAG0	 1
+#define FW_ISO_CONTEXT_MATCH_TAG1	 2
+#define FW_ISO_CONTEXT_MATCH_TAG2	 4
+#define FW_ISO_CONTEXT_MATCH_TAG3	 8
+#define FW_ISO_CONTEXT_MATCH_ALL_TAGS	15
+
+/*
+ * An iso buffer is just a set of pages mapped for DMA in the
+ * specified direction.  Since the pages are to be used for DMA, they
+ * are not mapped into the kernel virtual address space.  We store the
+ * DMA address in the page private. The helper function
+ * fw_iso_buffer_map() will map the pages into a given vma.
+ */
+struct fw_iso_buffer {
+	enum dma_data_direction direction;	/* DMA direction the pages are mapped for */
+	struct page **pages;			/* the pages making up the buffer */
+	int page_count;				/* presumably the number of entries in @pages — TODO confirm */
+};
+
+int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card,
+		       int page_count, enum dma_data_direction direction);
+void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card);
+
+struct fw_iso_context;
+/*
+ * Callback type stored in fw_iso_context.callback.  The last argument is
+ * presumably the context's callback_data — TODO confirm in the core.
+ */
+typedef void (*fw_iso_callback_t)(struct fw_iso_context *context,
+				  u32 cycle, size_t header_length,
+				  void *header, void *data);
+/*
+ * An isochronous context.  The fields mirror the parameters of
+ * fw_iso_context_create(): card, type, channel, speed, header_size,
+ * callback and callback_data.
+ */
+struct fw_iso_context {
+	struct fw_card *card;
+	int type;
+	int channel;
+	int speed;
+	size_t header_size;
+	fw_iso_callback_t callback;
+	void *callback_data;
+};
+
+struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
+		int type, int channel, int speed, size_t header_size,
+		fw_iso_callback_t callback, void *callback_data);
+int fw_iso_context_queue(struct fw_iso_context *ctx,
+			 struct fw_iso_packet *packet,
+			 struct fw_iso_buffer *buffer,
+			 unsigned long payload);
+int fw_iso_context_start(struct fw_iso_context *ctx,
+			 int cycle, int sync, int tags);
+int fw_iso_context_stop(struct fw_iso_context *ctx);
+void fw_iso_context_destroy(struct fw_iso_context *ctx);
+
 #endif /* _LINUX_FIREWIRE_H */
-- 
cgit v1.2.3-71-gd317


From f91e3bd842ec6f5cea245993926ee8ff26250467 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 7 Jun 2009 22:57:53 +0200
Subject: firewire: net: style changes

Change names of types, variables, functions.
Omit debug code.
Use get_unaligned*, put_unaligned*.
Annotate big endian data.
Handle errors in __init.
Change whitespace.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c |    2 +-
 drivers/firewire/net.c       | 2041 ++++++++++++++++++++----------------------
 include/linux/firewire.h     |    9 +-
 3 files changed, 969 insertions(+), 1083 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index cdab32b20675..8c45e43da7c5 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -430,7 +430,7 @@ void fw_card_initialize(struct fw_card *card,
 
 	INIT_DELAYED_WORK(&card->work, fw_card_bm_work);
 	card->netdev = NULL;
-	INIT_LIST_HEAD(&card->ipv4_nodes);
+	INIT_LIST_HEAD(&card->peer_list);
 }
 EXPORT_SYMBOL(fw_card_initialize);
 
diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 15353886bd80..ba6f924b1b13 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -6,6 +6,7 @@
  * based on eth1394 by Ben Collins et al
  */
 
+#include <linux/bug.h>
 #include <linux/device.h>
 #include <linux/ethtool.h>
 #include <linux/firewire.h>
@@ -13,6 +14,7 @@
 #include <linux/highmem.h>
 #include <linux/in.h>
 #include <linux/ip.h>
+#include <linux/jiffies.h>
 #include <linux/mod_devicetable.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
@@ -22,181 +24,109 @@
 #include <asm/unaligned.h>
 #include <net/arp.h>
 
-/* Things to potentially make runtime cofigurable */
-/* must be at least as large as our maximum receive size */
-#define FIFO_SIZE 4096
-/* Network timeout in glibbles */
-#define IPV4_TIMEOUT       100000
+#define FWNET_MAX_FRAGMENTS	25	/* arbitrary limit */
+#define FWNET_ISO_PAGE_COUNT	(PAGE_SIZE < 16 * 1024 ? 4 : 2)
 
-/* Runitme configurable paramaters */
-static int ipv4_mpd = 25;
-static int ipv4_max_xmt = 0;
-/* 16k for receiving arp and broadcast packets.  Enough? */
-static int ipv4_iso_page_count = 4;
+#define IEEE1394_BROADCAST_CHANNEL	31
+#define IEEE1394_ALL_NODES		(0xffc0 | 0x003f)
+#define IEEE1394_MAX_PAYLOAD_S100	512
+#define FWNET_NO_FIFO_ADDR		(~0ULL)
 
-MODULE_AUTHOR("Jay Fenlason (fenlason@redhat.com)");
-MODULE_DESCRIPTION("Firewire IPv4 Driver (IPv4-over-IEEE1394 as per RFC 2734)");
-MODULE_LICENSE("GPL");
-MODULE_DEVICE_TABLE(ieee1394, ipv4_id_table);
-module_param_named(max_partial_datagrams, ipv4_mpd, int, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(max_partial_datagrams, "Maximum number of received"
- " incomplete fragmented datagrams (default = 25).");
-
-/* Max xmt is useful for forcing fragmentation, which makes testing easier. */
-module_param_named(max_transmit, ipv4_max_xmt, int, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(max_transmit, "Maximum datagram size to transmit"
- " (larger datagrams will be fragmented) (default = 0 (use hardware defaults).");
-
-/* iso page count controls how many pages will be used for receiving broadcast packets. */
-module_param_named(iso_pages, ipv4_iso_page_count, int, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(iso_pages, "Number of pages to use for receiving broadcast packets"
- " (default = 4).");
-
-/* uncomment this line to do debugging */
-#define fw_debug(s, args...) printk(KERN_DEBUG KBUILD_MODNAME ": " s, ## args)
-
-/* comment out these lines to do debugging. */
-/* #undef fw_debug */
-/* #define fw_debug(s...) */
-/* #define print_hex_dump(l...) */
-
-/* Define a fake hardware header format for the networking core.  Note that
- * header size cannot exceed 16 bytes as that is the size of the header cache.
- * Also, we do not need the source address in the header so we omit it and
- * keep the header to under 16 bytes */
-#define IPV4_ALEN (8)
-/* This must equal sizeof(struct ipv4_ether_hdr) */
-#define IPV4_HLEN (10)
-
-/* FIXME: what's a good size for this? */
-#define INVALID_FIFO_ADDR (u64)~0ULL
-
-/* Things specified by standards */
-#define BROADCAST_CHANNEL 31
-
-#define S100_BUFFER_SIZE 512
-#define MAX_BUFFER_SIZE 4096
-
-#define IPV4_GASP_SPECIFIER_ID	0x00005EU
-#define IPV4_GASP_VERSION	0x00000001U
-
-#define IPV4_GASP_OVERHEAD (2 * sizeof(u32)) /* for GASP header */
-
-#define IPV4_UNFRAG_HDR_SIZE	sizeof(u32)
-#define IPV4_FRAG_HDR_SIZE	(2 * sizeof(u32))
-#define IPV4_FRAG_OVERHEAD	sizeof(u32)
-
-#define ALL_NODES (0xffc0 | 0x003f)
-
-#define IPV4_HDR_UNFRAG		0	/* unfragmented		*/
-#define IPV4_HDR_FIRSTFRAG	1	/* first fragment	*/
-#define IPV4_HDR_LASTFRAG	2	/* last fragment	*/
-#define IPV4_HDR_INTFRAG	3	/* interior fragment	*/
-
-/* Our arp packet (ARPHRD_IEEE1394) */
-/* FIXME: note that this is probably bogus on weird-endian machines */
-struct ipv4_arp {
-	u16 hw_type;		/* 0x0018	*/
-	u16 proto_type;		/* 0x0806       */
-	u8 hw_addr_len;		/* 16		*/
-	u8 ip_addr_len;         /* 4		*/
-	u16 opcode;	        /* ARP Opcode	*/
-	/* Above is exactly the same format as struct arphdr */
-
-	u64 s_uniq_id;		/* Sender's 64bit EUI			*/
-	u8 max_rec;             /* Sender's max packet size		*/
-	u8 sspd;		/* Sender's max speed			*/
-	u16 fifo_hi;            /* hi 16bits of sender's FIFO addr	*/
-	u32 fifo_lo;            /* lo 32bits of sender's FIFO addr	*/
-	u32 sip;		/* Sender's IP Address			*/
-	u32 tip;		/* IP Address of requested hw addr	*/
-} __attribute__((packed));
+#define IANA_SPECIFIER_ID		0x00005eU
+#define RFC2734_SW_VERSION		0x000001U
 
-struct ipv4_ether_hdr {
-	unsigned char	h_dest[IPV4_ALEN];	/* destination address */
-	unsigned short  h_proto;                /* packet type ID field */
-}  __attribute__((packed));
+#define IEEE1394_GASP_HDR_SIZE	8
 
-static inline struct ipv4_ether_hdr *ipv4_ether_hdr(const struct sk_buff *skb)
-{
-	return (struct ipv4_ether_hdr *)skb_mac_header(skb);
-}
+#define RFC2374_UNFRAG_HDR_SIZE	4
+#define RFC2374_FRAG_HDR_SIZE	8
+#define RFC2374_FRAG_OVERHEAD	4
 
-enum ipv4_tx_type {
-	IPV4_UNKNOWN = 0,
-	IPV4_GASP = 1,
-	IPV4_WRREQ = 2,
-};
+#define RFC2374_HDR_UNFRAG	0	/* unfragmented		*/
+#define RFC2374_HDR_FIRSTFRAG	1	/* first fragment	*/
+#define RFC2374_HDR_LASTFRAG	2	/* last fragment	*/
+#define RFC2374_HDR_INTFRAG	3	/* interior fragment	*/
 
-enum ipv4_broadcast_state {
-	IPV4_BROADCAST_ERROR,
-	IPV4_BROADCAST_RUNNING,
-	IPV4_BROADCAST_STOPPED,
-};
+#define RFC2734_HW_ADDR_LEN	16
 
-#define ipv4_get_hdr_lf(h)		(((h)->w0&0xC0000000)>>30)
-#define ipv4_get_hdr_ether_type(h)	(((h)->w0&0x0000FFFF)    )
-#define ipv4_get_hdr_dg_size(h)		(((h)->w0&0x0FFF0000)>>16)
-#define ipv4_get_hdr_fg_off(h)		(((h)->w0&0x00000FFF)    )
-#define ipv4_get_hdr_dgl(h)		(((h)->w1&0xFFFF0000)>>16)
+struct rfc2734_arp {
+	__be16 hw_type;		/* 0x0018	*/
+	__be16 proto_type;	/* 0x0806       */
+	u8 hw_addr_len;		/* 16		*/
+	u8 ip_addr_len;		/* 4		*/
+	__be16 opcode;		/* ARP Opcode	*/
+	/* Above is exactly the same format as struct arphdr */
 
-#define ipv4_set_hdr_lf(lf)		(( lf)<<30)
-#define ipv4_set_hdr_ether_type(et)	(( et)    )
-#define ipv4_set_hdr_dg_size(dgs)	((dgs)<<16)
-#define ipv4_set_hdr_fg_off(fgo)	((fgo)    )
+	__be64 s_uniq_id;	/* Sender's 64bit EUI			*/
+	u8 max_rec;		/* Sender's max packet size		*/
+	u8 sspd;		/* Sender's max speed			*/
+	__be16 fifo_hi;		/* hi 16bits of sender's FIFO addr	*/
+	__be32 fifo_lo;		/* lo 32bits of sender's FIFO addr	*/
+	__be32 sip;		/* Sender's IP Address			*/
+	__be32 tip;		/* IP Address of requested hw addr	*/
+} __attribute__((packed));
 
-#define ipv4_set_hdr_dgl(dgl)		((dgl)<<16)
+/* This header format is specific to this driver implementation. */
+#define FWNET_ALEN	8
+#define FWNET_HLEN	10
+struct fwnet_header {
+	u8 h_dest[FWNET_ALEN];	/* destination address */
+	__be16 h_proto;		/* packet type ID field */
+} __attribute__((packed));
 
-struct ipv4_hdr {
+/* IPv4 and IPv6 encapsulation header */
+struct rfc2734_header {
 	u32 w0;
 	u32 w1;
 };
 
-static inline void ipv4_make_uf_hdr( struct ipv4_hdr *hdr, unsigned ether_type) {
-	hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_UNFRAG)
-		   |ipv4_set_hdr_ether_type(ether_type);
-	fw_debug ( "Setting unfragmented header %p to %x\n", hdr, hdr->w0 );
-}
+#define fwnet_get_hdr_lf(h)		(((h)->w0 & 0xc0000000) >> 30)
+#define fwnet_get_hdr_ether_type(h)	(((h)->w0 & 0x0000ffff))
+#define fwnet_get_hdr_dg_size(h)	(((h)->w0 & 0x0fff0000) >> 16)
+#define fwnet_get_hdr_fg_off(h)		(((h)->w0 & 0x00000fff))
+#define fwnet_get_hdr_dgl(h)		(((h)->w1 & 0xffff0000) >> 16)
 
-static inline void ipv4_make_ff_hdr ( struct ipv4_hdr *hdr, unsigned ether_type, unsigned dg_size, unsigned dgl ) {
-	hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_FIRSTFRAG)
-		   |ipv4_set_hdr_dg_size(dg_size)
-		   |ipv4_set_hdr_ether_type(ether_type);
-	hdr->w1 = ipv4_set_hdr_dgl(dgl);
-	fw_debug ( "Setting fragmented header %p to first_frag %x,%x (et %x, dgs %x, dgl %x)\n", hdr, hdr->w0, hdr->w1,
- ether_type, dg_size, dgl );
-}
+#define fwnet_set_hdr_lf(lf)		((lf)  << 30)
+#define fwnet_set_hdr_ether_type(et)	(et)
+#define fwnet_set_hdr_dg_size(dgs)	((dgs) << 16)
+#define fwnet_set_hdr_fg_off(fgo)	(fgo)
 
-static inline void ipv4_make_sf_hdr ( struct ipv4_hdr *hdr, unsigned lf, unsigned dg_size, unsigned fg_off, unsigned dgl) {
-	hdr->w0 = ipv4_set_hdr_lf(lf)
-		 |ipv4_set_hdr_dg_size(dg_size)
-		 |ipv4_set_hdr_fg_off(fg_off);
-	hdr->w1 = ipv4_set_hdr_dgl(dgl);
-	fw_debug ( "Setting fragmented header %p to %x,%x (lf %x, dgs %x, fo %x dgl %x)\n",
- hdr, hdr->w0, hdr->w1,
- lf, dg_size, fg_off, dgl );
-}
+#define fwnet_set_hdr_dgl(dgl)		((dgl) << 16)
 
-/* End of IP1394 headers */
+static inline void fwnet_make_uf_hdr(struct rfc2734_header *hdr,
+		unsigned ether_type)
+{
+	hdr->w0 = fwnet_set_hdr_lf(RFC2374_HDR_UNFRAG)
+		  | fwnet_set_hdr_ether_type(ether_type);
+}
 
-/* Fragment types */
-#define ETH1394_HDR_LF_UF	0	/* unfragmented		*/
-#define ETH1394_HDR_LF_FF	1	/* first fragment	*/
-#define ETH1394_HDR_LF_LF	2	/* last fragment	*/
-#define ETH1394_HDR_LF_IF	3	/* interior fragment	*/
+static inline void fwnet_make_ff_hdr(struct rfc2734_header *hdr,
+		unsigned ether_type, unsigned dg_size, unsigned dgl)
+{
+	hdr->w0 = fwnet_set_hdr_lf(RFC2374_HDR_FIRSTFRAG)
+		  | fwnet_set_hdr_dg_size(dg_size)
+		  | fwnet_set_hdr_ether_type(ether_type);
+	hdr->w1 = fwnet_set_hdr_dgl(dgl);
+}
 
-#define IP1394_HW_ADDR_LEN	16	/* As per RFC		*/
+static inline void fwnet_make_sf_hdr(struct rfc2734_header *hdr,
+		unsigned lf, unsigned dg_size, unsigned fg_off, unsigned dgl)
+{
+	hdr->w0 = fwnet_set_hdr_lf(lf)
+		  | fwnet_set_hdr_dg_size(dg_size)
+		  | fwnet_set_hdr_fg_off(fg_off);
+	hdr->w1 = fwnet_set_hdr_dgl(dgl);
+}
 
 /* This list keeps track of what parts of the datagram have been filled in */
-struct ipv4_fragment_info {
-        struct list_head fragment_info;
+struct fwnet_fragment_info {
+	struct list_head fi_link;
 	u16 offset;
 	u16 len;
 };
 
-struct ipv4_partial_datagram {
-	struct list_head pdg_list;
-	struct list_head fragment_info;
+struct fwnet_partial_datagram {
+	struct list_head pd_link;
+	struct list_head fi_list;
 	struct sk_buff *skb;
 	/* FIXME Why not use skb->data? */
 	char *pbuf;
@@ -208,40 +138,43 @@ struct ipv4_partial_datagram {
 /*
  * We keep one of these for each IPv4 capable device attached to a fw_card.
  * The list of them is stored in the fw_card structure rather than in the
- * ipv4_priv because the remote IPv4 nodes may be probed before the card is,
- * so we need a place to store them before the ipv4_priv structure is
+ * fwnet_device because the remote IPv4 nodes may be probed before the card is,
+ * so we need a place to store them before the fwnet_device structure is
  * allocated.
  */
-struct ipv4_node {
-	struct list_head ipv4_nodes;
-	/* guid of the remote node */
+struct fwnet_peer {
+	struct list_head peer_link;
+	/* guid of the remote peer */
 	u64 guid;
-	/* FIFO address to transmit datagrams to, or INVALID_FIFO_ADDR */
+	/* FIFO address to transmit datagrams to, or FWNET_NO_FIFO_ADDR */
 	u64 fifo;
 
 	spinlock_t pdg_lock;	/* partial datagram lock		*/
-	/* List of partial datagrams received from this node */
-	struct list_head pdg_list;
-	/* Number of entries in pdg_list at the moment */
+	/* List of partial datagrams received from this peer */
+	struct list_head pd_list;
+	/* Number of entries in pd_list at the moment */
 	unsigned pdg_size;
 
-	/* max payload to transmit to this remote node */
-	/* This already includes the IPV4_FRAG_HDR_SIZE overhead */
+	/* max payload to transmit to this remote peer */
+	/* This already includes the RFC2374_FRAG_HDR_SIZE overhead */
 	u16 max_payload;
 	/* outgoing datagram label */
 	u16 datagram_label;
-	/* Current node_id of the remote node */
-	u16 nodeid;
-	/* current generation of the remote node */
+	/* Current node_id of the remote peer */
+	u16 node_id;
+	/* current generation of the remote peer */
 	u8 generation;
-	/* max speed that this node can receive at */
+	/* max speed that this peer can receive at */
 	u8 xmt_speed;
 };
 
-struct ipv4_priv {
+struct fwnet_device {
 	spinlock_t lock;
-
-	enum ipv4_broadcast_state broadcast_state;
+	enum {
+		FWNET_BROADCAST_ERROR,
+		FWNET_BROADCAST_RUNNING,
+		FWNET_BROADCAST_STOPPED,
+	} broadcast_state;
 	struct fw_iso_context *broadcast_rcv_context;
 	struct fw_iso_buffer broadcast_rcv_buffer;
 	void **broadcast_rcv_buffer_ptrs;
@@ -257,14 +190,12 @@ struct ipv4_priv {
 	u16 broadcast_xmt_datagramlabel;
 
 	/*
-	 * The csr address that remote nodes must send datagrams to for us to
+	 * The CSR address that remote nodes must send datagrams to for us to
 	 * receive them.
 	 */
 	struct fw_address_handler handler;
 	u64 local_fifo;
 
-	/* Wake up to xmt	 */
-        /* struct work_struct wake;*/
 	/* List of packets to be sent */
 	struct list_head packet_list;
 	/*
@@ -279,17 +210,17 @@ struct ipv4_priv {
 };
 
 /* This is our task struct. It's used for the packet complete callback.  */
-struct ipv4_packet_task {
+struct fwnet_packet_task {
 	/*
-	 * ptask can actually be on priv->packet_list, priv->broadcasted_list,
-	 * or priv->sent_list depending on its current state.
+	 * ptask can actually be on dev->packet_list, dev->broadcasted_list,
+	 * or dev->sent_list depending on its current state.
 	 */
-	struct list_head packet_list;
+	struct list_head pt_link;
 	struct fw_transaction transaction;
-	struct ipv4_hdr hdr;
+	struct rfc2734_header hdr;
 	struct sk_buff *skb;
-	struct ipv4_priv *priv;
-	enum ipv4_tx_type tx_type;
+	struct fwnet_device *dev;
+
 	int outstanding_pkts;
 	unsigned max_payload;
 	u64 fifo_addr;
@@ -298,243 +229,192 @@ struct ipv4_packet_task {
 	u8 speed;
 };
 
-static struct kmem_cache *ipv4_packet_task_cache;
-
-static const char ipv4_driver_name[] = "firewire-ipv4";
-
-static const struct ieee1394_device_id ipv4_id_table[] = {
-	{
-		.match_flags  = IEEE1394_MATCH_SPECIFIER_ID |
-				IEEE1394_MATCH_VERSION,
-		.specifier_id = IPV4_GASP_SPECIFIER_ID,
-		.version      = IPV4_GASP_VERSION,
-	},
-	{ }
-};
-
-static u32 ipv4_unit_directory_data[] = {
-	0x00040000,					/* unit directory */
-	0x12000000 | IPV4_GASP_SPECIFIER_ID,	/* specifier ID */
-	0x81000003,					/* text descriptor */
-	0x13000000 | IPV4_GASP_VERSION,		/* version */
-	0x81000005,					/* text descriptor */
-
-	0x00030000,					/* Three quadlets */
-	0x00000000,					/* Text */
-	0x00000000,					/* Language 0 */
-	0x49414e41,					/* I A N A */
-	0x00030000,					/* Three quadlets */
-	0x00000000,					/* Text */
-	0x00000000,					/* Language 0 */
-	0x49507634,					/* I P v 4 */
-};
-
-static struct fw_descriptor ipv4_unit_directory = {
-	.length = ARRAY_SIZE(ipv4_unit_directory_data),
-	.key = 0xd1000000,
-	.data = ipv4_unit_directory_data
-};
-
-static int ipv4_send_packet(struct ipv4_packet_task *ptask );
-
-/* ------------------------------------------------------------------ */
-/******************************************
- * HW Header net device functions
- ******************************************/
-  /* These functions have been adapted from net/ethernet/eth.c */
-
-/* Create a fake MAC header for an arbitrary protocol layer.
- * saddr=NULL means use device source address
- * daddr=NULL means leave destination address (eg unresolved arp). */
+/*
+ * saddr == NULL means use device source address.
+ * daddr == NULL means leave destination address (eg unresolved arp).
+ */
+static int fwnet_header_create(struct sk_buff *skb, struct net_device *net,
+			unsigned short type, const void *daddr,
+			const void *saddr, unsigned len)
+{
+	struct fwnet_header *h;
 
-static int ipv4_header ( struct sk_buff *skb, struct net_device *dev,
-		       unsigned short type, const void *daddr,
-		       const void *saddr, unsigned len) {
-	struct ipv4_ether_hdr *eth;
+	h = (struct fwnet_header *)skb_push(skb, sizeof(*h));
+	put_unaligned_be16(type, &h->h_proto);
 
-	eth = (struct ipv4_ether_hdr *)skb_push(skb, sizeof(*eth));
-	eth->h_proto = htons(type);
+	if (net->flags & (IFF_LOOPBACK | IFF_NOARP)) {
+		memset(h->h_dest, 0, net->addr_len);
 
-	if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) {
-		memset(eth->h_dest, 0, dev->addr_len);
-		return dev->hard_header_len;
+		return net->hard_header_len;
 	}
 
 	if (daddr) {
-		memcpy(eth->h_dest, daddr, dev->addr_len);
-		return dev->hard_header_len;
+		memcpy(h->h_dest, daddr, net->addr_len);
+
+		return net->hard_header_len;
 	}
 
-	return -dev->hard_header_len;
+	return -net->hard_header_len;
 }
 
-/* Rebuild the faked MAC header. This is called after an ARP
- * (or in future other address resolution) has completed on this
- * sk_buff. We now let ARP fill in the other fields.
- *
- * This routine CANNOT use cached dst->neigh!
- * Really, it is used only when dst->neigh is wrong.
- */
-
-static int ipv4_rebuild_header(struct sk_buff *skb)
+static int fwnet_header_rebuild(struct sk_buff *skb)
 {
-	struct ipv4_ether_hdr *eth;
+	struct fwnet_header *h = (struct fwnet_header *)skb->data;
 
-	eth = (struct ipv4_ether_hdr *)skb->data;
-	if (eth->h_proto == htons(ETH_P_IP))
-		return arp_find((unsigned char *)&eth->h_dest, skb);
+	if (get_unaligned_be16(&h->h_proto) == ETH_P_IP)
+		return arp_find((unsigned char *)&h->h_dest, skb);
 
-	fw_notify ( "%s: unable to resolve type %04x addresses\n",
-		   skb->dev->name,ntohs(eth->h_proto) );
+	fw_notify("%s: unable to resolve type %04x addresses\n",
+		  skb->dev->name, be16_to_cpu(h->h_proto));
 	return 0;
 }
 
-static int ipv4_header_cache(const struct neighbour *neigh, struct hh_cache *hh) {
-	unsigned short type = hh->hh_type;
-	struct net_device *dev;
-	struct ipv4_ether_hdr *eth;
+static int fwnet_header_cache(const struct neighbour *neigh,
+			      struct hh_cache *hh)
+{
+	struct net_device *net;
+	struct fwnet_header *h;
 
-	if (type == htons(ETH_P_802_3))
+	if (hh->hh_type == cpu_to_be16(ETH_P_802_3))
 		return -1;
-	dev = neigh->dev;
-	eth = (struct ipv4_ether_hdr *)((u8 *)hh->hh_data + 16 - sizeof(*eth));
-	eth->h_proto = type;
-	memcpy(eth->h_dest, neigh->ha, dev->addr_len);
+	net = neigh->dev;
+	h = (struct fwnet_header *)((u8 *)hh->hh_data + 16 - sizeof(*h));
+	h->h_proto = hh->hh_type;
+	memcpy(h->h_dest, neigh->ha, net->addr_len);
+	hh->hh_len = FWNET_HLEN;
 
-	hh->hh_len = IPV4_HLEN;
 	return 0;
 }
 
 /* Called by Address Resolution module to notify changes in address. */
-static void ipv4_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char * haddr ) {
-	memcpy((u8 *)hh->hh_data + 16 - IPV4_HLEN, haddr, dev->addr_len);
+static void fwnet_header_cache_update(struct hh_cache *hh,
+		const struct net_device *net, const unsigned char *haddr)
+{
+	memcpy((u8 *)hh->hh_data + 16 - FWNET_HLEN, haddr, net->addr_len);
 }
 
-static int ipv4_header_parse(const struct sk_buff *skb, unsigned char *haddr) {
-	memcpy(haddr, skb->dev->dev_addr, IPV4_ALEN);
-	return IPV4_ALEN;
+static int fwnet_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+	memcpy(haddr, skb->dev->dev_addr, FWNET_ALEN);
+
+	return FWNET_ALEN;
 }
 
-static const struct header_ops ipv4_header_ops = {
-	.create         = ipv4_header,
-	.rebuild        = ipv4_rebuild_header,
-	.cache		= ipv4_header_cache,
-	.cache_update	= ipv4_header_cache_update,
-	.parse          = ipv4_header_parse,
+static const struct header_ops fwnet_header_ops = {
+	.create         = fwnet_header_create,
+	.rebuild        = fwnet_header_rebuild,
+	.cache		= fwnet_header_cache,
+	.cache_update	= fwnet_header_cache_update,
+	.parse          = fwnet_header_parse,
 };
 
-/* ------------------------------------------------------------------ */
-
 /* FIXME: is this correct for all cases? */
-static bool ipv4_frag_overlap(struct ipv4_partial_datagram *pd, unsigned offset, unsigned len)
+static bool fwnet_frag_overlap(struct fwnet_partial_datagram *pd,
+			       unsigned offset, unsigned len)
 {
-        struct ipv4_fragment_info *fi;
+	struct fwnet_fragment_info *fi;
 	unsigned end = offset + len;
 
-	list_for_each_entry(fi, &pd->fragment_info, fragment_info) {
-		if (offset < fi->offset + fi->len && end > fi->offset) {
-			fw_debug ( "frag_overlap pd %p fi %p (%x@%x) with %x@%x\n", pd, fi, fi->len, fi->offset, len, offset );
+	list_for_each_entry(fi, &pd->fi_list, fi_link)
+		if (offset < fi->offset + fi->len && end > fi->offset)
 			return true;
-		}
-	}
-	fw_debug ( "frag_overlap %p does not overlap with %x@%x\n", pd, len, offset );
+
 	return false;
 }
 
 /* Assumes that new fragment does not overlap any existing fragments */
-static struct ipv4_fragment_info *ipv4_frag_new ( struct ipv4_partial_datagram *pd, unsigned offset, unsigned len ) {
-	struct ipv4_fragment_info *fi, *fi2, *new;
+static struct fwnet_fragment_info *fwnet_frag_new(
+	struct fwnet_partial_datagram *pd, unsigned offset, unsigned len)
+{
+	struct fwnet_fragment_info *fi, *fi2, *new;
 	struct list_head *list;
 
-	fw_debug ( "frag_new pd %p %x@%x\n", pd, len, offset );
-	list = &pd->fragment_info;
-	list_for_each_entry(fi, &pd->fragment_info, fragment_info) {
+	list = &pd->fi_list;
+	list_for_each_entry(fi, &pd->fi_list, fi_link) {
 		if (fi->offset + fi->len == offset) {
 			/* The new fragment can be tacked on to the end */
 			/* Did the new fragment plug a hole? */
-			fi2 = list_entry(fi->fragment_info.next, struct ipv4_fragment_info, fragment_info);
+			fi2 = list_entry(fi->fi_link.next,
+					 struct fwnet_fragment_info, fi_link);
 			if (fi->offset + fi->len == fi2->offset) {
-				fw_debug ( "pd %p: hole filling %p (%x@%x) and %p(%x@%x): now %x@%x\n", pd, fi, fi->len, fi->offset,
-				fi2, fi2->len, fi2->offset, fi->len + len + fi2->len, fi->offset );
 				/* glue fragments together */
 				fi->len += len + fi2->len;
-				list_del(&fi2->fragment_info);
+				list_del(&fi2->fi_link);
 				kfree(fi2);
 			} else {
-				fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, fi->len+len, fi->offset );
 				fi->len += len;
 			}
+
 			return fi;
 		}
 		if (offset + len == fi->offset) {
 			/* The new fragment can be tacked on to the beginning */
 			/* Did the new fragment plug a hole? */
-			fi2 = list_entry(fi->fragment_info.prev, struct ipv4_fragment_info, fragment_info);
+			fi2 = list_entry(fi->fi_link.prev,
+					 struct fwnet_fragment_info, fi_link);
 			if (fi2->offset + fi2->len == fi->offset) {
 				/* glue fragments together */
-				fw_debug ( "pd %p: extending %p and merging with %p from %x@%x to %x@%x\n",
- pd, fi2, fi, fi2->len, fi2->offset, fi2->len + fi->len + len, fi2->offset );
 				fi2->len += fi->len + len;
-				list_del(&fi->fragment_info);
+				list_del(&fi->fi_link);
 				kfree(fi);
+
 				return fi2;
 			}
-			fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, offset, fi->len + len );
 			fi->offset = offset;
 			fi->len += len;
+
 			return fi;
 		}
 		if (offset > fi->offset + fi->len) {
-			list = &fi->fragment_info;
+			list = &fi->fi_link;
 			break;
 		}
 		if (offset + len < fi->offset) {
-			list = fi->fragment_info.prev;
+			list = fi->fi_link.prev;
 			break;
 		}
 	}
 
 	new = kmalloc(sizeof(*new), GFP_ATOMIC);
 	if (!new) {
-		fw_error ( "out of memory in fragment handling!\n" );
+		fw_error("out of memory\n");
 		return NULL;
 	}
 
 	new->offset = offset;
 	new->len = len;
-	list_add(&new->fragment_info, list);
-	fw_debug ( "pd %p: new frag %p %x@%x\n", pd, new, new->len, new->offset );
-	list_for_each_entry( fi, &pd->fragment_info, fragment_info )
-		fw_debug ( "fi %p %x@%x\n", fi, fi->len, fi->offset );
+	list_add(&new->fi_link, list);
+
 	return new;
 }
 
-/* ------------------------------------------------------------------ */
-
-static struct ipv4_partial_datagram *ipv4_pd_new(struct net_device *netdev,
- struct ipv4_node *node, u16 datagram_label, unsigned dg_size, u32 *frag_buf,
- unsigned frag_off, unsigned frag_len) {
-	struct ipv4_partial_datagram *new;
-	struct ipv4_fragment_info *fi;
+static struct fwnet_partial_datagram *fwnet_pd_new(struct net_device *net,
+		struct fwnet_peer *peer, u16 datagram_label, unsigned dg_size,
+		void *frag_buf, unsigned frag_off, unsigned frag_len)
+{
+	struct fwnet_partial_datagram *new;
+	struct fwnet_fragment_info *fi;
 
 	new = kmalloc(sizeof(*new), GFP_ATOMIC);
 	if (!new)
 		goto fail;
-	INIT_LIST_HEAD(&new->fragment_info);
-	fi = ipv4_frag_new ( new, frag_off, frag_len);
-	if ( fi == NULL )
+
+	INIT_LIST_HEAD(&new->fi_list);
+	fi = fwnet_frag_new(new, frag_off, frag_len);
+	if (fi == NULL)
 		goto fail_w_new;
+
 	new->datagram_label = datagram_label;
 	new->datagram_size = dg_size;
-	new->skb = dev_alloc_skb(dg_size + netdev->hard_header_len + 15);
-	if ( new->skb == NULL )
+	new->skb = dev_alloc_skb(dg_size + net->hard_header_len + 15);
+	if (new->skb == NULL)
 		goto fail_w_fi;
-	skb_reserve(new->skb, (netdev->hard_header_len + 15) & ~15);
+
+	skb_reserve(new->skb, (net->hard_header_len + 15) & ~15);
 	new->pbuf = skb_put(new->skb, dg_size);
 	memcpy(new->pbuf + frag_off, frag_buf, frag_len);
-	list_add_tail(&new->pdg_list, &node->pdg_list);
-	fw_debug ( "pd_new: new pd %p { dgl %u, dg_size %u, skb %p, pbuf %p } on node %p\n",
- new, new->datagram_label, new->datagram_size, new->skb, new->pbuf, node );
+	list_add_tail(&new->pd_link, &peer->pd_list);
+
 	return new;
 
 fail_w_fi:
@@ -542,174 +422,171 @@ fail_w_fi:
 fail_w_new:
 	kfree(new);
 fail:
-	fw_error("ipv4_pd_new: no memory\n");
+	fw_error("out of memory\n");
+
 	return NULL;
 }
 
-static struct ipv4_partial_datagram *ipv4_pd_find(struct ipv4_node *node, u16 datagram_label) {
-	struct ipv4_partial_datagram *pd;
+static struct fwnet_partial_datagram *fwnet_pd_find(struct fwnet_peer *peer,
+						    u16 datagram_label)
+{
+	struct fwnet_partial_datagram *pd;
 
-	list_for_each_entry(pd, &node->pdg_list, pdg_list) {
-	        if ( pd->datagram_label == datagram_label ) {
-			fw_debug ( "pd_find(node %p, label %u): pd %p\n", node, datagram_label, pd );
+	list_for_each_entry(pd, &peer->pd_list, pd_link)
+		if (pd->datagram_label == datagram_label)
 			return pd;
-		}
-	}
-	fw_debug ( "pd_find(node %p, label %u) no entry\n", node, datagram_label );
+
 	return NULL;
 }
 
 
-static void ipv4_pd_delete ( struct ipv4_partial_datagram *old ) {
-	struct ipv4_fragment_info *fi, *n;
+static void fwnet_pd_delete(struct fwnet_partial_datagram *old)
+{
+	struct fwnet_fragment_info *fi, *n;
 
-	fw_debug ( "pd_delete %p\n", old );
-	list_for_each_entry_safe(fi, n, &old->fragment_info, fragment_info) {
-		fw_debug ( "Freeing fi %p\n", fi );
+	list_for_each_entry_safe(fi, n, &old->fi_list, fi_link)
 		kfree(fi);
-	}
-	list_del(&old->pdg_list);
+
+	list_del(&old->pd_link);
 	dev_kfree_skb_any(old->skb);
 	kfree(old);
 }
 
-static bool ipv4_pd_update ( struct ipv4_node *node, struct ipv4_partial_datagram *pd,
- u32 *frag_buf, unsigned frag_off, unsigned frag_len) {
-	fw_debug ( "pd_update node %p, pd %p, frag_buf %p, %x@%x\n", node, pd, frag_buf, frag_len, frag_off );
-	if ( ipv4_frag_new ( pd, frag_off, frag_len ) == NULL)
+static bool fwnet_pd_update(struct fwnet_peer *peer,
+		struct fwnet_partial_datagram *pd, void *frag_buf,
+		unsigned frag_off, unsigned frag_len)
+{
+	if (fwnet_frag_new(pd, frag_off, frag_len) == NULL)
 		return false;
+
 	memcpy(pd->pbuf + frag_off, frag_buf, frag_len);
 
 	/*
 	 * Move list entry to beginnig of list so that oldest partial
 	 * datagrams percolate to the end of the list
 	 */
-	list_move_tail(&pd->pdg_list, &node->pdg_list);
-	fw_debug ( "New pd list:\n" );
-	list_for_each_entry ( pd, &node->pdg_list, pdg_list ) {
-		fw_debug ( "pd %p\n", pd );
-	}
+	list_move_tail(&pd->pd_link, &peer->pd_list);
+
 	return true;
 }
 
-static bool ipv4_pd_is_complete ( struct ipv4_partial_datagram *pd ) {
-	struct ipv4_fragment_info *fi;
-	bool ret;
+static bool fwnet_pd_is_complete(struct fwnet_partial_datagram *pd)
+{
+	struct fwnet_fragment_info *fi;
 
-	fi = list_entry(pd->fragment_info.next, struct ipv4_fragment_info, fragment_info);
+	fi = list_entry(pd->fi_list.next, struct fwnet_fragment_info, fi_link);
 
-	ret = (fi->len == pd->datagram_size);
-	fw_debug ( "pd_is_complete (pd %p, dgs %x): fi %p (%x@%x) %s\n", pd, pd->datagram_size, fi, fi->len, fi->offset, ret ? "yes" : "no" );
-	return ret;
+	return fi->len == pd->datagram_size;
 }
 
-/* ------------------------------------------------------------------ */
+static int fwnet_peer_new(struct fw_card *card, struct fw_device *device)
+{
+	struct fwnet_peer *peer;
 
-static int ipv4_node_new ( struct fw_card *card, struct fw_device *device ) {
-	struct ipv4_node *node;
+	peer = kmalloc(sizeof(*peer), GFP_KERNEL);
+	if (!peer) {
+		fw_error("out of memory\n");
 
-	node = kmalloc ( sizeof(*node), GFP_KERNEL );
-	if ( ! node ) {
-		fw_error ( "allocate new node failed\n" );
 		return -ENOMEM;
 	}
-	node->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
-	node->fifo = INVALID_FIFO_ADDR;
-	INIT_LIST_HEAD(&node->pdg_list);
-	spin_lock_init(&node->pdg_lock);
-	node->pdg_size = 0;
-	node->generation = device->generation;
+	peer->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+	peer->fifo = FWNET_NO_FIFO_ADDR;
+	INIT_LIST_HEAD(&peer->pd_list);
+	spin_lock_init(&peer->pdg_lock);
+	peer->pdg_size = 0;
+	peer->generation = device->generation;
 	rmb();
-	node->nodeid = device->node_id;
+	peer->node_id = device->node_id;
 	 /* FIXME what should it really be? */
-	node->max_payload = S100_BUFFER_SIZE - IPV4_UNFRAG_HDR_SIZE;
-	node->datagram_label = 0U;
-	node->xmt_speed = device->max_speed;
-	list_add_tail ( &node->ipv4_nodes, &card->ipv4_nodes );
-	fw_debug ( "node_new: %p { guid %016llx, generation %u, nodeid %x, max_payload %x, xmt_speed %x } added\n",
- node, (unsigned long long)node->guid, node->generation, node->nodeid, node->max_payload, node->xmt_speed );
+	peer->max_payload = IEEE1394_MAX_PAYLOAD_S100 - RFC2374_UNFRAG_HDR_SIZE;
+	peer->datagram_label = 0U;
+	peer->xmt_speed = device->max_speed;
+	list_add_tail(&peer->peer_link, &card->peer_list);
+
 	return 0;
 }
 
-static struct ipv4_node *ipv4_node_find_by_guid(struct ipv4_priv *priv, u64 guid) {
-	struct ipv4_node *node;
+/* FIXME caller must take the lock, or peer needs to be reference-counted */
+static struct fwnet_peer *fwnet_peer_find_by_guid(struct fwnet_device *dev,
+						  u64 guid)
+{
+	struct fwnet_peer *p, *peer = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&priv->lock, flags);
-	list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes)
-		if (node->guid == guid) {
-			/* FIXME: lock the node first? */
-			spin_unlock_irqrestore ( &priv->lock, flags );
-			fw_debug ( "node_find_by_guid (%016llx) found %p\n", (unsigned long long)guid, node );
-			return node;
+	spin_lock_irqsave(&dev->lock, flags);
+	list_for_each_entry(p, &dev->card->peer_list, peer_link)
+		if (p->guid == guid) {
+			peer = p;
+			break;
 		}
+	spin_unlock_irqrestore(&dev->lock, flags);
 
-	spin_unlock_irqrestore ( &priv->lock, flags );
-	fw_debug ( "node_find_by_guid (%016llx) not found\n", (unsigned long long)guid );
-	return NULL;
+	return peer;
 }
 
-static struct ipv4_node *ipv4_node_find_by_nodeid(struct ipv4_priv *priv, u16 nodeid) {
-	struct ipv4_node *node;
+/* FIXME caller must take the lock, or peer needs to be reference-counted */
+/* FIXME node_id doesn't mean anything without generation */
+static struct fwnet_peer *fwnet_peer_find_by_node_id(struct fwnet_device *dev,
+						     u16 node_id)
+{
+	struct fwnet_peer *p, *peer = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&priv->lock, flags);
-	list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes)
-		if (node->nodeid == nodeid) {
-			/* FIXME: lock the node first? */
-			spin_unlock_irqrestore ( &priv->lock, flags );
-			fw_debug ( "node_find_by_nodeid (%x) found %p\n", nodeid, node );
-			return node;
+	spin_lock_irqsave(&dev->lock, flags);
+	list_for_each_entry(p, &dev->card->peer_list, peer_link)
+		if (p->node_id == node_id) {
+			peer = p;
+			break;
 		}
-	fw_debug ( "node_find_by_nodeid (%x) not found\n", nodeid );
-	spin_unlock_irqrestore ( &priv->lock, flags );
-	return NULL;
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	return peer;
 }
 
-/* This is only complicated because we can't assume priv exists */
-static void ipv4_node_delete ( struct fw_card *card, struct fw_device *device ) {
-	struct net_device *netdev;
-	struct ipv4_priv *priv;
-	struct ipv4_node *node;
+/* FIXME */
+static void fwnet_peer_delete(struct fw_card *card, struct fw_device *device)
+{
+	struct net_device *net;
+	struct fwnet_device *dev;
+	struct fwnet_peer *peer;
 	u64 guid;
 	unsigned long flags;
-	struct ipv4_partial_datagram *pd, *pd_next;
+	struct fwnet_partial_datagram *pd, *pd_next;
 
 	guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
-	netdev = card->netdev;
-	if ( netdev )
-		priv = netdev_priv ( netdev );
+	net = card->netdev;
+	if (net)
+		dev = netdev_priv(net);
 	else
-		priv = NULL;
-	if ( priv )
-		spin_lock_irqsave ( &priv->lock, flags );
-	list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) {
-		if ( node->guid == guid ) {
-			list_del ( &node->ipv4_nodes );
-			list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list )
-				ipv4_pd_delete ( pd );
+		dev = NULL;
+	if (dev)
+		spin_lock_irqsave(&dev->lock, flags);
+
+	list_for_each_entry(peer, &card->peer_list, peer_link) {
+		if (peer->guid == guid) {
+			list_del(&peer->peer_link);
+			list_for_each_entry_safe(pd, pd_next, &peer->pd_list,
+						 pd_link)
+				fwnet_pd_delete(pd);
 			break;
 		}
 	}
-	if ( priv )
-		spin_unlock_irqrestore ( &priv->lock, flags );
+	if (dev)
+		spin_unlock_irqrestore(&dev->lock, flags);
 }
 
-/* ------------------------------------------------------------------ */
-
-
-static int ipv4_finish_incoming_packet ( struct net_device *netdev,
- struct sk_buff *skb, u16 source_node_id, bool is_broadcast, u16 ether_type ) {
-	struct ipv4_priv *priv;
-	static u64 broadcast_hw = ~0ULL;
+static int fwnet_finish_incoming_packet(struct net_device *net,
+					struct sk_buff *skb, u16 source_node_id,
+					bool is_broadcast, u16 ether_type)
+{
+	struct fwnet_device *dev;
+	static const __be64 broadcast_hw = cpu_to_be64(~0ULL);
 	int status;
-	u64 guid;
+	__be64 guid;
 
-	fw_debug ( "ipv4_finish_incoming_packet(%p, %p, %x, %s, %x\n",
- netdev, skb, source_node_id, is_broadcast ? "true" : "false", ether_type );
-	priv = netdev_priv(netdev);
+	dev = netdev_priv(net);
 	/* Write metadata, and then pass to the receive level */
-	skb->dev = netdev;
+	skb->dev = net;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;  /* don't check it */
 
 	/*
@@ -724,73 +601,75 @@ static int ipv4_finish_incoming_packet ( struct net_device *netdev,
 	 * about the sending machine.
 	 */
 	if (ether_type == ETH_P_ARP) {
-		struct ipv4_arp *arp1394;
+		struct rfc2734_arp *arp1394;
 		struct arphdr *arp;
 		unsigned char *arp_ptr;
 		u64 fifo_addr;
+		u64 peer_guid;
 		u8 max_rec;
 		u8 sspd;
 		u16 max_payload;
-		struct ipv4_node *node;
-		static const u16 ipv4_speed_to_max_payload[] = {
+		struct fwnet_peer *peer;
+		static const u16 fwnet_speed_to_max_payload[] = {
 			/* S100, S200, S400, S800, S1600, S3200 */
 			    512, 1024, 2048, 4096,  4096,  4096
 		};
 
-		/* fw_debug ( "ARP packet\n" ); */
-		arp1394 = (struct ipv4_arp *)skb->data;
+		arp1394 = (struct rfc2734_arp *)skb->data;
 		arp = (struct arphdr *)skb->data;
 		arp_ptr = (unsigned char *)(arp + 1);
-		fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32 |
- ntohl(arp1394->fifo_lo);
-		max_rec = priv->card->max_receive;
-		if ( arp1394->max_rec < max_rec )
+		fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32
+				| ntohl(arp1394->fifo_lo);
+		max_rec = dev->card->max_receive;
+		if (arp1394->max_rec < max_rec)
 			max_rec = arp1394->max_rec;
 		sspd = arp1394->sspd;
-		/*
-		 * Sanity check. MacOSX seems to be sending us 131 in this
-		 * field (atleast on my Panther G5). Not sure why.
-		 */
-		if (sspd > 5 ) {
-			fw_notify ( "sspd %x out of range\n", sspd );
+		/* Sanity check.  OS X 10.3 PPC reportedly sends 131. */
+		if (sspd > SCODE_3200) {
+			fw_notify("sspd %x out of range\n", sspd);
 			sspd = 0;
 		}
 
-		max_payload = min(ipv4_speed_to_max_payload[sspd],
- (u16)(1 << (max_rec + 1))) - IPV4_UNFRAG_HDR_SIZE;
+		max_payload = min(fwnet_speed_to_max_payload[sspd],
+			(u16)(1 << (max_rec + 1))) - RFC2374_UNFRAG_HDR_SIZE;
 
-		guid = be64_to_cpu(get_unaligned(&arp1394->s_uniq_id));
-		node = ipv4_node_find_by_guid(priv, guid);
-		if (!node) {
-			fw_notify ( "No node for ARP packet from %llx\n", guid );
+		peer_guid = get_unaligned_be64(&arp1394->s_uniq_id);
+		peer = fwnet_peer_find_by_guid(dev, peer_guid);
+		if (!peer) {
+			fw_notify("No peer for ARP packet from %016llx\n",
+				  (unsigned long long)peer_guid);
 			goto failed_proto;
 		}
-		if ( node->nodeid != source_node_id || node->generation != priv->card->generation ) {
-			fw_notify ( "Internal error: node->nodeid (%x) != soucre_node_id (%x) or node->generation (%x) != priv->card->generation(%x)\n",
- node->nodeid, source_node_id, node->generation, priv->card->generation );
-			node->nodeid = source_node_id;
-			node->generation = priv->card->generation;
+
+		/* FIXME don't use card->generation */
+		if (peer->node_id != source_node_id ||
+		    peer->generation != dev->card->generation) {
+			fw_notify("Internal error: peer->node_id (%x) != "
+				  "source_node_id (%x) or peer->generation (%x)"
+				  " != dev->card->generation(%x)\n",
+				  peer->node_id, source_node_id,
+				  peer->generation, dev->card->generation);
+			peer->node_id = source_node_id;
+			peer->generation = dev->card->generation;
 		}
 
 		/* FIXME: for debugging */
-		if ( sspd > SCODE_400 )
+		if (sspd > SCODE_400)
 			sspd = SCODE_400;
 		/* Update our speed/payload/fifo_offset table */
 		/*
 		 * FIXME: this does not handle cases where two high-speed endpoints must use a slower speed because of
 		 * a lower speed hub between them.  We need to look at the actual topology map here.
 		 */
-		fw_debug ( "Setting node %p fifo %llx (was %llx), max_payload %x (was %x), speed %x (was %x)\n",
- node, fifo_addr, node->fifo, max_payload, node->max_payload, sspd, node->xmt_speed );
-		node->fifo =	fifo_addr;
-		node->max_payload = max_payload;
+		peer->fifo = fifo_addr;
+		peer->max_payload = max_payload;
 		/*
 		 * Only allow speeds to go down from their initial value.
-		 * Otherwise a local node that can only do S400 or slower may
-		 * be told to transmit at S800 to a faster remote node.
+		 * Otherwise a local peer that can only do S400 or slower may
+		 * be told to transmit at S800 to a faster remote peer.
 		 */
-		if ( node->xmt_speed > sspd )
-			node->xmt_speed = sspd;
+		if (peer->xmt_speed > sspd)
+			peer->xmt_speed = sspd;
 
 		/*
 		 * Now that we're done with the 1394 specific stuff, we'll
@@ -805,248 +684,257 @@ static int ipv4_finish_incoming_packet ( struct net_device *netdev,
 		 */
 
 		arp->ar_hln = 8;
-		arp_ptr += arp->ar_hln;		/* skip over sender unique id */
-		*(u32 *)arp_ptr = arp1394->sip; /* move sender IP addr */
-		arp_ptr += arp->ar_pln;		/* skip over sender IP addr */
+		/* skip over sender unique id */
+		arp_ptr += arp->ar_hln;
+		/* move sender IP addr */
+		put_unaligned(arp1394->sip, (u32 *)arp_ptr);
+		/* skip over sender IP addr */
+		arp_ptr += arp->ar_pln;
 
 		if (arp->ar_op == htons(ARPOP_REQUEST))
 			memset(arp_ptr, 0, sizeof(u64));
 		else
-			memcpy(arp_ptr, netdev->dev_addr, sizeof(u64));
+			memcpy(arp_ptr, net->dev_addr, sizeof(u64));
 	}
 
 	/* Now add the ethernet header. */
-	guid = cpu_to_be64(priv->card->guid);
-	if (dev_hard_header(skb, netdev, ether_type, is_broadcast ? &broadcast_hw : &guid, NULL,
- skb->len) >= 0) {
-		struct ipv4_ether_hdr *eth;
+	guid = cpu_to_be64(dev->card->guid);
+	if (dev_hard_header(skb, net, ether_type,
+			   is_broadcast ? &broadcast_hw : &guid,
+			   NULL, skb->len) >= 0) {
+		struct fwnet_header *eth;
 		u16 *rawp;
 		__be16 protocol;
 
 		skb_reset_mac_header(skb);
 		skb_pull(skb, sizeof(*eth));
-		eth = ipv4_ether_hdr(skb);
+		eth = (struct fwnet_header *)skb_mac_header(skb);
 		if (*eth->h_dest & 1) {
-			if (memcmp(eth->h_dest, netdev->broadcast, netdev->addr_len) == 0) {
-				fw_debug ( "Broadcast\n" );
+			if (memcmp(eth->h_dest, net->broadcast,
+				   net->addr_len) == 0)
 				skb->pkt_type = PACKET_BROADCAST;
-			}
 #if 0
 			else
 				skb->pkt_type = PACKET_MULTICAST;
 #endif
 		} else {
-			if (memcmp(eth->h_dest, netdev->dev_addr, netdev->addr_len)) {
+			if (memcmp(eth->h_dest, net->dev_addr, net->addr_len)) {
 				u64 a1, a2;
 
-				memcpy ( &a1, eth->h_dest, sizeof(u64));
-				memcpy ( &a2, netdev->dev_addr, sizeof(u64));
-				fw_debug ( "Otherhost %llx %llx %x\n", a1, a2, netdev->addr_len );
+				memcpy(&a1, eth->h_dest, sizeof(u64));
+				memcpy(&a2, net->dev_addr, sizeof(u64));
 				skb->pkt_type = PACKET_OTHERHOST;
 			}
 		}
 		if (ntohs(eth->h_proto) >= 1536) {
-			fw_debug ( " proto %x %x\n", eth->h_proto, ntohs(eth->h_proto) );
 			protocol = eth->h_proto;
 		} else {
 			rawp = (u16 *)skb->data;
-			if (*rawp == 0xFFFF) {
-				fw_debug ( "proto 802_3\n" );
+			if (*rawp == 0xffff)
 				protocol = htons(ETH_P_802_3);
-			} else {
-				fw_debug ( "proto 802_2\n" );
+			else
 				protocol = htons(ETH_P_802_2);
-			}
 		}
 		skb->protocol = protocol;
 	}
 	status = netif_rx(skb);
-	if ( status == NET_RX_DROP) {
-		netdev->stats.rx_errors++;
-		netdev->stats.rx_dropped++;
+	if (status == NET_RX_DROP) {
+		net->stats.rx_errors++;
+		net->stats.rx_dropped++;
 	} else {
-		netdev->stats.rx_packets++;
-		netdev->stats.rx_bytes += skb->len;
+		net->stats.rx_packets++;
+		net->stats.rx_bytes += skb->len;
 	}
-	if (netif_queue_stopped(netdev))
-		netif_wake_queue(netdev);
+	if (netif_queue_stopped(net))
+		netif_wake_queue(net);
+
 	return 0;
 
  failed_proto:
-	netdev->stats.rx_errors++;
-	netdev->stats.rx_dropped++;
+	net->stats.rx_errors++;
+	net->stats.rx_dropped++;
+
 	dev_kfree_skb_any(skb);
-	if (netif_queue_stopped(netdev))
-		netif_wake_queue(netdev);
-	netdev->last_rx = jiffies;
+	if (netif_queue_stopped(net))
+		netif_wake_queue(net);
+
+	net->last_rx = jiffies;
+
 	return 0;
 }
 
-/* ------------------------------------------------------------------ */
-
-static int ipv4_incoming_packet ( struct ipv4_priv *priv, u32 *buf, int len, u16 source_node_id, bool is_broadcast ) {
+static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
+				 u16 source_node_id, bool is_broadcast)
+{
 	struct sk_buff *skb;
-	struct net_device *netdev;
-	struct ipv4_hdr hdr;
+	struct net_device *net;
+	struct rfc2734_header hdr;
 	unsigned lf;
 	unsigned long flags;
-	struct ipv4_node *node;
-	struct ipv4_partial_datagram *pd;
+	struct fwnet_peer *peer;
+	struct fwnet_partial_datagram *pd;
 	int fg_off;
 	int dg_size;
 	u16 datagram_label;
 	int retval;
 	u16 ether_type;
 
-	fw_debug ( "ipv4_incoming_packet(%p, %p, %d, %x, %s)\n", priv, buf, len, source_node_id, is_broadcast ? "true" : "false" );
-	netdev = priv->card->netdev;
+	net = dev->card->netdev;
 
-	hdr.w0 = ntohl(buf[0]);
-	lf = ipv4_get_hdr_lf(&hdr);
-	if ( lf == IPV4_HDR_UNFRAG ) {
+	hdr.w0 = be32_to_cpu(buf[0]);
+	lf = fwnet_get_hdr_lf(&hdr);
+	if (lf == RFC2374_HDR_UNFRAG) {
 		/*
 		 * An unfragmented datagram has been received by the ieee1394
 		 * bus. Build an skbuff around it so we can pass it to the
 		 * high level network layer.
 		 */
-		ether_type = ipv4_get_hdr_ether_type(&hdr);
-		fw_debug ( "header w0 = %x, lf = %x, ether_type = %x\n", hdr.w0, lf, ether_type );
+		ether_type = fwnet_get_hdr_ether_type(&hdr);
 		buf++;
-		len -= IPV4_UNFRAG_HDR_SIZE;
+		len -= RFC2374_UNFRAG_HDR_SIZE;
 
-		skb = dev_alloc_skb(len + netdev->hard_header_len + 15);
+		skb = dev_alloc_skb(len + net->hard_header_len + 15);
 		if (unlikely(!skb)) {
-			fw_error ( "Out of memory for incoming packet\n");
-			netdev->stats.rx_dropped++;
+			fw_error("out of memory\n");
+			net->stats.rx_dropped++;
+
 			return -1;
 		}
-		skb_reserve(skb, (netdev->hard_header_len + 15) & ~15);
-		memcpy(skb_put(skb, len), buf, len );
-		return ipv4_finish_incoming_packet(netdev, skb, source_node_id, is_broadcast, ether_type );
+		skb_reserve(skb, (net->hard_header_len + 15) & ~15);
+		memcpy(skb_put(skb, len), buf, len);
+
+		return fwnet_finish_incoming_packet(net, skb, source_node_id,
+						    is_broadcast, ether_type);
 	}
 	/* A datagram fragment has been received, now the fun begins. */
 	hdr.w1 = ntohl(buf[1]);
-	buf +=2;
-	len -= IPV4_FRAG_HDR_SIZE;
-	if ( lf ==IPV4_HDR_FIRSTFRAG ) {
-		ether_type = ipv4_get_hdr_ether_type(&hdr);
+	buf += 2;
+	len -= RFC2374_FRAG_HDR_SIZE;
+	if (lf == RFC2374_HDR_FIRSTFRAG) {
+		ether_type = fwnet_get_hdr_ether_type(&hdr);
 		fg_off = 0;
 	} else {
-		fg_off = ipv4_get_hdr_fg_off(&hdr);
-		ether_type = 0; /* Shut up compiler! */
+		ether_type = 0;
+		fg_off = fwnet_get_hdr_fg_off(&hdr);
 	}
-	datagram_label = ipv4_get_hdr_dgl(&hdr);
-	dg_size = ipv4_get_hdr_dg_size(&hdr); /* ??? + 1 */
-	fw_debug ( "fragmented: %x.%x = lf %x, ether_type %x, fg_off %x, dgl %x, dg_size %x\n", hdr.w0, hdr.w1, lf, ether_type, fg_off, datagram_label, dg_size );
-	node = ipv4_node_find_by_nodeid ( priv, source_node_id);
-	spin_lock_irqsave(&node->pdg_lock, flags);
-	pd = ipv4_pd_find( node, datagram_label );
+	datagram_label = fwnet_get_hdr_dgl(&hdr);
+	dg_size = fwnet_get_hdr_dg_size(&hdr); /* ??? + 1 */
+	peer = fwnet_peer_find_by_node_id(dev, source_node_id);
+
+	spin_lock_irqsave(&peer->pdg_lock, flags);
+
+	pd = fwnet_pd_find(peer, datagram_label);
 	if (pd == NULL) {
-		while ( node->pdg_size >= ipv4_mpd ) {
+		while (peer->pdg_size >= FWNET_MAX_FRAGMENTS) {
 			/* remove the oldest */
-			ipv4_pd_delete ( list_first_entry(&node->pdg_list, struct ipv4_partial_datagram, pdg_list) );
-			node->pdg_size--;
+			fwnet_pd_delete(list_first_entry(&peer->pd_list,
+				struct fwnet_partial_datagram, pd_link));
+			peer->pdg_size--;
 		}
-		pd = ipv4_pd_new ( netdev, node, datagram_label, dg_size,
- buf, fg_off, len);
-		if ( pd == NULL) {
+		pd = fwnet_pd_new(net, peer, datagram_label,
+				  dg_size, buf, fg_off, len);
+		if (pd == NULL) {
 			retval = -ENOMEM;
 			goto bad_proto;
 		}
-		node->pdg_size++;
+		peer->pdg_size++;
 	} else {
-		if (ipv4_frag_overlap(pd, fg_off, len) || pd->datagram_size != dg_size) {
+		if (fwnet_frag_overlap(pd, fg_off, len) ||
+		    pd->datagram_size != dg_size) {
 			/*
 			 * Differing datagram sizes or overlapping fragments,
-			 * Either way the remote machine is playing silly buggers
-			 * with us: obliterate the old datagram and start a new one.
+			 * discard old datagram and start a new one.
 			 */
-			ipv4_pd_delete ( pd );
-			pd = ipv4_pd_new ( netdev, node, datagram_label,
- dg_size, buf, fg_off, len);
-			if ( pd == NULL ) {
+			fwnet_pd_delete(pd);
+			pd = fwnet_pd_new(net, peer, datagram_label,
+					  dg_size, buf, fg_off, len);
+			if (pd == NULL) {
 				retval = -ENOMEM;
-				node->pdg_size--;
+				peer->pdg_size--;
 				goto bad_proto;
 			}
 		} else {
-			bool worked;
-
-			worked = ipv4_pd_update ( node, pd,
- buf, fg_off, len );
-			if ( ! worked ) {
+			if (!fwnet_pd_update(peer, pd, buf, fg_off, len)) {
 				/*
 				 * Couldn't save off fragment anyway
 				 * so might as well obliterate the
 				 * datagram now.
 				 */
-				ipv4_pd_delete ( pd );
-				node->pdg_size--;
+				fwnet_pd_delete(pd);
+				peer->pdg_size--;
 				goto bad_proto;
 			}
 		}
 	} /* new datagram or add to existing one */
 
-	if ( lf == IPV4_HDR_FIRSTFRAG )
+	if (lf == RFC2374_HDR_FIRSTFRAG)
 		pd->ether_type = ether_type;
-	if ( ipv4_pd_is_complete ( pd ) ) {
+
+	if (fwnet_pd_is_complete(pd)) {
 		ether_type = pd->ether_type;
-		node->pdg_size--;
+		peer->pdg_size--;
 		skb = skb_get(pd->skb);
-		ipv4_pd_delete ( pd );
-		spin_unlock_irqrestore(&node->pdg_lock, flags);
-		return ipv4_finish_incoming_packet ( netdev, skb, source_node_id, false, ether_type );
+		fwnet_pd_delete(pd);
+
+		spin_unlock_irqrestore(&peer->pdg_lock, flags);
+
+		return fwnet_finish_incoming_packet(net, skb, source_node_id,
+						    false, ether_type);
 	}
 	/*
 	 * Datagram is not complete, we're done for the
 	 * moment.
 	 */
-	spin_unlock_irqrestore(&node->pdg_lock, flags);
+	spin_unlock_irqrestore(&peer->pdg_lock, flags);
+
 	return 0;
 
  bad_proto:
-	spin_unlock_irqrestore(&node->pdg_lock, flags);
-	if (netif_queue_stopped(netdev))
-		netif_wake_queue(netdev);
+	spin_unlock_irqrestore(&peer->pdg_lock, flags);
+
+	if (netif_queue_stopped(net))
+		netif_wake_queue(net);
+
 	return 0;
 }
 
-static void ipv4_receive_packet ( struct fw_card *card, struct fw_request *r,
- int tcode, int destination, int source, int generation, int speed,
- unsigned long long offset, void *payload, size_t length, void *callback_data ) {
-	struct ipv4_priv *priv;
+static void fwnet_receive_packet(struct fw_card *card, struct fw_request *r,
+		int tcode, int destination, int source, int generation,
+		int speed, unsigned long long offset, void *payload,
+		size_t length, void *callback_data)
+{
+	struct fwnet_device *dev;
 	int status;
 
-	fw_debug ( "ipv4_receive_packet(%p,%p,%x,%x,%x,%x,%x,%llx,%p,%lx,%p)\n",
- card, r, tcode, destination, source, generation, speed, offset, payload,
- (unsigned long)length, callback_data);
-	print_hex_dump ( KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, payload, length, false );
-	priv = callback_data;
-	if (   tcode != TCODE_WRITE_BLOCK_REQUEST
-	    || destination != card->node_id
-	    || generation != card->generation
-	    || offset != priv->handler.offset ) {
+	dev = callback_data;
+	if (tcode != TCODE_WRITE_BLOCK_REQUEST
+	    || destination != card->node_id	/* <- FIXME */
+	    || generation != card->generation	/* <- FIXME */
+	    || offset != dev->handler.offset) {
 		fw_send_response(card, r, RCODE_CONFLICT_ERROR);
-		fw_debug("Conflict error card node_id=%x, card generation=%x, local offset %llx\n",
- card->node_id, card->generation, (unsigned long long)priv->handler.offset );
+
 		return;
 	}
-	status = ipv4_incoming_packet ( priv, payload, length, source, false );
-	if ( status != 0 ) {
-		fw_error ( "Incoming packet failure\n" );
-		fw_send_response ( card, r, RCODE_CONFLICT_ERROR );
+
+	status = fwnet_incoming_packet(dev, payload, length, source, false);
+	if (status != 0) {
+		fw_error("Incoming packet failure\n");
+		fw_send_response(card, r, RCODE_CONFLICT_ERROR);
+
 		return;
 	}
-	fw_send_response ( card, r, RCODE_COMPLETE );
+
+	fw_send_response(card, r, RCODE_COMPLETE);
 }
 
-static void ipv4_receive_broadcast(struct fw_iso_context *context, u32 cycle,
- size_t header_length, void *header, void *data) {
-	struct ipv4_priv *priv;
+static void fwnet_receive_broadcast(struct fw_iso_context *context,
+		u32 cycle, size_t header_length, void *header, void *data)
+{
+	struct fwnet_device *dev;
 	struct fw_iso_packet packet;
 	struct fw_card *card;
-	u16 *hdr_ptr;
-	u32 *buf_ptr;
+	__be16 *hdr_ptr;
+	__be32 *buf_ptr;
 	int retval;
 	u32 length;
 	u16 source_node_id;
@@ -1055,70 +943,68 @@ static void ipv4_receive_broadcast(struct fw_iso_context *context, u32 cycle,
 	unsigned long offset;
 	unsigned long flags;
 
-	fw_debug ( "ipv4_receive_broadcast ( context=%p, cycle=%x, header_length=%lx, header=%p, data=%p )\n", context, cycle, (unsigned long)header_length, header, data );
-	print_hex_dump ( KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, header, header_length, false );
-	priv = data;
-	card = priv->card;
+	dev = data;
+	card = dev->card;
 	hdr_ptr = header;
-	length = ntohs(hdr_ptr[0]);
-	spin_lock_irqsave(&priv->lock,flags);
-	offset = priv->rcv_buffer_size * priv->broadcast_rcv_next_ptr;
-	buf_ptr = priv->broadcast_rcv_buffer_ptrs[priv->broadcast_rcv_next_ptr++];
-	if ( priv->broadcast_rcv_next_ptr == priv->num_broadcast_rcv_ptrs )
-		priv->broadcast_rcv_next_ptr = 0;
-	spin_unlock_irqrestore(&priv->lock,flags);
-	fw_debug ( "length %u at %p\n", length, buf_ptr );
-	print_hex_dump ( KERN_DEBUG, "buffer: ", DUMP_PREFIX_OFFSET, 32, 1, buf_ptr, length, false );
+	length = be16_to_cpup(hdr_ptr);
+
+	spin_lock_irqsave(&dev->lock, flags);
+
+	offset = dev->rcv_buffer_size * dev->broadcast_rcv_next_ptr;
+	buf_ptr = dev->broadcast_rcv_buffer_ptrs[dev->broadcast_rcv_next_ptr++];
+	if (dev->broadcast_rcv_next_ptr == dev->num_broadcast_rcv_ptrs)
+		dev->broadcast_rcv_next_ptr = 0;
+
+	spin_unlock_irqrestore(&dev->lock, flags);
 
 	specifier_id =    (be32_to_cpu(buf_ptr[0]) & 0xffff) << 8
 			| (be32_to_cpu(buf_ptr[1]) & 0xff000000) >> 24;
-	ver = be32_to_cpu(buf_ptr[1]) & 0xFFFFFF;
+	ver = be32_to_cpu(buf_ptr[1]) & 0xffffff;
 	source_node_id = be32_to_cpu(buf_ptr[0]) >> 16;
-	/* fw_debug ( "source %x SpecID %x ver %x\n", source_node_id, specifier_id, ver ); */
-	if ( specifier_id == IPV4_GASP_SPECIFIER_ID && ver == IPV4_GASP_VERSION ) {
+
+	if (specifier_id == IANA_SPECIFIER_ID && ver == RFC2734_SW_VERSION) {
 		buf_ptr += 2;
-		length -= IPV4_GASP_OVERHEAD;
-		ipv4_incoming_packet(priv, buf_ptr, length, source_node_id, true);
-	} else
-		fw_debug ( "Ignoring packet: not GASP\n" );
-	packet.payload_length = priv->rcv_buffer_size;
+		length -= IEEE1394_GASP_HDR_SIZE;
+		fwnet_incoming_packet(dev, buf_ptr, length,
+				      source_node_id, true);
+	}
+
+	packet.payload_length = dev->rcv_buffer_size;
 	packet.interrupt = 1;
 	packet.skip = 0;
 	packet.tag = 3;
 	packet.sy = 0;
-	packet.header_length = IPV4_GASP_OVERHEAD;
-	spin_lock_irqsave(&priv->lock,flags);
-	retval = fw_iso_context_queue ( priv->broadcast_rcv_context, &packet,
- &priv->broadcast_rcv_buffer, offset );
-	spin_unlock_irqrestore(&priv->lock,flags);
-	if ( retval < 0 )
-		fw_error ( "requeue failed\n" );
-}
+	packet.header_length = IEEE1394_GASP_HDR_SIZE;
+
+	spin_lock_irqsave(&dev->lock, flags);
 
-static void debug_ptask ( struct ipv4_packet_task *ptask ) {
-	static const char *tx_types[] = { "Unknown", "GASP", "Write" };
-
-	fw_debug ( "packet %p { hdr { w0 %x w1 %x }, skb %p, priv %p,"
- " tx_type %s, outstanding_pkts %d, max_payload %x, fifo %llx,"
- " speed %x, dest_node %x, generation %x }\n",
- ptask, ptask->hdr.w0, ptask->hdr.w1, ptask->skb, ptask->priv,
- ptask->tx_type > IPV4_WRREQ ? "Invalid" : tx_types[ptask->tx_type],
- ptask->outstanding_pkts,  ptask->max_payload,
- ptask->fifo_addr, ptask->speed, ptask->dest_node, ptask->generation );
-	print_hex_dump ( KERN_DEBUG, "packet :", DUMP_PREFIX_OFFSET, 32, 1,
- ptask->skb->data, ptask->skb->len, false );
+	retval = fw_iso_context_queue(dev->broadcast_rcv_context, &packet,
+				      &dev->broadcast_rcv_buffer, offset);
+
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	if (retval < 0)
+		fw_error("requeue failed\n");
 }
 
-static void ipv4_transmit_packet_done ( struct ipv4_packet_task *ptask ) {
-	struct ipv4_priv *priv;
+static struct kmem_cache *fwnet_packet_task_cache;
+
+static int fwnet_send_packet(struct fwnet_packet_task *ptask);
+
+static void fwnet_transmit_packet_done(struct fwnet_packet_task *ptask)
+{
+	struct fwnet_device *dev;
 	unsigned long flags;
 
-	priv = ptask->priv;
-	spin_lock_irqsave ( &priv->lock, flags );
-	list_del ( &ptask->packet_list );
-	spin_unlock_irqrestore ( &priv->lock, flags );
-	ptask->outstanding_pkts--;
-	if ( ptask->outstanding_pkts > 0 ) {
+	dev = ptask->dev;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	list_del(&ptask->pt_link);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	ptask->outstanding_pkts--; /* FIXME access inside lock */
+
+	if (ptask->outstanding_pkts > 0) {
 		u16 dg_size;
 		u16 fg_off;
 		u16 datagram_label;
@@ -1126,133 +1012,139 @@ static void ipv4_transmit_packet_done ( struct ipv4_packet_task *ptask ) {
 		struct sk_buff *skb;
 
 		/* Update the ptask to point to the next fragment and send it */
-		lf = ipv4_get_hdr_lf(&ptask->hdr);
+		lf = fwnet_get_hdr_lf(&ptask->hdr);
 		switch (lf) {
-		case IPV4_HDR_LASTFRAG:
-		case IPV4_HDR_UNFRAG:
+		case RFC2374_HDR_LASTFRAG:
+		case RFC2374_HDR_UNFRAG:
 		default:
-			fw_error ( "Outstanding packet %x lf %x, header %x,%x\n", ptask->outstanding_pkts, lf, ptask->hdr.w0, ptask->hdr.w1 );
+			fw_error("Outstanding packet %x lf %x, header %x,%x\n",
+				 ptask->outstanding_pkts, lf, ptask->hdr.w0,
+				 ptask->hdr.w1);
 			BUG();
 
-		case IPV4_HDR_FIRSTFRAG:
+		case RFC2374_HDR_FIRSTFRAG:
 			/* Set frag type here for future interior fragments */
-			dg_size = ipv4_get_hdr_dg_size(&ptask->hdr);
-			fg_off = ptask->max_payload - IPV4_FRAG_HDR_SIZE;
-			datagram_label = ipv4_get_hdr_dgl(&ptask->hdr);
+			dg_size = fwnet_get_hdr_dg_size(&ptask->hdr);
+			fg_off = ptask->max_payload - RFC2374_FRAG_HDR_SIZE;
+			datagram_label = fwnet_get_hdr_dgl(&ptask->hdr);
 			break;
 
-		case IPV4_HDR_INTFRAG:
-			dg_size = ipv4_get_hdr_dg_size(&ptask->hdr);
-			fg_off = ipv4_get_hdr_fg_off(&ptask->hdr) + ptask->max_payload - IPV4_FRAG_HDR_SIZE;
-			datagram_label = ipv4_get_hdr_dgl(&ptask->hdr);
+		case RFC2374_HDR_INTFRAG:
+			dg_size = fwnet_get_hdr_dg_size(&ptask->hdr);
+			fg_off = fwnet_get_hdr_fg_off(&ptask->hdr)
+				  + ptask->max_payload - RFC2374_FRAG_HDR_SIZE;
+			datagram_label = fwnet_get_hdr_dgl(&ptask->hdr);
 			break;
 		}
 		skb = ptask->skb;
-		skb_pull ( skb, ptask->max_payload );
-		if ( ptask->outstanding_pkts > 1 ) {
-			ipv4_make_sf_hdr ( &ptask->hdr,
-  IPV4_HDR_INTFRAG, dg_size, fg_off, datagram_label );
+		skb_pull(skb, ptask->max_payload);
+		if (ptask->outstanding_pkts > 1) {
+			fwnet_make_sf_hdr(&ptask->hdr, RFC2374_HDR_INTFRAG,
+					  dg_size, fg_off, datagram_label);
 		} else {
-			ipv4_make_sf_hdr ( &ptask->hdr,
-  IPV4_HDR_LASTFRAG, dg_size, fg_off, datagram_label );
-			ptask->max_payload = skb->len + IPV4_FRAG_HDR_SIZE;
-
+			fwnet_make_sf_hdr(&ptask->hdr, RFC2374_HDR_LASTFRAG,
+					  dg_size, fg_off, datagram_label);
+			ptask->max_payload = skb->len + RFC2374_FRAG_HDR_SIZE;
 		}
-		ipv4_send_packet ( ptask );
+		fwnet_send_packet(ptask);
 	} else {
-		dev_kfree_skb_any ( ptask->skb );
-		kmem_cache_free( ipv4_packet_task_cache, ptask );
+		dev_kfree_skb_any(ptask->skb);
+		kmem_cache_free(fwnet_packet_task_cache, ptask);
 	}
 }
 
-static void ipv4_write_complete ( struct fw_card *card, int rcode,
- void *payload, size_t length, void *data ) {
-	struct ipv4_packet_task *ptask;
+static void fwnet_write_complete(struct fw_card *card, int rcode,
+				 void *payload, size_t length, void *data)
+{
+	struct fwnet_packet_task *ptask;
 
 	ptask = data;
-	fw_debug ( "ipv4_write_complete ( %p, %x, %p, %lx, %p )\n",
- card, rcode, payload, (unsigned long)length, data );
-	debug_ptask ( ptask );
 
-	if ( rcode == RCODE_COMPLETE ) {
-		ipv4_transmit_packet_done ( ptask );
-	} else {
-		fw_error ( "ipv4_write_complete: failed: %x\n", rcode );
+	if (rcode == RCODE_COMPLETE)
+		fwnet_transmit_packet_done(ptask);
+	else
+		fw_error("fwnet_write_complete: failed: %x\n", rcode);
 		/* ??? error recovery */
-	}
 }
 
-static int ipv4_send_packet ( struct ipv4_packet_task *ptask ) {
-	struct ipv4_priv *priv;
+static int fwnet_send_packet(struct fwnet_packet_task *ptask)
+{
+	struct fwnet_device *dev;
 	unsigned tx_len;
-	struct ipv4_hdr *bufhdr;
+	struct rfc2734_header *bufhdr;
 	unsigned long flags;
-	struct net_device *netdev;
-#if 0 /* stefanr */
-	int retval;
-#endif
+	struct net_device *net;
 
-	fw_debug ( "ipv4_send_packet\n" );
-	debug_ptask ( ptask );
-	priv = ptask->priv;
+	dev = ptask->dev;
 	tx_len = ptask->max_payload;
-	switch (ipv4_get_hdr_lf(&ptask->hdr)) {
-	case IPV4_HDR_UNFRAG:
-		bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_UNFRAG_HDR_SIZE);
-		bufhdr->w0 = htonl(ptask->hdr.w0);
+	switch (fwnet_get_hdr_lf(&ptask->hdr)) {
+	case RFC2374_HDR_UNFRAG:
+		bufhdr = (struct rfc2734_header *)
+				skb_push(ptask->skb, RFC2374_UNFRAG_HDR_SIZE);
+		put_unaligned_be32(ptask->hdr.w0, &bufhdr->w0);
 		break;
 
-	case IPV4_HDR_FIRSTFRAG:
-	case IPV4_HDR_INTFRAG:
-	case IPV4_HDR_LASTFRAG:
-		bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_FRAG_HDR_SIZE);
-		bufhdr->w0 = htonl(ptask->hdr.w0);
-		bufhdr->w1 = htonl(ptask->hdr.w1);
+	case RFC2374_HDR_FIRSTFRAG:
+	case RFC2374_HDR_INTFRAG:
+	case RFC2374_HDR_LASTFRAG:
+		bufhdr = (struct rfc2734_header *)
+				skb_push(ptask->skb, RFC2374_FRAG_HDR_SIZE);
+		put_unaligned_be32(ptask->hdr.w0, &bufhdr->w0);
+		put_unaligned_be32(ptask->hdr.w1, &bufhdr->w1);
 		break;
 
 	default:
 		BUG();
 	}
-	if ( ptask->tx_type == IPV4_GASP ) {
-		u32 *packets;
+	if (ptask->dest_node == IEEE1394_ALL_NODES) {
+		u8 *p;
 		int generation;
-		int nodeid;
+		int node_id;
 
 		/* ptask->generation may not have been set yet */
-		generation = priv->card->generation;
+		generation = dev->card->generation;
 		smp_rmb();
-		nodeid = priv->card->node_id;
-		packets = (u32 *)skb_push(ptask->skb, sizeof(u32)*2);
-		packets[0] = htonl(nodeid << 16 | (IPV4_GASP_SPECIFIER_ID>>8));
-		packets[1] = htonl((IPV4_GASP_SPECIFIER_ID & 0xFF) << 24 | IPV4_GASP_VERSION);
-		fw_send_request ( priv->card, &ptask->transaction, TCODE_STREAM_DATA,
- fw_stream_packet_destination_id(3, BROADCAST_CHANNEL, 0),
- generation, SCODE_100, 0ULL, ptask->skb->data, tx_len + 8, ipv4_write_complete, ptask );
-		spin_lock_irqsave(&priv->lock,flags);
-		list_add_tail ( &ptask->packet_list, &priv->broadcasted_list );
-		spin_unlock_irqrestore(&priv->lock,flags);
-#if 0 /* stefanr */
-		return retval;
-#else
+		node_id = dev->card->node_id;
+
+		p = skb_push(ptask->skb, 8);
+		put_unaligned_be32(node_id << 16 | IANA_SPECIFIER_ID >> 8, p);
+		put_unaligned_be32((IANA_SPECIFIER_ID & 0xff) << 24
+						| RFC2734_SW_VERSION, &p[4]);
+
+		/* We should not transmit if broadcast_channel.valid == 0. */
+		fw_send_request(dev->card, &ptask->transaction,
+				TCODE_STREAM_DATA,
+				fw_stream_packet_destination_id(3,
+						IEEE1394_BROADCAST_CHANNEL, 0),
+				generation, SCODE_100, 0ULL, ptask->skb->data,
+				tx_len + 8, fwnet_write_complete, ptask);
+
+		/* FIXME race? */
+		spin_lock_irqsave(&dev->lock, flags);
+		list_add_tail(&ptask->pt_link, &dev->broadcasted_list);
+		spin_unlock_irqrestore(&dev->lock, flags);
+
 		return 0;
-#endif
 	}
-	fw_debug("send_request (%p, %p, WRITE_BLOCK, %x, %x, %x, %llx, %p, %d, %p, %p\n",
- priv->card, &ptask->transaction, ptask->dest_node, ptask->generation,
- ptask->speed, (unsigned long long)ptask->fifo_addr, ptask->skb->data, tx_len,
- ipv4_write_complete, ptask );
-	fw_send_request ( priv->card, &ptask->transaction,
- TCODE_WRITE_BLOCK_REQUEST, ptask->dest_node, ptask->generation, ptask->speed,
- ptask->fifo_addr, ptask->skb->data, tx_len, ipv4_write_complete, ptask );
-	spin_lock_irqsave(&priv->lock,flags);
-	list_add_tail ( &ptask->packet_list, &priv->sent_list );
-	spin_unlock_irqrestore(&priv->lock,flags);
-	netdev = priv->card->netdev;
-	netdev->trans_start = jiffies;
+
+	fw_send_request(dev->card, &ptask->transaction,
+			TCODE_WRITE_BLOCK_REQUEST, ptask->dest_node,
+			ptask->generation, ptask->speed, ptask->fifo_addr,
+			ptask->skb->data, tx_len, fwnet_write_complete, ptask);
+
+	/* FIXME race? */
+	spin_lock_irqsave(&dev->lock, flags);
+	list_add_tail(&ptask->pt_link, &dev->sent_list);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	net = dev->card->netdev;
+	net->trans_start = jiffies;
+
 	return 0;
 }
 
-static int ipv4_broadcast_start ( struct ipv4_priv *priv ) {
+static int fwnet_broadcast_start(struct fwnet_device *dev)
+{
 	struct fw_iso_context *context;
 	int retval;
 	unsigned num_packets;
@@ -1260,150 +1152,151 @@ static int ipv4_broadcast_start ( struct ipv4_priv *priv ) {
 	struct fw_iso_packet packet;
 	unsigned long offset;
 	unsigned u;
-	/* unsigned transmit_speed; */
 
-#if 0 /* stefanr */
-	if ( priv->card->broadcast_channel != (BROADCAST_CHANNEL_VALID|BROADCAST_CHANNEL_INITIAL)) {
-		fw_notify ( "Invalid broadcast channel %x\n", priv->card->broadcast_channel );
-		/* FIXME: try again later? */
-		/* return -EINVAL; */
-	}
-#endif
-	if ( priv->local_fifo == INVALID_FIFO_ADDR ) {
-		struct fw_address_region region;
-
-		priv->handler.length = FIFO_SIZE;
-		priv->handler.address_callback = ipv4_receive_packet;
-		priv->handler.callback_data = priv;
-		/* FIXME: this is OHCI, but what about others? */
-		region.start = 0xffff00000000ULL;
-		region.end =   0xfffffffffffcULL;
-
-		retval = fw_core_add_address_handler ( &priv->handler, &region );
-		if ( retval < 0 )
+	if (dev->local_fifo == FWNET_NO_FIFO_ADDR) {
+		/* outside OHCI posted write area? */
+		static const struct fw_address_region region = {
+			.start = 0xffff00000000ULL,
+			.end   = CSR_REGISTER_BASE,
+		};
+
+		dev->handler.length = 4096;
+		dev->handler.address_callback = fwnet_receive_packet;
+		dev->handler.callback_data = dev;
+
+		retval = fw_core_add_address_handler(&dev->handler, &region);
+		if (retval < 0)
 			goto failed_initial;
-		priv->local_fifo = priv->handler.offset;
+
+		dev->local_fifo = dev->handler.offset;
 	}
 
-	/*
-	 * FIXME: rawiso limits us to PAGE_SIZE.  This only matters if we ever have
-	 * a machine with PAGE_SIZE < 4096
-	 */
-	max_receive = 1U << (priv->card->max_receive + 1);
-	num_packets = ( ipv4_iso_page_count * PAGE_SIZE ) / max_receive;
-	if ( ! priv->broadcast_rcv_context ) {
+	max_receive = 1U << (dev->card->max_receive + 1);
+	num_packets = (FWNET_ISO_PAGE_COUNT * PAGE_SIZE) / max_receive;
+
+	if (!dev->broadcast_rcv_context) {
 		void **ptrptr;
 
-		context = fw_iso_context_create ( priv->card,
- FW_ISO_CONTEXT_RECEIVE, BROADCAST_CHANNEL,
- priv->card->link_speed, 8, ipv4_receive_broadcast, priv );
+		context = fw_iso_context_create(dev->card,
+		    FW_ISO_CONTEXT_RECEIVE, IEEE1394_BROADCAST_CHANNEL,
+		    dev->card->link_speed, 8, fwnet_receive_broadcast, dev);
 		if (IS_ERR(context)) {
 			retval = PTR_ERR(context);
 			goto failed_context_create;
 		}
-		retval = fw_iso_buffer_init ( &priv->broadcast_rcv_buffer,
- priv->card, ipv4_iso_page_count, DMA_FROM_DEVICE );
-		if ( retval < 0 )
+
+		retval = fw_iso_buffer_init(&dev->broadcast_rcv_buffer,
+		    dev->card, FWNET_ISO_PAGE_COUNT, DMA_FROM_DEVICE);
+		if (retval < 0)
 			goto failed_buffer_init;
-		ptrptr = kmalloc ( sizeof(void*)*num_packets, GFP_KERNEL );
-		if ( ! ptrptr ) {
+
+		ptrptr = kmalloc(sizeof(void *) * num_packets, GFP_KERNEL);
+		if (!ptrptr) {
 			retval = -ENOMEM;
 			goto failed_ptrs_alloc;
 		}
-		priv->broadcast_rcv_buffer_ptrs = ptrptr;
-		for ( u = 0; u < ipv4_iso_page_count; u++ ) {
+
+		dev->broadcast_rcv_buffer_ptrs = ptrptr;
+		for (u = 0; u < FWNET_ISO_PAGE_COUNT; u++) {
 			void *ptr;
 			unsigned v;
 
-			ptr = kmap ( priv->broadcast_rcv_buffer.pages[u] );
-			for ( v = 0; v < num_packets / ipv4_iso_page_count; v++ )
-				*ptrptr++ = (void *)((char *)ptr + v * max_receive);
+			ptr = kmap(dev->broadcast_rcv_buffer.pages[u]);
+			for (v = 0; v < num_packets / FWNET_ISO_PAGE_COUNT; v++)
+				*ptrptr++ = (void *)
+						((char *)ptr + v * max_receive);
 		}
-		priv->broadcast_rcv_context = context;
-	} else
-		context = priv->broadcast_rcv_context;
+		dev->broadcast_rcv_context = context;
+	} else {
+		context = dev->broadcast_rcv_context;
+	}
 
 	packet.payload_length = max_receive;
 	packet.interrupt = 1;
 	packet.skip = 0;
 	packet.tag = 3;
 	packet.sy = 0;
-	packet.header_length = IPV4_GASP_OVERHEAD;
+	packet.header_length = IEEE1394_GASP_HDR_SIZE;
 	offset = 0;
-	for ( u = 0; u < num_packets; u++ ) {
-		retval = fw_iso_context_queue ( context, &packet,
- &priv->broadcast_rcv_buffer, offset );
-		if ( retval < 0 )
+
+	for (u = 0; u < num_packets; u++) {
+		retval = fw_iso_context_queue(context, &packet,
+				&dev->broadcast_rcv_buffer, offset);
+		if (retval < 0)
 			goto failed_rcv_queue;
+
 		offset += max_receive;
 	}
-	priv->num_broadcast_rcv_ptrs = num_packets;
-	priv->rcv_buffer_size = max_receive;
-	priv->broadcast_rcv_next_ptr = 0U;
-	retval = fw_iso_context_start ( context, -1, 0, FW_ISO_CONTEXT_MATCH_ALL_TAGS ); /* ??? sync */
-	if ( retval < 0 )
+	dev->num_broadcast_rcv_ptrs = num_packets;
+	dev->rcv_buffer_size = max_receive;
+	dev->broadcast_rcv_next_ptr = 0U;
+	retval = fw_iso_context_start(context, -1, 0,
+			FW_ISO_CONTEXT_MATCH_ALL_TAGS); /* ??? sync */
+	if (retval < 0)
 		goto failed_rcv_queue;
-	/* FIXME: adjust this when we know the max receive speeds of all other IP nodes on the bus. */
-	/* since we only xmt at S100 ??? */
-	priv->broadcast_xmt_max_payload = S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD - IPV4_UNFRAG_HDR_SIZE;
-	priv->broadcast_state = IPV4_BROADCAST_RUNNING;
+
+	/* FIXME: adjust it according to the min. speed of all known peers? */
+	dev->broadcast_xmt_max_payload = IEEE1394_MAX_PAYLOAD_S100
+			- IEEE1394_GASP_HDR_SIZE - RFC2374_UNFRAG_HDR_SIZE;
+	dev->broadcast_state = FWNET_BROADCAST_RUNNING;
+
 	return 0;
 
  failed_rcv_queue:
-	kfree ( priv->broadcast_rcv_buffer_ptrs );
-	priv->broadcast_rcv_buffer_ptrs = NULL;
+	kfree(dev->broadcast_rcv_buffer_ptrs);
+	dev->broadcast_rcv_buffer_ptrs = NULL;
  failed_ptrs_alloc:
-	fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card );
+	fw_iso_buffer_destroy(&dev->broadcast_rcv_buffer, dev->card);
  failed_buffer_init:
-	fw_iso_context_destroy ( context );
-	priv->broadcast_rcv_context = NULL;
+	fw_iso_context_destroy(context);
+	dev->broadcast_rcv_context = NULL;
  failed_context_create:
-	fw_core_remove_address_handler ( &priv->handler );
+	fw_core_remove_address_handler(&dev->handler);
  failed_initial:
-	priv->local_fifo = INVALID_FIFO_ADDR;
+	dev->local_fifo = FWNET_NO_FIFO_ADDR;
+
 	return retval;
 }
 
-/* This is called after an "ifup" */
-static int ipv4_open(struct net_device *dev) {
-	struct ipv4_priv *priv;
+/* ifup */
+static int fwnet_open(struct net_device *net)
+{
+	struct fwnet_device *dev = netdev_priv(net);
 	int ret;
 
-	priv = netdev_priv(dev);
-	if (priv->broadcast_state == IPV4_BROADCAST_ERROR) {
-		ret = ipv4_broadcast_start ( priv );
+	if (dev->broadcast_state == FWNET_BROADCAST_ERROR) {
+		ret = fwnet_broadcast_start(dev);
 		if (ret)
 			return ret;
 	}
-	netif_start_queue(dev);
+	netif_start_queue(net);
+
 	return 0;
 }
 
-/* This is called after an "ifdown" */
-static int ipv4_stop(struct net_device *netdev)
+/* ifdown */
+static int fwnet_stop(struct net_device *net)
 {
-	/* flush priv->wake */
-	/* flush_scheduled_work(); */
+	netif_stop_queue(net);
+
+	/* Deallocate iso context for use by other applications? */
 
-	netif_stop_queue(netdev);
 	return 0;
 }
 
-/* Transmit a packet (called by kernel) */
-static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev)
+static int fwnet_tx(struct sk_buff *skb, struct net_device *net)
 {
-	struct ipv4_ether_hdr hdr_buf;
-	struct ipv4_priv *priv = netdev_priv(netdev);
+	struct fwnet_header hdr_buf;
+	struct fwnet_device *dev = netdev_priv(net);
 	__be16 proto;
 	u16 dest_node;
-	enum ipv4_tx_type tx_type;
 	unsigned max_payload;
 	u16 dg_size;
 	u16 *datagram_label_ptr;
-	struct ipv4_packet_task *ptask;
-	struct ipv4_node *node = NULL;
+	struct fwnet_packet_task *ptask;
+	struct fwnet_peer *peer = NULL;
 
-	ptask = kmem_cache_alloc(ipv4_packet_task_cache, GFP_ATOMIC);
+	ptask = kmem_cache_alloc(fwnet_packet_task_cache, GFP_ATOMIC);
 	if (ptask == NULL)
 		goto fail;
 
@@ -1412,7 +1305,7 @@ static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev)
 		goto fail;
 
 	/*
-	 * Get rid of the fake ipv4 header, but first make a copy.
+	 * Make a copy of the driver-specific header.
 	 * We might need to rebuild the header on tx failure.
 	 */
 	memcpy(&hdr_buf, skb->data, sizeof(hdr_buf));
@@ -1425,110 +1318,95 @@ static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev)
 	 * Set the transmission type for the packet.  ARP packets and IP
 	 * broadcast packets are sent via GASP.
 	 */
-	if (   memcmp(hdr_buf.h_dest, netdev->broadcast, IPV4_ALEN) == 0
+	if (memcmp(hdr_buf.h_dest, net->broadcast, FWNET_ALEN) == 0
 	    || proto == htons(ETH_P_ARP)
-	    || (   proto == htons(ETH_P_IP)
-		&& IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)) ) ) {
-		/* fw_debug ( "transmitting arp or multicast packet\n" );*/
-		tx_type = IPV4_GASP;
-		dest_node = ALL_NODES;
-		max_payload = priv->broadcast_xmt_max_payload;
-		/* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD); */
-		datagram_label_ptr = &priv->broadcast_xmt_datagramlabel;
-		ptask->fifo_addr = INVALID_FIFO_ADDR;
-		ptask->generation = 0U;
-		ptask->dest_node = 0U;
-		ptask->speed = 0;
+	    || (proto == htons(ETH_P_IP)
+		&& IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)))) {
+		max_payload = dev->broadcast_xmt_max_payload;
+		datagram_label_ptr = &dev->broadcast_xmt_datagramlabel;
+
+		ptask->fifo_addr = FWNET_NO_FIFO_ADDR;
+		ptask->generation = 0;
+		ptask->dest_node = IEEE1394_ALL_NODES;
+		ptask->speed = SCODE_100;
 	} else {
-		__be64 guid = get_unaligned((u64 *)hdr_buf.h_dest);
+		__be64 guid = get_unaligned((__be64 *)hdr_buf.h_dest);
 		u8 generation;
 
-		node = ipv4_node_find_by_guid(priv, be64_to_cpu(guid));
-		if (!node) {
-			fw_debug ( "Normal packet but no node\n" );
+		peer = fwnet_peer_find_by_guid(dev, be64_to_cpu(guid));
+		if (!peer)
 			goto fail;
-		}
 
-		if (node->fifo == INVALID_FIFO_ADDR) {
-			fw_debug ( "Normal packet but no fifo addr\n" );
+		if (peer->fifo == FWNET_NO_FIFO_ADDR)
 			goto fail;
-		}
 
-		/* fw_debug ( "Transmitting normal packet to %x at %llxx\n", node->nodeid, node->fifo ); */
-		generation = node->generation;
-		dest_node = node->nodeid;
-		max_payload = node->max_payload;
-		/* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_FRAG_HDR_SIZE); */
+		generation = peer->generation;
+		smp_rmb();
+		dest_node = peer->node_id;
+
+		max_payload = peer->max_payload;
+		datagram_label_ptr = &peer->datagram_label;
 
-		datagram_label_ptr = &node->datagram_label;
-		tx_type = IPV4_WRREQ;
-		ptask->fifo_addr = node->fifo;
+		ptask->fifo_addr = peer->fifo;
 		ptask->generation = generation;
 		ptask->dest_node = dest_node;
-		ptask->speed = node->xmt_speed;
+		ptask->speed = peer->xmt_speed;
 	}
 
 	/* If this is an ARP packet, convert it */
 	if (proto == htons(ETH_P_ARP)) {
-		/* Convert a standard ARP packet to 1394 ARP. The first 8 bytes (the entire
-		 * arphdr) is the same format as the ip1394 header, so they overlap.  The rest
-		 * needs to be munged a bit.  The remainder of the arphdr is formatted based
-		 * on hwaddr len and ipaddr len.  We know what they'll be, so it's easy to
-		 * judge.
-		 *
-		 * Now that the EUI is used for the hardware address all we need to do to make
-		 * this work for 1394 is to insert 2 quadlets that contain max_rec size,
-		 * speed, and unicast FIFO address information between the sender_unique_id
-		 * and the IP addresses.
-		 */
 		struct arphdr *arp = (struct arphdr *)skb->data;
 		unsigned char *arp_ptr = (unsigned char *)(arp + 1);
-		struct ipv4_arp *arp1394 = (struct ipv4_arp *)skb->data;
-		u32 ipaddr;
-
-		ipaddr = *(u32*)(arp_ptr + IPV4_ALEN);
-		arp1394->hw_addr_len    = 16;
-		arp1394->max_rec        = priv->card->max_receive;
-		arp1394->sspd		= priv->card->link_speed;
-		arp1394->fifo_hi	= htons(priv->local_fifo >> 32);
-		arp1394->fifo_lo        = htonl(priv->local_fifo & 0xFFFFFFFF);
-		arp1394->sip		= ipaddr;
+		struct rfc2734_arp *arp1394 = (struct rfc2734_arp *)skb->data;
+		__be32 ipaddr;
+
+		ipaddr = get_unaligned((__be32 *)(arp_ptr + FWNET_ALEN));
+
+		arp1394->hw_addr_len    = RFC2734_HW_ADDR_LEN;
+		arp1394->max_rec        = dev->card->max_receive;
+		arp1394->sspd		= dev->card->link_speed;
+
+		put_unaligned_be16(dev->local_fifo >> 32,
+				   &arp1394->fifo_hi);
+		put_unaligned_be32(dev->local_fifo & 0xffffffff,
+				   &arp1394->fifo_lo);
+		put_unaligned(ipaddr, &arp1394->sip);
 	}
-	if ( ipv4_max_xmt && max_payload > ipv4_max_xmt )
-		max_payload = ipv4_max_xmt;
 
 	ptask->hdr.w0 = 0;
 	ptask->hdr.w1 = 0;
 	ptask->skb = skb;
-	ptask->priv = priv;
-        ptask->tx_type = tx_type;
+	ptask->dev = dev;
+
 	/* Does it all fit in one packet? */
-	if ( dg_size <= max_payload ) {
-		ipv4_make_uf_hdr(&ptask->hdr, be16_to_cpu(proto));
+	if (dg_size <= max_payload) {
+		fwnet_make_uf_hdr(&ptask->hdr, ntohs(proto));
 		ptask->outstanding_pkts = 1;
-		max_payload = dg_size + IPV4_UNFRAG_HDR_SIZE;
+		max_payload = dg_size + RFC2374_UNFRAG_HDR_SIZE;
 	} else {
 		u16 datagram_label;
 
-		max_payload -= IPV4_FRAG_OVERHEAD;
+		max_payload -= RFC2374_FRAG_OVERHEAD;
 		datagram_label = (*datagram_label_ptr)++;
-		ipv4_make_ff_hdr(&ptask->hdr, be16_to_cpu(proto), dg_size, datagram_label );
+		fwnet_make_ff_hdr(&ptask->hdr, ntohs(proto), dg_size,
+				  datagram_label);
 		ptask->outstanding_pkts = DIV_ROUND_UP(dg_size, max_payload);
-		max_payload += IPV4_FRAG_HDR_SIZE;
+		max_payload += RFC2374_FRAG_HDR_SIZE;
 	}
 	ptask->max_payload = max_payload;
-	ipv4_send_packet ( ptask );
+	fwnet_send_packet(ptask);
+
 	return NETDEV_TX_OK;
 
  fail:
 	if (ptask)
-		kmem_cache_free(ipv4_packet_task_cache, ptask);
+		kmem_cache_free(fwnet_packet_task_cache, ptask);
 
 	if (skb != NULL)
 		dev_kfree_skb(skb);
 
-	netdev->stats.tx_dropped++;
-	netdev->stats.tx_errors++;
+	net->stats.tx_dropped++;
+	net->stats.tx_errors++;
 
 	/*
 	 * FIXME: According to a patch from 2003-02-26, "returning non-zero
@@ -1540,280 +1418,291 @@ static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev)
 	return NETDEV_TX_OK;
 }
 
-/*
- * FIXME: What to do if we timeout? I think a host reset is probably in order,
- * so that's what we do. Should we increment the stat counters too?
- */
-static void ipv4_tx_timeout(struct net_device *dev) {
-	struct ipv4_priv *priv;
+static void fwnet_tx_timeout(struct net_device *net)
+{
+	fw_error("%s: timeout\n", net->name);
 
-	priv = netdev_priv(dev);
-	fw_error ( "%s: Timeout, resetting host\n", dev->name );
-#if 0 /* stefanr */
-	fw_core_initiate_bus_reset ( priv->card, 1 );
-#endif
+	/* FIXME: What to do if we timeout? */
 }
 
-static int ipv4_change_mtu ( struct net_device *dev, int new_mtu ) {
-#if 0
-	int max_mtu;
-	struct ipv4_priv *priv;
-#endif
-
+static int fwnet_change_mtu(struct net_device *net, int new_mtu)
+{
 	if (new_mtu < 68)
 		return -EINVAL;
 
-#if 0
-	priv = netdev_priv(dev);
-	/* This is not actually true because we can fragment packets at the firewire layer */
-	max_mtu = (1 << (priv->card->max_receive + 1))
-		                - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD;
-	if (new_mtu > max_mtu) {
-		fw_notify ( "%s: Local node constrains MTU to %d\n", dev->name, max_mtu);
-		return -ERANGE;
-	}
-#endif
-	dev->mtu = new_mtu;
+	net->mtu = new_mtu;
 	return 0;
 }
 
-static void ipv4_get_drvinfo(struct net_device *dev,
-struct ethtool_drvinfo *info) {
-	strcpy(info->driver, ipv4_driver_name);
-	strcpy(info->bus_info, "ieee1394"); /* FIXME provide more detail? */
+static void fwnet_get_drvinfo(struct net_device *net,
+			      struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, KBUILD_MODNAME);
+	strcpy(info->bus_info, "ieee1394");
 }
 
-static struct ethtool_ops ipv4_ethtool_ops = {
-	.get_drvinfo = ipv4_get_drvinfo,
+static struct ethtool_ops fwnet_ethtool_ops = {
+	.get_drvinfo = fwnet_get_drvinfo,
 };
 
-static const struct net_device_ops ipv4_netdev_ops = {
-	.ndo_open       = ipv4_open,
-	.ndo_stop	= ipv4_stop,
-	.ndo_start_xmit = ipv4_tx,
-	.ndo_tx_timeout = ipv4_tx_timeout,
-	.ndo_change_mtu = ipv4_change_mtu,
+static const struct net_device_ops fwnet_netdev_ops = {
+	.ndo_open       = fwnet_open,
+	.ndo_stop	= fwnet_stop,
+	.ndo_start_xmit = fwnet_tx,
+	.ndo_tx_timeout = fwnet_tx_timeout,
+	.ndo_change_mtu = fwnet_change_mtu,
 };
 
-static void ipv4_init_dev ( struct net_device *dev ) {
-	dev->header_ops		= &ipv4_header_ops;
-	dev->netdev_ops         = &ipv4_netdev_ops;
-	SET_ETHTOOL_OPS(dev, &ipv4_ethtool_ops);
-
-	dev->watchdog_timeo	= IPV4_TIMEOUT;
-	dev->flags		= IFF_BROADCAST | IFF_MULTICAST;
-	dev->features		= NETIF_F_HIGHDMA;
-	dev->addr_len		= IPV4_ALEN;
-	dev->hard_header_len	= IPV4_HLEN;
-	dev->type		= ARPHRD_IEEE1394;
-
-	/* FIXME: This value was copied from ether_setup(). Is it too much? */
-	dev->tx_queue_len	= 1000;
+static void fwnet_init_dev(struct net_device *net)
+{
+	net->header_ops		= &fwnet_header_ops;
+	net->netdev_ops		= &fwnet_netdev_ops;
+	net->watchdog_timeo	= 100000; /* ? FIXME */
+	net->flags		= IFF_BROADCAST | IFF_MULTICAST;
+	net->features		= NETIF_F_HIGHDMA;
+	net->addr_len		= FWNET_ALEN;
+	net->hard_header_len	= FWNET_HLEN;
+	net->type		= ARPHRD_IEEE1394;
+	net->tx_queue_len	= 1000; /* ? FIXME */
+	SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops);
 }
 
-static int ipv4_probe ( struct device *dev ) {
-	struct fw_unit * unit;
-	struct fw_device *device;
-	struct fw_card *card;
-	struct net_device *netdev;
-	struct ipv4_priv *priv;
+/* FIXME create netdev upon first fw_unit of a card, not upon local fw_unit */
+static int fwnet_probe(struct device *_dev)
+{
+	struct fw_unit *unit = fw_unit(_dev);
+	struct fw_device *device = fw_parent_device(unit);
+	struct fw_card *card = device->card;
+	struct net_device *net;
+	struct fwnet_device *dev;
 	unsigned max_mtu;
-	__be64 guid;
-
-	fw_debug("ipv4 Probing\n" );
-	unit = fw_unit ( dev );
-	device = fw_device ( unit->device.parent );
-	card = device->card;
 
-	if ( ! device->is_local ) {
+	if (!device->is_local) {
 		int added;
 
-		fw_debug ( "Non-local, adding remote node entry\n" );
-		added = ipv4_node_new ( card, device );
+		added = fwnet_peer_new(card, device);
 		return added;
 	}
-	fw_debug("ipv4 Local: adding netdev\n" );
-	netdev = alloc_netdev ( sizeof(*priv), "firewire%d", ipv4_init_dev );
-	if ( netdev == NULL) {
-		fw_error( "Out of memory\n");
+	net = alloc_netdev(sizeof(*dev), "firewire%d", fwnet_init_dev);
+	if (net == NULL) {
+		fw_error("out of memory\n");
 		goto out;
 	}
 
-	SET_NETDEV_DEV(netdev, card->device);
-	priv = netdev_priv(netdev);
+	SET_NETDEV_DEV(net, card->device);
+	dev = netdev_priv(net);
 
-	spin_lock_init(&priv->lock);
-	priv->broadcast_state = IPV4_BROADCAST_ERROR;
-	priv->broadcast_rcv_context = NULL;
-	priv->broadcast_xmt_max_payload = 0;
-	priv->broadcast_xmt_datagramlabel = 0;
+	spin_lock_init(&dev->lock);
+	dev->broadcast_state = FWNET_BROADCAST_ERROR;
+	dev->broadcast_rcv_context = NULL;
+	dev->broadcast_xmt_max_payload = 0;
+	dev->broadcast_xmt_datagramlabel = 0;
 
-	priv->local_fifo = INVALID_FIFO_ADDR;
+	dev->local_fifo = FWNET_NO_FIFO_ADDR;
 
-	/* INIT_WORK(&priv->wake, ipv4_handle_queue);*/
-	INIT_LIST_HEAD(&priv->packet_list);
-	INIT_LIST_HEAD(&priv->broadcasted_list);
-	INIT_LIST_HEAD(&priv->sent_list );
+	/* INIT_WORK(&dev->wake, fwnet_handle_queue);*/
+	INIT_LIST_HEAD(&dev->packet_list);
+	INIT_LIST_HEAD(&dev->broadcasted_list);
+	INIT_LIST_HEAD(&dev->sent_list);
 
-	priv->card = card;
+	dev->card = card;
 
 	/*
 	 * Use the RFC 2734 default 1500 octets or the maximum payload
 	 * as initial MTU
 	 */
 	max_mtu = (1 << (card->max_receive + 1))
-		  - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD;
-	netdev->mtu = min(1500U, max_mtu);
+		  - sizeof(struct rfc2734_header) - IEEE1394_GASP_HDR_SIZE;
+	net->mtu = min(1500U, max_mtu);
 
 	/* Set our hardware address while we're at it */
-	guid = cpu_to_be64(card->guid);
-	memcpy(netdev->dev_addr, &guid, sizeof(u64));
-	memset(netdev->broadcast, 0xff, sizeof(u64));
-	if ( register_netdev ( netdev ) ) {
-		fw_error ( "Cannot register the driver\n");
+	put_unaligned_be64(card->guid, net->dev_addr);
+	put_unaligned_be64(~0ULL, net->broadcast);
+	if (register_netdev(net)) {
+		fw_error("Cannot register the driver\n");
 		goto out;
 	}
 
-	fw_notify ( "%s: IPv4 over Firewire on device %016llx\n",
- netdev->name, card->guid );
-	card->netdev = netdev;
+	fw_notify("%s: IPv4 over FireWire on device %016llx\n",
+		  net->name, (unsigned long long)card->guid);
+	card->netdev = net;
 
-	return 0 /* ipv4_new_node ( ud ) */;
+	return 0;
  out:
-	if ( netdev )
-		free_netdev ( netdev );
+	if (net)
+		free_netdev(net);
+
 	return -ENOENT;
 }
 
+static int fwnet_remove(struct device *_dev)
+{
+	struct fw_unit *unit = fw_unit(_dev);
+	struct fw_device *device = fw_parent_device(unit);
+	struct fw_card *card = device->card;
+	struct net_device *net;
+	struct fwnet_device *dev;
+	struct fwnet_peer *peer;
+	struct fwnet_partial_datagram *pd, *pd_next;
+	struct fwnet_packet_task *ptask, *pt_next;
+
+	if (!device->is_local) {
+		fwnet_peer_delete(card, device);
 
-static int ipv4_remove ( struct device *dev ) {
-	struct fw_unit * unit;
-	struct fw_device *device;
-	struct fw_card *card;
-	struct net_device *netdev;
-	struct ipv4_priv *priv;
-	struct ipv4_node *node;
-	struct ipv4_partial_datagram *pd, *pd_next;
-	struct ipv4_packet_task *ptask, *pt_next;
-
-	fw_debug("ipv4 Removing\n" );
-	unit = fw_unit ( dev );
-	device = fw_device ( unit->device.parent );
-	card = device->card;
-
-	if ( ! device->is_local ) {
-		fw_debug ( "Node %x is non-local, removing remote node entry\n", device->node_id );
-		ipv4_node_delete ( card, device );
 		return 0;
 	}
-	netdev = card->netdev;
-	if ( netdev ) {
-		fw_debug ( "Node %x is local: deleting netdev\n", device->node_id );
-		priv = netdev_priv ( netdev );
-		unregister_netdev ( netdev );
-		fw_debug ( "unregistered\n" );
-		if ( priv->local_fifo != INVALID_FIFO_ADDR )
-			fw_core_remove_address_handler ( &priv->handler );
-		fw_debug ( "address handler gone\n" );
-		if ( priv->broadcast_rcv_context ) {
-			fw_iso_context_stop ( priv->broadcast_rcv_context );
-			fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card );
-			fw_iso_context_destroy ( priv->broadcast_rcv_context );
-			fw_debug ( "rcv stopped\n" );
+
+	net = card->netdev;
+	if (net) {
+		dev = netdev_priv(net);
+		unregister_netdev(net);
+
+		if (dev->local_fifo != FWNET_NO_FIFO_ADDR)
+			fw_core_remove_address_handler(&dev->handler);
+		if (dev->broadcast_rcv_context) {
+			fw_iso_context_stop(dev->broadcast_rcv_context);
+			fw_iso_buffer_destroy(&dev->broadcast_rcv_buffer,
+					      dev->card);
+			fw_iso_context_destroy(dev->broadcast_rcv_context);
 		}
-		list_for_each_entry_safe( ptask, pt_next, &priv->packet_list, packet_list ) {
-			dev_kfree_skb_any ( ptask->skb );
-			kmem_cache_free( ipv4_packet_task_cache, ptask );
+		list_for_each_entry_safe(ptask, pt_next,
+					 &dev->packet_list, pt_link) {
+			dev_kfree_skb_any(ptask->skb);
+			kmem_cache_free(fwnet_packet_task_cache, ptask);
 		}
-		list_for_each_entry_safe( ptask, pt_next, &priv->broadcasted_list, packet_list ) {
-			dev_kfree_skb_any ( ptask->skb );
-			kmem_cache_free( ipv4_packet_task_cache, ptask );
+		list_for_each_entry_safe(ptask, pt_next,
+					 &dev->broadcasted_list, pt_link) {
+			dev_kfree_skb_any(ptask->skb);
+			kmem_cache_free(fwnet_packet_task_cache, ptask);
 		}
-		list_for_each_entry_safe( ptask, pt_next, &priv->sent_list, packet_list ) {
-			dev_kfree_skb_any ( ptask->skb );
-			kmem_cache_free( ipv4_packet_task_cache, ptask );
+		list_for_each_entry_safe(ptask, pt_next,
+					 &dev->sent_list, pt_link) {
+			dev_kfree_skb_any(ptask->skb);
+			kmem_cache_free(fwnet_packet_task_cache, ptask);
 		}
-		fw_debug ( "lists emptied\n" );
-		list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) {
-			if ( node->pdg_size ) {
-				list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list )
-					ipv4_pd_delete ( pd );
-				node->pdg_size = 0;
+		list_for_each_entry(peer, &card->peer_list, peer_link) {
+			if (peer->pdg_size) {
+				list_for_each_entry_safe(pd, pd_next,
+						&peer->pd_list, pd_link)
+					fwnet_pd_delete(pd);
+				peer->pdg_size = 0;
 			}
-			node->fifo = INVALID_FIFO_ADDR;
+			peer->fifo = FWNET_NO_FIFO_ADDR;
 		}
-		fw_debug ( "nodes cleaned up\n" );
-		free_netdev ( netdev );
+		free_netdev(net);
 		card->netdev = NULL;
-		fw_debug ( "done\n" );
 	}
+
 	return 0;
 }
 
-static void ipv4_update ( struct fw_unit *unit ) {
-	struct fw_device *device;
-	struct fw_card *card;
+/*
+ * FIXME abort partially sent fragmented datagrams,
+ * discard partially received fragmented datagrams
+ */
+static void fwnet_update(struct fw_unit *unit)
+{
+	struct fw_device *device = fw_parent_device(unit);
+	struct net_device *net = device->card->netdev;
+	struct fwnet_device *dev;
+	struct fwnet_peer *peer;
+	u64 guid;
 
-	fw_debug ( "ipv4_update unit %p\n", unit );
-	device = fw_device ( unit->device.parent );
-	card = device->card;
-	if ( ! device->is_local ) {
-		struct ipv4_node *node;
-		u64 guid;
-		struct net_device *netdev;
-		struct ipv4_priv *priv;
-
-		netdev = card->netdev;
-		if ( netdev ) {
-			priv = netdev_priv ( netdev );
-			guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
-			node = ipv4_node_find_by_guid ( priv, guid );
-			if ( ! node ) {
-				fw_error ( "ipv4_update: no node for device %llx\n", guid );
-				return;
-			}
-			fw_debug ( "Non-local, updating remote node entry for guid %llx old generation %x, old nodeid %x\n", guid, node->generation, node->nodeid );
-			node->generation = device->generation;
-			rmb();
-			node->nodeid = device->node_id;
-			fw_debug ( "New generation %x, new nodeid %x\n", node->generation, node->nodeid );
-		} else
-			fw_error ( "nonlocal, but no netdev?  How can that be?\n" );
-	} else {
-		/* FIXME: What do we need to do on bus reset? */
-		fw_debug ( "Local, doing nothing\n" );
+	if (net && !device->is_local) {
+		dev = netdev_priv(net);
+		guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+		peer = fwnet_peer_find_by_guid(dev, guid);
+		if (!peer) {
+			fw_error("fwnet_update: no peer for device %016llx\n",
+				 (unsigned long long)guid);
+			return;
+		}
+		peer->generation = device->generation;
+		rmb();
+		peer->node_id = device->node_id;
 	}
 }
 
-static struct fw_driver ipv4_driver = {
+static const struct ieee1394_device_id fwnet_id_table[] = {
+	{
+		.match_flags  = IEEE1394_MATCH_SPECIFIER_ID |
+				IEEE1394_MATCH_VERSION,
+		.specifier_id = IANA_SPECIFIER_ID,
+		.version      = RFC2734_SW_VERSION,
+	},
+	{ }
+};
+
+static struct fw_driver fwnet_driver = {
 	.driver = {
-		.owner = THIS_MODULE,
-		.name = ipv4_driver_name,
-		.bus = &fw_bus_type,
-		.probe = ipv4_probe,
-		.remove = ipv4_remove,
+		.owner  = THIS_MODULE,
+		.name   = "net",
+		.bus    = &fw_bus_type,
+		.probe  = fwnet_probe,
+		.remove = fwnet_remove,
 	},
-	.update = ipv4_update,
-	.id_table = ipv4_id_table,
+	.update   = fwnet_update,
+	.id_table = fwnet_id_table,
+};
+
+static const u32 rfc2374_unit_directory_data[] = {
+	0x00040000,	/* directory_length		*/
+	0x1200005e,	/* unit_specifier_id: IANA	*/
+	0x81000003,	/* textual descriptor offset	*/
+	0x13000001,	/* unit_sw_version: RFC 2734	*/
+	0x81000005,	/* textual descriptor offset	*/
+	0x00030000,	/* descriptor_length		*/
+	0x00000000,	/* text				*/
+	0x00000000,	/* minimal ASCII, en		*/
+	0x49414e41,	/* I A N A			*/
+	0x00030000,	/* descriptor_length		*/
+	0x00000000,	/* text				*/
+	0x00000000,	/* minimal ASCII, en		*/
+	0x49507634,	/* I P v 4			*/
+};
+
+static struct fw_descriptor rfc2374_unit_directory = {
+	.length = ARRAY_SIZE(rfc2374_unit_directory_data),
+	.key    = (CSR_DIRECTORY | CSR_UNIT) << 24,
+	.data   = rfc2374_unit_directory_data
 };
 
-static int __init ipv4_init ( void ) {
-	int added;
+static int __init fwnet_init(void)
+{
+	int err;
+
+	err = fw_core_add_descriptor(&rfc2374_unit_directory);
+	if (err)
+		return err;
 
-	added = fw_core_add_descriptor ( &ipv4_unit_directory );
-	if ( added < 0 )
-		fw_error ( "Failed to add descriptor" );
-	ipv4_packet_task_cache = kmem_cache_create("packet_task",
- sizeof(struct ipv4_packet_task), 0, 0, NULL);
-	fw_debug("Adding ipv4 module\n" );
-	return driver_register ( &ipv4_driver.driver );
+	fwnet_packet_task_cache = kmem_cache_create("packet_task",
+			sizeof(struct fwnet_packet_task), 0, 0, NULL);
+	if (!fwnet_packet_task_cache) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = driver_register(&fwnet_driver.driver);
+	if (!err)
+		return 0;
+
+	kmem_cache_destroy(fwnet_packet_task_cache);
+out:
+	fw_core_remove_descriptor(&rfc2374_unit_directory);
+
+	return err;
 }
+module_init(fwnet_init);
 
-static void __exit ipv4_cleanup ( void ) {
-	fw_core_remove_descriptor ( &ipv4_unit_directory );
-	fw_debug("Removing ipv4 module\n" );
-	driver_unregister ( &ipv4_driver.driver );
+static void __exit fwnet_cleanup(void)
+{
+	driver_unregister(&fwnet_driver.driver);
+	kmem_cache_destroy(fwnet_packet_task_cache);
+	fw_core_remove_descriptor(&rfc2374_unit_directory);
 }
+module_exit(fwnet_cleanup);
 
-module_init(ipv4_init);
-module_exit(ipv4_cleanup);
+MODULE_AUTHOR("Jay Fenlason <fenlason@redhat.com>");
+MODULE_DESCRIPTION("IPv4 over IEEE1394 as per RFC 2734");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(ieee1394, fwnet_id_table);
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index d44f47d3b2d9..5cb0c1549ff1 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -131,13 +131,10 @@ struct fw_card {
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
 	u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
-	/* Only non-NULL if firewire-ipv4 is active on this card. */
+
+	/* firewire-net driver data */
 	void *netdev;
-	/*
-	 * The nodes get probed before the card, so we need a place to store
-	 * them independent of card->netdev
-	 */
-	struct list_head ipv4_nodes;
+	struct list_head peer_list;
 };
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
-- 
cgit v1.2.3-71-gd317


From 5a124d382ea5c97be43c779e4f481455e0287654 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 14 Jun 2009 11:45:27 +0200
Subject: firewire: net: allow for unordered unit discovery

Decouple the creation and destruction of the net_device from the order
of discovery and removal of nodes with RFC 2734 unit directories since
there is no reliable order.  The net_device is now created when the
first RFC 2734 unit on a card is discovered, and destroyed when the last
RFC 2734 unit on a card goes away.  This includes all remote units as
well as the local unit, which is therefore tracked as a peer now too.

Also, locking around the list of peers is slightly extended to guard
against peer removal.  As a side effect, fwnet_peer.pdg_lock has become
superfluous and is deleted.

Peer data (max_rec, speed, node ID, generation) are updated more
carefully.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c |   2 -
 drivers/firewire/net.c       | 454 ++++++++++++++++++++-----------------------
 include/linux/firewire.h     |   4 -
 3 files changed, 207 insertions(+), 253 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 8c45e43da7c5..667603ac14b1 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -429,8 +429,6 @@ void fw_card_initialize(struct fw_card *card,
 	card->local_node = NULL;
 
 	INIT_DELAYED_WORK(&card->work, fw_card_bm_work);
-	card->netdev = NULL;
-	INIT_LIST_HEAD(&card->peer_list);
 }
 EXPORT_SYMBOL(fw_card_initialize);
 
diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index ba6f924b1b13..d83c54587a63 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -18,8 +18,10 @@
 #include <linux/mod_devicetable.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/mutex.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/spinlock.h>
 
 #include <asm/unaligned.h>
 #include <net/arp.h>
@@ -135,40 +137,11 @@ struct fwnet_partial_datagram {
 	u16 datagram_size;
 };
 
-/*
- * We keep one of these for each IPv4 capable device attached to a fw_card.
- * The list of them is stored in the fw_card structure rather than in the
- * fwnet_device because the remote IPv4 nodes may be probed before the card is,
- * so we need a place to store them before the fwnet_device structure is
- * allocated.
- */
-struct fwnet_peer {
-	struct list_head peer_link;
-	/* guid of the remote peer */
-	u64 guid;
-	/* FIFO address to transmit datagrams to, or FWNET_NO_FIFO_ADDR */
-	u64 fifo;
-
-	spinlock_t pdg_lock;	/* partial datagram lock		*/
-	/* List of partial datagrams received from this peer */
-	struct list_head pd_list;
-	/* Number of entries in pd_list at the moment */
-	unsigned pdg_size;
-
-	/* max payload to transmit to this remote peer */
-	/* This already includes the RFC2374_FRAG_HDR_SIZE overhead */
-	u16 max_payload;
-	/* outgoing datagram label */
-	u16 datagram_label;
-	/* Current node_id of the remote peer */
-	u16 node_id;
-	/* current generation of the remote peer */
-	u8 generation;
-	/* max speed that this peer can receive at */
-	u8 xmt_speed;
-};
+static DEFINE_MUTEX(fwnet_device_mutex);
+static LIST_HEAD(fwnet_device_list);
 
 struct fwnet_device {
+	struct list_head dev_link;
 	spinlock_t lock;
 	enum {
 		FWNET_BROADCAST_ERROR,
@@ -206,7 +179,26 @@ struct fwnet_device {
 	/* List of packets that have been sent but not yet acked */
 	struct list_head sent_list;
 
+	struct list_head peer_list;
 	struct fw_card *card;
+	struct net_device *netdev;
+};
+
+struct fwnet_peer {
+	struct list_head peer_link;
+	struct fwnet_device *dev;
+	u64 guid;
+	u64 fifo;
+
+	/* guarded by dev->lock */
+	struct list_head pd_list; /* received partial datagrams */
+	unsigned pdg_size;        /* pd_list size */
+
+	u16 datagram_label;       /* outgoing datagram label */
+	unsigned max_payload;     /* includes RFC2374_FRAG_HDR_SIZE overhead */
+	int node_id;
+	int generation;
+	unsigned speed;
 };
 
 /* This is our task struct. It's used for the packet complete callback.  */
@@ -479,102 +471,47 @@ static bool fwnet_pd_is_complete(struct fwnet_partial_datagram *pd)
 	return fi->len == pd->datagram_size;
 }
 
-static int fwnet_peer_new(struct fw_card *card, struct fw_device *device)
-{
-	struct fwnet_peer *peer;
-
-	peer = kmalloc(sizeof(*peer), GFP_KERNEL);
-	if (!peer) {
-		fw_error("out of memory\n");
-
-		return -ENOMEM;
-	}
-	peer->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
-	peer->fifo = FWNET_NO_FIFO_ADDR;
-	INIT_LIST_HEAD(&peer->pd_list);
-	spin_lock_init(&peer->pdg_lock);
-	peer->pdg_size = 0;
-	peer->generation = device->generation;
-	rmb();
-	peer->node_id = device->node_id;
-	 /* FIXME what should it really be? */
-	peer->max_payload = IEEE1394_MAX_PAYLOAD_S100 - RFC2374_UNFRAG_HDR_SIZE;
-	peer->datagram_label = 0U;
-	peer->xmt_speed = device->max_speed;
-	list_add_tail(&peer->peer_link, &card->peer_list);
-
-	return 0;
-}
-
-/* FIXME caller must take the lock, or peer needs to be reference-counted */
+/* caller must hold dev->lock */
 static struct fwnet_peer *fwnet_peer_find_by_guid(struct fwnet_device *dev,
 						  u64 guid)
 {
-	struct fwnet_peer *p, *peer = NULL;
-	unsigned long flags;
+	struct fwnet_peer *peer;
 
-	spin_lock_irqsave(&dev->lock, flags);
-	list_for_each_entry(p, &dev->card->peer_list, peer_link)
-		if (p->guid == guid) {
-			peer = p;
-			break;
-		}
-	spin_unlock_irqrestore(&dev->lock, flags);
+	list_for_each_entry(peer, &dev->peer_list, peer_link)
+		if (peer->guid == guid)
+			return peer;
 
-	return peer;
+	return NULL;
 }
 
-/* FIXME caller must take the lock, or peer needs to be reference-counted */
-/* FIXME node_id doesn't mean anything without generation */
+/* caller must hold dev->lock */
 static struct fwnet_peer *fwnet_peer_find_by_node_id(struct fwnet_device *dev,
-						     u16 node_id)
+						int node_id, int generation)
 {
-	struct fwnet_peer *p, *peer = NULL;
-	unsigned long flags;
+	struct fwnet_peer *peer;
 
-	spin_lock_irqsave(&dev->lock, flags);
-	list_for_each_entry(p, &dev->card->peer_list, peer_link)
-		if (p->node_id == node_id) {
-			peer = p;
-			break;
-		}
-	spin_unlock_irqrestore(&dev->lock, flags);
+	list_for_each_entry(peer, &dev->peer_list, peer_link)
+		if (peer->node_id    == node_id &&
+		    peer->generation == generation)
+			return peer;
 
-	return peer;
+	return NULL;
 }
 
-/* FIXME */
-static void fwnet_peer_delete(struct fw_card *card, struct fw_device *device)
+/* See IEEE 1394-2008 table 6-4, table 8-8, table 16-18. */
+static unsigned fwnet_max_payload(unsigned max_rec, unsigned speed)
 {
-	struct net_device *net;
-	struct fwnet_device *dev;
-	struct fwnet_peer *peer;
-	u64 guid;
-	unsigned long flags;
-	struct fwnet_partial_datagram *pd, *pd_next;
-
-	guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
-	net = card->netdev;
-	if (net)
-		dev = netdev_priv(net);
-	else
-		dev = NULL;
-	if (dev)
-		spin_lock_irqsave(&dev->lock, flags);
-
-	list_for_each_entry(peer, &card->peer_list, peer_link) {
-		if (peer->guid == guid) {
-			list_del(&peer->peer_link);
-			list_for_each_entry_safe(pd, pd_next, &peer->pd_list,
-						 pd_link)
-				fwnet_pd_delete(pd);
-			break;
-		}
+	max_rec = min(max_rec, speed + 8);
+	max_rec = min(max_rec, 0xbU); /* <= 4096 */
+	if (max_rec < 8) {
+		fw_notify("max_rec %x out of range\n", max_rec);
+		max_rec = 8;
 	}
-	if (dev)
-		spin_unlock_irqrestore(&dev->lock, flags);
+
+	return (1 << (max_rec + 1)) - RFC2374_FRAG_HDR_SIZE;
 }
 
+
 static int fwnet_finish_incoming_packet(struct net_device *net,
 					struct sk_buff *skb, u16 source_node_id,
 					bool is_broadcast, u16 ether_type)
@@ -606,71 +543,44 @@ static int fwnet_finish_incoming_packet(struct net_device *net,
 		unsigned char *arp_ptr;
 		u64 fifo_addr;
 		u64 peer_guid;
-		u8 max_rec;
-		u8 sspd;
+		unsigned sspd;
 		u16 max_payload;
 		struct fwnet_peer *peer;
-		static const u16 fwnet_speed_to_max_payload[] = {
-			/* S100, S200, S400, S800, S1600, S3200 */
-			    512, 1024, 2048, 4096,  4096,  4096
-		};
+		unsigned long flags;
+
+		arp1394   = (struct rfc2734_arp *)skb->data;
+		arp       = (struct arphdr *)skb->data;
+		arp_ptr   = (unsigned char *)(arp + 1);
+		peer_guid = get_unaligned_be64(&arp1394->s_uniq_id);
+		fifo_addr = (u64)get_unaligned_be16(&arp1394->fifo_hi) << 32
+				| get_unaligned_be32(&arp1394->fifo_lo);
 
-		arp1394 = (struct rfc2734_arp *)skb->data;
-		arp = (struct arphdr *)skb->data;
-		arp_ptr = (unsigned char *)(arp + 1);
-		fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32
-				| ntohl(arp1394->fifo_lo);
-		max_rec = dev->card->max_receive;
-		if (arp1394->max_rec < max_rec)
-			max_rec = arp1394->max_rec;
 		sspd = arp1394->sspd;
 		/* Sanity check.  OS X 10.3 PPC reportedly sends 131. */
 		if (sspd > SCODE_3200) {
 			fw_notify("sspd %x out of range\n", sspd);
-			sspd = 0;
+			sspd = SCODE_3200;
 		}
+		max_payload = fwnet_max_payload(arp1394->max_rec, sspd);
 
-		max_payload = min(fwnet_speed_to_max_payload[sspd],
-			(u16)(1 << (max_rec + 1))) - RFC2374_UNFRAG_HDR_SIZE;
-
-		peer_guid = get_unaligned_be64(&arp1394->s_uniq_id);
+		spin_lock_irqsave(&dev->lock, flags);
 		peer = fwnet_peer_find_by_guid(dev, peer_guid);
+		if (peer) {
+			peer->fifo = fifo_addr;
+
+			if (peer->speed > sspd)
+				peer->speed = sspd;
+			if (peer->max_payload > max_payload)
+				peer->max_payload = max_payload;
+		}
+		spin_unlock_irqrestore(&dev->lock, flags);
+
 		if (!peer) {
 			fw_notify("No peer for ARP packet from %016llx\n",
 				  (unsigned long long)peer_guid);
 			goto failed_proto;
 		}
 
-		/* FIXME don't use card->generation */
-		if (peer->node_id != source_node_id ||
-		    peer->generation != dev->card->generation) {
-			fw_notify("Internal error: peer->node_id (%x) != "
-				  "source_node_id (%x) or peer->generation (%x)"
-				  " != dev->card->generation(%x)\n",
-				  peer->node_id, source_node_id,
-				  peer->generation, dev->card->generation);
-			peer->node_id = source_node_id;
-			peer->generation = dev->card->generation;
-		}
-
-		/* FIXME: for debugging */
-		if (sspd > SCODE_400)
-			sspd = SCODE_400;
-		/* Update our speed/payload/fifo_offset table */
-		/*
-		 * FIXME: this does not handle cases where two high-speed endpoints must use a slower speed because of
-		 * a lower speed hub between them.  We need to look at the actual topology map here.
-		 */
-		peer->fifo = fifo_addr;
-		peer->max_payload = max_payload;
-		/*
-		 * Only allow speeds to go down from their initial value.
-		 * Otherwise a local peer that can only do S400 or slower may
-		 * be told to transmit at S800 to a faster remote peer.
-		 */
-		if (peer->xmt_speed > sspd)
-			peer->xmt_speed = sspd;
-
 		/*
 		 * Now that we're done with the 1394 specific stuff, we'll
 		 * need to alter some of the data.  Believe it or not, all
@@ -764,10 +674,11 @@ static int fwnet_finish_incoming_packet(struct net_device *net,
 }
 
 static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
-				 u16 source_node_id, bool is_broadcast)
+				 int source_node_id, int generation,
+				 bool is_broadcast)
 {
 	struct sk_buff *skb;
-	struct net_device *net;
+	struct net_device *net = dev->netdev;
 	struct rfc2734_header hdr;
 	unsigned lf;
 	unsigned long flags;
@@ -779,8 +690,6 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
 	int retval;
 	u16 ether_type;
 
-	net = dev->card->netdev;
-
 	hdr.w0 = be32_to_cpu(buf[0]);
 	lf = fwnet_get_hdr_lf(&hdr);
 	if (lf == RFC2374_HDR_UNFRAG) {
@@ -819,9 +728,12 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
 	}
 	datagram_label = fwnet_get_hdr_dgl(&hdr);
 	dg_size = fwnet_get_hdr_dg_size(&hdr); /* ??? + 1 */
-	peer = fwnet_peer_find_by_node_id(dev, source_node_id);
 
-	spin_lock_irqsave(&peer->pdg_lock, flags);
+	spin_lock_irqsave(&dev->lock, flags);
+
+	peer = fwnet_peer_find_by_node_id(dev, source_node_id, generation);
+	if (!peer)
+		goto bad_proto;
 
 	pd = fwnet_pd_find(peer, datagram_label);
 	if (pd == NULL) {
@@ -876,7 +788,7 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
 		skb = skb_get(pd->skb);
 		fwnet_pd_delete(pd);
 
-		spin_unlock_irqrestore(&peer->pdg_lock, flags);
+		spin_unlock_irqrestore(&dev->lock, flags);
 
 		return fwnet_finish_incoming_packet(net, skb, source_node_id,
 						    false, ether_type);
@@ -885,12 +797,12 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
 	 * Datagram is not complete, we're done for the
 	 * moment.
 	 */
-	spin_unlock_irqrestore(&peer->pdg_lock, flags);
+	spin_unlock_irqrestore(&dev->lock, flags);
 
 	return 0;
 
  bad_proto:
-	spin_unlock_irqrestore(&peer->pdg_lock, flags);
+	spin_unlock_irqrestore(&dev->lock, flags);
 
 	if (netif_queue_stopped(net))
 		netif_wake_queue(net);
@@ -916,7 +828,8 @@ static void fwnet_receive_packet(struct fw_card *card, struct fw_request *r,
 		return;
 	}
 
-	status = fwnet_incoming_packet(dev, payload, length, source, false);
+	status = fwnet_incoming_packet(dev, payload, length,
+				       source, generation, false);
 	if (status != 0) {
 		fw_error("Incoming packet failure\n");
 		fw_send_response(card, r, RCODE_CONFLICT_ERROR);
@@ -966,7 +879,7 @@ static void fwnet_receive_broadcast(struct fw_iso_context *context,
 		buf_ptr += 2;
 		length -= IEEE1394_GASP_HDR_SIZE;
 		fwnet_incoming_packet(dev, buf_ptr, length,
-				      source_node_id, true);
+				      source_node_id, -1, true);
 	}
 
 	packet.payload_length = dev->rcv_buffer_size;
@@ -1073,7 +986,6 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask)
 	unsigned tx_len;
 	struct rfc2734_header *bufhdr;
 	unsigned long flags;
-	struct net_device *net;
 
 	dev = ptask->dev;
 	tx_len = ptask->max_payload;
@@ -1137,8 +1049,7 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask)
 	list_add_tail(&ptask->pt_link, &dev->sent_list);
 	spin_unlock_irqrestore(&dev->lock, flags);
 
-	net = dev->card->netdev;
-	net->trans_start = jiffies;
+	dev->netdev->trans_start = jiffies;
 
 	return 0;
 }
@@ -1294,7 +1205,8 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net)
 	u16 dg_size;
 	u16 *datagram_label_ptr;
 	struct fwnet_packet_task *ptask;
-	struct fwnet_peer *peer = NULL;
+	struct fwnet_peer *peer;
+	unsigned long flags;
 
 	ptask = kmem_cache_alloc(fwnet_packet_task_cache, GFP_ATOMIC);
 	if (ptask == NULL)
@@ -1314,6 +1226,9 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net)
 	proto = hdr_buf.h_proto;
 	dg_size = skb->len;
 
+	/* serialize access to peer, including peer->datagram_label */
+	spin_lock_irqsave(&dev->lock, flags);
+
 	/*
 	 * Set the transmission type for the packet.  ARP packets and IP
 	 * broadcast packets are sent via GASP.
@@ -1322,35 +1237,30 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net)
 	    || proto == htons(ETH_P_ARP)
 	    || (proto == htons(ETH_P_IP)
 		&& IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)))) {
-		max_payload = dev->broadcast_xmt_max_payload;
+		max_payload        = dev->broadcast_xmt_max_payload;
 		datagram_label_ptr = &dev->broadcast_xmt_datagramlabel;
 
-		ptask->fifo_addr = FWNET_NO_FIFO_ADDR;
-		ptask->generation = 0;
-		ptask->dest_node = IEEE1394_ALL_NODES;
-		ptask->speed = SCODE_100;
+		ptask->fifo_addr   = FWNET_NO_FIFO_ADDR;
+		ptask->generation  = 0;
+		ptask->dest_node   = IEEE1394_ALL_NODES;
+		ptask->speed       = SCODE_100;
 	} else {
 		__be64 guid = get_unaligned((__be64 *)hdr_buf.h_dest);
 		u8 generation;
 
 		peer = fwnet_peer_find_by_guid(dev, be64_to_cpu(guid));
-		if (!peer)
-			goto fail;
-
-		if (peer->fifo == FWNET_NO_FIFO_ADDR)
-			goto fail;
+		if (!peer || peer->fifo == FWNET_NO_FIFO_ADDR)
+			goto fail_unlock;
 
-		generation = peer->generation;
-		smp_rmb();
-		dest_node = peer->node_id;
-
-		max_payload = peer->max_payload;
+		generation         = peer->generation;
+		dest_node          = peer->node_id;
+		max_payload        = peer->max_payload;
 		datagram_label_ptr = &peer->datagram_label;
 
-		ptask->fifo_addr = peer->fifo;
-		ptask->generation = generation;
-		ptask->dest_node = dest_node;
-		ptask->speed = peer->xmt_speed;
+		ptask->fifo_addr   = peer->fifo;
+		ptask->generation  = generation;
+		ptask->dest_node   = dest_node;
+		ptask->speed       = peer->speed;
 	}
 
 	/* If this is an ARP packet, convert it */
@@ -1393,11 +1303,16 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net)
 		ptask->outstanding_pkts = DIV_ROUND_UP(dg_size, max_payload);
 		max_payload += RFC2374_FRAG_HDR_SIZE;
 	}
+
+	spin_unlock_irqrestore(&dev->lock, flags);
+
 	ptask->max_payload = max_payload;
 	fwnet_send_packet(ptask);
 
 	return NETDEV_TX_OK;
 
+ fail_unlock:
+	spin_unlock_irqrestore(&dev->lock, flags);
  fail:
 	if (ptask)
 		kmem_cache_free(fwnet_packet_task_cache, ptask);
@@ -1467,7 +1382,48 @@ static void fwnet_init_dev(struct net_device *net)
 	SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops);
 }
 
-/* FIXME create netdev upon first fw_unit of a card, not upon local fw_unit */
+/* caller must hold fwnet_device_mutex */
+static struct fwnet_device *fwnet_dev_find(struct fw_card *card)
+{
+	struct fwnet_device *dev;
+
+	list_for_each_entry(dev, &fwnet_device_list, dev_link)
+		if (dev->card == card)
+			return dev;
+
+	return NULL;
+}
+
+static int fwnet_add_peer(struct fwnet_device *dev,
+			  struct fw_unit *unit, struct fw_device *device)
+{
+	struct fwnet_peer *peer;
+
+	peer = kmalloc(sizeof(*peer), GFP_KERNEL);
+	if (!peer)
+		return -ENOMEM;
+
+	unit->device.driver_data = peer;
+	peer->dev = dev;
+	peer->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+	peer->fifo = FWNET_NO_FIFO_ADDR;
+	INIT_LIST_HEAD(&peer->pd_list);
+	peer->pdg_size = 0;
+	peer->datagram_label = 0;
+	peer->speed = device->max_speed;
+	peer->max_payload = fwnet_max_payload(device->max_rec, peer->speed);
+
+	peer->generation = device->generation;
+	smp_rmb();
+	peer->node_id = device->node_id;
+
+	spin_lock_irq(&dev->lock);
+	list_add_tail(&peer->peer_link, &dev->peer_list);
+	spin_unlock_irq(&dev->lock);
+
+	return 0;
+}
+
 static int fwnet_probe(struct device *_dev)
 {
 	struct fw_unit *unit = fw_unit(_dev);
@@ -1476,16 +1432,22 @@ static int fwnet_probe(struct device *_dev)
 	struct net_device *net;
 	struct fwnet_device *dev;
 	unsigned max_mtu;
+	bool new_netdev;
+	int ret;
 
-	if (!device->is_local) {
-		int added;
+	mutex_lock(&fwnet_device_mutex);
 
-		added = fwnet_peer_new(card, device);
-		return added;
+	dev = fwnet_dev_find(card);
+	if (dev) {
+		new_netdev = false;
+		net = dev->netdev;
+		goto have_dev;
 	}
+
+	new_netdev = true;
 	net = alloc_netdev(sizeof(*dev), "firewire%d", fwnet_init_dev);
 	if (net == NULL) {
-		fw_error("out of memory\n");
+		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -1500,12 +1462,13 @@ static int fwnet_probe(struct device *_dev)
 
 	dev->local_fifo = FWNET_NO_FIFO_ADDR;
 
-	/* INIT_WORK(&dev->wake, fwnet_handle_queue);*/
 	INIT_LIST_HEAD(&dev->packet_list);
 	INIT_LIST_HEAD(&dev->broadcasted_list);
 	INIT_LIST_HEAD(&dev->sent_list);
+	INIT_LIST_HEAD(&dev->peer_list);
 
 	dev->card = card;
+	dev->netdev = net;
 
 	/*
 	 * Use the RFC 2734 default 1500 octets or the maximum payload
@@ -1518,43 +1481,57 @@ static int fwnet_probe(struct device *_dev)
 	/* Set our hardware address while we're at it */
 	put_unaligned_be64(card->guid, net->dev_addr);
 	put_unaligned_be64(~0ULL, net->broadcast);
-	if (register_netdev(net)) {
+	ret = register_netdev(net);
+	if (ret) {
 		fw_error("Cannot register the driver\n");
 		goto out;
 	}
 
+	list_add_tail(&dev->dev_link, &fwnet_device_list);
 	fw_notify("%s: IPv4 over FireWire on device %016llx\n",
 		  net->name, (unsigned long long)card->guid);
-	card->netdev = net;
-
-	return 0;
+ have_dev:
+	ret = fwnet_add_peer(dev, unit, device);
+	if (ret && new_netdev) {
+		unregister_netdev(net);
+		list_del(&dev->dev_link);
+	}
  out:
-	if (net)
+	if (ret && new_netdev)
 		free_netdev(net);
 
-	return -ENOENT;
+	mutex_unlock(&fwnet_device_mutex);
+
+	return ret;
+}
+
+static void fwnet_remove_peer(struct fwnet_peer *peer)
+{
+	struct fwnet_partial_datagram *pd, *pd_next;
+
+	spin_lock_irq(&peer->dev->lock);
+	list_del(&peer->peer_link);
+	spin_unlock_irq(&peer->dev->lock);
+
+	list_for_each_entry_safe(pd, pd_next, &peer->pd_list, pd_link)
+		fwnet_pd_delete(pd);
+
+	kfree(peer);
 }
 
 static int fwnet_remove(struct device *_dev)
 {
-	struct fw_unit *unit = fw_unit(_dev);
-	struct fw_device *device = fw_parent_device(unit);
-	struct fw_card *card = device->card;
+	struct fwnet_peer *peer = _dev->driver_data;
+	struct fwnet_device *dev = peer->dev;
 	struct net_device *net;
-	struct fwnet_device *dev;
-	struct fwnet_peer *peer;
-	struct fwnet_partial_datagram *pd, *pd_next;
 	struct fwnet_packet_task *ptask, *pt_next;
 
-	if (!device->is_local) {
-		fwnet_peer_delete(card, device);
+	mutex_lock(&fwnet_device_mutex);
 
-		return 0;
-	}
+	fwnet_remove_peer(peer);
 
-	net = card->netdev;
-	if (net) {
-		dev = netdev_priv(net);
+	if (list_empty(&dev->peer_list)) {
+		net = dev->netdev;
 		unregister_netdev(net);
 
 		if (dev->local_fifo != FWNET_NO_FIFO_ADDR)
@@ -1580,19 +1557,11 @@ static int fwnet_remove(struct device *_dev)
 			dev_kfree_skb_any(ptask->skb);
 			kmem_cache_free(fwnet_packet_task_cache, ptask);
 		}
-		list_for_each_entry(peer, &card->peer_list, peer_link) {
-			if (peer->pdg_size) {
-				list_for_each_entry_safe(pd, pd_next,
-						&peer->pd_list, pd_link)
-					fwnet_pd_delete(pd);
-				peer->pdg_size = 0;
-			}
-			peer->fifo = FWNET_NO_FIFO_ADDR;
-		}
 		free_netdev(net);
-		card->netdev = NULL;
 	}
 
+	mutex_unlock(&fwnet_device_mutex);
+
 	return 0;
 }
 
@@ -1603,24 +1572,15 @@ static int fwnet_remove(struct device *_dev)
 static void fwnet_update(struct fw_unit *unit)
 {
 	struct fw_device *device = fw_parent_device(unit);
-	struct net_device *net = device->card->netdev;
-	struct fwnet_device *dev;
-	struct fwnet_peer *peer;
-	u64 guid;
+	struct fwnet_peer *peer = unit->device.driver_data;
+	int generation;
 
-	if (net && !device->is_local) {
-		dev = netdev_priv(net);
-		guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
-		peer = fwnet_peer_find_by_guid(dev, guid);
-		if (!peer) {
-			fw_error("fwnet_update: no peer for device %016llx\n",
-				 (unsigned long long)guid);
-			return;
-		}
-		peer->generation = device->generation;
-		rmb();
-		peer->node_id = device->node_id;
-	}
+	generation = device->generation;
+
+	spin_lock_irq(&peer->dev->lock);
+	peer->node_id    = device->node_id;
+	peer->generation = generation;
+	spin_unlock_irq(&peer->dev->lock);
 }
 
 static const struct ieee1394_device_id fwnet_id_table[] = {
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 5cb0c1549ff1..9823946adbc5 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -131,10 +131,6 @@ struct fw_card {
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
 	u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
-
-	/* firewire-net driver data */
-	void *netdev;
-	struct list_head peer_list;
 };
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
-- 
cgit v1.2.3-71-gd317


From 6c18ba9f5e506b8115b89b1aa7bdc25178f40b0a Mon Sep 17 00:00:00 2001
From: Alexandros Batsakis <Alexandros.Batsakis@netapp.com>
Date: Tue, 16 Jun 2009 04:19:13 +0300
Subject: nfsd41: move channel attributes from nfsd4_session to a
 nfsd4_channel_attr struct

The change is valid for both the forechannel and the backchannel (currently a dummy).

Signed-off-by: Alexandros Batsakis <Alexandros.Batsakis@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c        | 28 +++++++++++++++-------------
 fs/nfsd/nfs4xdr.c          |  2 +-
 include/linux/nfsd/state.h | 18 +++++++++++++-----
 include/linux/nfsd/xdr4.h  | 11 -----------
 4 files changed, 29 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 89d9ac55c034..d5caf2a709d2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -444,8 +444,8 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
  * fchan holds the client values on input, and the server values on output
  */
 static int init_forechannel_attrs(struct svc_rqst *rqstp,
-				    struct nfsd4_session *session,
-				    struct nfsd4_channel_attrs *fchan)
+				  struct nfsd4_channel_attrs *session_fchan,
+				  struct nfsd4_channel_attrs *fchan)
 {
 	int status = 0;
 	__u32   maxcount = svc_max_payload(rqstp);
@@ -455,21 +455,21 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
 	/* Use the client's max request and max response size if possible */
 	if (fchan->maxreq_sz > maxcount)
 		fchan->maxreq_sz = maxcount;
-	session->se_fmaxreq_sz = fchan->maxreq_sz;
+	session_fchan->maxreq_sz = fchan->maxreq_sz;
 
 	if (fchan->maxresp_sz > maxcount)
 		fchan->maxresp_sz = maxcount;
-	session->se_fmaxresp_sz = fchan->maxresp_sz;
+	session_fchan->maxresp_sz = fchan->maxresp_sz;
 
 	/* Set the max response cached size our default which is
 	 * a multiple of PAGE_SIZE and small */
-	session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
-	fchan->maxresp_cached = session->se_fmaxresp_cached;
+	session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+	fchan->maxresp_cached = session_fchan->maxresp_cached;
 
 	/* Use the client's maxops if possible */
 	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
 		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
-	session->se_fmaxops = fchan->maxops;
+	session_fchan->maxops = fchan->maxops;
 
 	/* try to use the client requested number of slots */
 	if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
@@ -481,7 +481,7 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
 	 */
 	status = set_forechannel_maxreqs(fchan);
 
-	session->se_fnumslots = fchan->maxreqs;
+	session_fchan->maxreqs = fchan->maxreqs;
 	return status;
 }
 
@@ -495,12 +495,14 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 	memset(&tmp, 0, sizeof(tmp));
 
 	/* FIXME: For now, we just accept the client back channel attributes. */
-	status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+	tmp.se_bchannel = cses->back_channel;
+	status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
+					&cses->fore_channel);
 	if (status)
 		goto out;
 
 	/* allocate struct nfsd4_session and slot table in one piece */
-	slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot);
 	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
 	if (!new)
 		goto out;
@@ -574,7 +576,7 @@ free_session(struct kref *kref)
 	int i;
 
 	ses = container_of(kref, struct nfsd4_session, se_ref);
-	for (i = 0; i < ses->se_fnumslots; i++) {
+	for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
 		struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
 		nfsd4_release_respages(e->ce_respages, e->ce_resused);
 	}
@@ -1130,7 +1132,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 	 * is sent (lease renewal).
 	 */
 	if (seq && nfsd4_not_cached(resp)) {
-		seq->maxslots = resp->cstate.session->se_fnumslots;
+		seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
 		return nfs_ok;
 	}
 
@@ -1473,7 +1475,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 		goto out;
 
 	status = nfserr_badslot;
-	if (seq->slotid >= session->se_fnumslots)
+	if (seq->slotid >= session->se_fchannel.maxreqs)
 		goto out;
 
 	slot = &session->se_slots[seq->slotid];
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d07f704a2ac9..2dcc7feaa6ff 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3183,7 +3183,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
 	dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
 		length, xb->page_len, tlen, pad);
 
-	if (length <= session->se_fmaxresp_cached)
+	if (length <= session->se_fchannel.maxresp_cached)
 		return status;
 	else
 		return nfserr_rep_too_big_to_cache;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index c0c49215ddc5..105cc100de05 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -115,6 +115,17 @@ struct nfsd4_slot {
 	struct nfsd4_cache_entry	sl_cache_entry;
 };
 
+struct nfsd4_channel_attrs {
+	u32		headerpadsz;
+	u32		maxreq_sz;
+	u32		maxresp_sz;
+	u32		maxresp_cached;
+	u32		maxops;
+	u32		maxreqs;
+	u32		nr_rdma_attrs;
+	u32		rdma_attrs;
+};
+
 struct nfsd4_session {
 	struct kref		se_ref;
 	struct list_head	se_hash;	/* hash by sessionid */
@@ -122,11 +133,8 @@ struct nfsd4_session {
 	u32			se_flags;
 	struct nfs4_client	*se_client;	/* for expire_client */
 	struct nfs4_sessionid	se_sessionid;
-	u32			se_fmaxreq_sz;
-	u32			se_fmaxresp_sz;
-	u32			se_fmaxresp_cached;
-	u32			se_fmaxops;
-	u32			se_fnumslots;
+	struct nfsd4_channel_attrs se_fchannel;
+	struct nfsd4_channel_attrs se_bchannel;
 	struct nfsd4_slot	se_slots[];	/* forward channel slots */
 };
 
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index d0f050f01eca..2bacf7535069 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -366,17 +366,6 @@ struct nfsd4_exchange_id {
 	int		spa_how;
 };
 
-struct nfsd4_channel_attrs {
-	u32		headerpadsz;
-	u32		maxreq_sz;
-	u32		maxresp_sz;
-	u32		maxresp_cached;
-	u32		maxops;
-	u32		maxreqs;
-	u32		nr_rdma_attrs;
-	u32		rdma_attrs;
-};
-
 struct nfsd4_create_session {
 	clientid_t		clientid;
 	struct nfs4_sessionid	sessionid;
-- 
cgit v1.2.3-71-gd317


From 84845c070ce3ac4d3bd2c148fa20ba8ce5409167 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 26 May 2009 16:05:06 +0900
Subject: PCI: use pci_is_root_bus() in acpi_pci_get_bridge_handle()

Use pci_is_root_bus() in acpi_pci_get_bridge_handle() to check if the
pci bus is root, for code consistency.

Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Reviewed-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci-acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 092e82e0048c..df67c78dfe24 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -23,7 +23,7 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 
 static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus)
 {
-	if (pbus->parent)
+	if (!pci_is_root_bus(pbus))
 		return DEVICE_ACPI_HANDLE(&(pbus->self->dev));
 	return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus),
 					      pbus->number);
-- 
cgit v1.2.3-71-gd317


From a222b8f83b995e9c6fe2aff2a8125facb49f658e Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 26 May 2009 16:05:33 +0900
Subject: PCI: use pci_is_root_bus() in acpi_find_root_bridge_handle()

Use pci_is_root_bus() in acpi_find_root_bridge_handle() to check if
the pci bus is root, for code consistency.

Reviewed-by: Alex Chiang <achiang@hp.com>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci-acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index df67c78dfe24..93a7c08f869d 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -15,7 +15,7 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 {
 	struct pci_bus *pbus = pdev->bus;
 	/* Find a PCI root bus */
-	while (pbus->parent)
+	while (!pci_is_root_bus(pbus))
 		pbus = pbus->parent;
 	return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus),
 					      pbus->number);
-- 
cgit v1.2.3-71-gd317


From a72b46c3849cdb05993015991bde548ab8b6d7ac Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 24 Apr 2009 10:45:17 +0800
Subject: PCI: Add pci_bus_set_ops

pci_bus_set_ops changes pci_ops associated with a pci_bus. This can be
used by debug tools such as PCIE AER error injection to fake some PCI
configuration registers.

Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/access.c | 19 +++++++++++++++++++
 include/linux/pci.h  |  1 +
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 0f3706512686..db23200c4874 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -66,6 +66,25 @@ EXPORT_SYMBOL(pci_bus_write_config_byte);
 EXPORT_SYMBOL(pci_bus_write_config_word);
 EXPORT_SYMBOL(pci_bus_write_config_dword);
 
+/**
+ * pci_bus_set_ops - Set raw operations of pci bus
+ * @bus:	pci bus struct
+ * @ops:	new raw operations
+ *
+ * Return previous raw operations
+ */
+struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops)
+{
+	struct pci_ops *old_ops;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_lock, flags);
+	old_ops = bus->ops;
+	bus->ops = ops;
+	spin_unlock_irqrestore(&pci_lock, flags);
+	return old_ops;
+}
+EXPORT_SYMBOL(pci_bus_set_ops);
 
 /**
  * pci_read_vpd - Read one entry from Vital Product Data
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ec03b90d3510..ea2a153a9126 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -637,6 +637,7 @@ int pci_bus_write_config_word(struct pci_bus *bus, unsigned int devfn,
 			      int where, u16 val);
 int pci_bus_write_config_dword(struct pci_bus *bus, unsigned int devfn,
 			       int where, u32 val);
+struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops);
 
 static inline int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val)
 {
-- 
cgit v1.2.3-71-gd317


From bd3d99c17039fd05a29587db3f4a180c48da115a Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 2 Jun 2009 13:52:26 +0900
Subject: PCI: Remove untested Electromechanical Interlock (EMI) support in
 pciehp.

The EMI support in pciehp is obviously broken. It is implemented using
struct hotplug_slot_attribute, but the sysfs_ops for pci_slot_ktype
operate on struct pci_slot_attribute, not struct hotplug_slot_attribute.
This bug has been there for a long time; it was probably introduced
together with the PCI slot framework. It has most likely never caused
any visible problem because the EMI support was not tested at all, for
lack of a test environment.

As described above, the EMI support in pciehp seems not to be tested
at all. So this patch removes EMI support from pciehp, instead of
fixing the bug.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pciehp.h      |   3 --
 drivers/pci/hotplug/pciehp_core.c | 111 +-------------------------------------
 drivers/pci/hotplug/pciehp_hpc.c  |  31 -----------
 include/linux/pci_hotplug.h       |   8 ---
 4 files changed, 1 insertion(+), 152 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 0a368547e633..e6cf096498be 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -81,7 +81,6 @@ struct slot {
 	struct hpc_ops *hpc_ops;
 	struct hotplug_slot *hotplug_slot;
 	struct list_head	slot_list;
-	unsigned long last_emi_toggle;
 	struct delayed_work work;	/* work for button event */
 	struct mutex lock;
 };
@@ -203,8 +202,6 @@ struct hpc_ops {
 	int (*set_attention_status)(struct slot *slot, u8 status);
 	int (*get_latch_status)(struct slot *slot, u8 *status);
 	int (*get_adapter_status)(struct slot *slot, u8 *status);
-	int (*get_emi_status)(struct slot *slot, u8 *status);
-	int (*toggle_emi)(struct slot *slot);
 	int (*get_max_bus_speed)(struct slot *slot, enum pci_bus_speed *speed);
 	int (*get_cur_bus_speed)(struct slot *slot, enum pci_bus_speed *speed);
 	int (*get_max_lnk_width)(struct slot *slot, enum pcie_link_width *val);
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index fb254b2454de..eb183d1d0912 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -85,99 +85,6 @@ static struct hotplug_slot_ops pciehp_hotplug_slot_ops = {
   	.get_cur_bus_speed =	get_cur_bus_speed,
 };
 
-/*
- * Check the status of the Electro Mechanical Interlock (EMI)
- */
-static int get_lock_status(struct hotplug_slot *hotplug_slot, u8 *value)
-{
-	struct slot *slot = hotplug_slot->private;
-	return (slot->hpc_ops->get_emi_status(slot, value));
-}
-
-/*
- * sysfs interface for the Electro Mechanical Interlock (EMI)
- * 1 == locked, 0 == unlocked
- */
-static ssize_t lock_read_file(struct hotplug_slot *slot, char *buf)
-{
-	int retval;
-	u8 value;
-
-	retval = get_lock_status(slot, &value);
-	if (retval)
-		goto lock_read_exit;
-	retval = sprintf (buf, "%d\n", value);
-
-lock_read_exit:
-	return retval;
-}
-
-/*
- * Change the status of the Electro Mechanical Interlock (EMI)
- * This is a toggle - in addition there must be at least 1 second
- * in between toggles.
- */
-static int set_lock_status(struct hotplug_slot *hotplug_slot, u8 status)
-{
-	struct slot *slot = hotplug_slot->private;
-	int retval;
-	u8 value;
-
-	mutex_lock(&slot->ctrl->crit_sect);
-
-	/* has it been >1 sec since our last toggle? */
-	if ((get_seconds() - slot->last_emi_toggle) < 1) {
-		mutex_unlock(&slot->ctrl->crit_sect);
-		return -EINVAL;
-	}
-
-	/* see what our current state is */
-	retval = get_lock_status(hotplug_slot, &value);
-	if (retval || (value == status))
-		goto set_lock_exit;
-
-	slot->hpc_ops->toggle_emi(slot);
-set_lock_exit:
-	mutex_unlock(&slot->ctrl->crit_sect);
-	return 0;
-}
-
-/*
- * sysfs interface which allows the user to toggle the Electro Mechanical
- * Interlock.  Valid values are either 0 or 1.  0 == unlock, 1 == lock
- */
-static ssize_t lock_write_file(struct hotplug_slot *hotplug_slot,
-		const char *buf, size_t count)
-{
-	struct slot *slot = hotplug_slot->private;
-	unsigned long llock;
-	u8 lock;
-	int retval = 0;
-
-	llock = simple_strtoul(buf, NULL, 10);
-	lock = (u8)(llock & 0xff);
-
-	switch (lock) {
-		case 0:
-		case 1:
-			retval = set_lock_status(hotplug_slot, lock);
-			break;
-		default:
-			ctrl_err(slot->ctrl, "%d is an invalid lock value\n",
-				 lock);
-			retval = -EINVAL;
-	}
-	if (retval)
-		return retval;
-	return count;
-}
-
-static struct hotplug_slot_attribute hotplug_slot_attr_lock = {
-	.attr = {.name = "lock", .mode = S_IFREG | S_IRUGO | S_IWUSR},
-	.show = lock_read_file,
-	.store = lock_write_file
-};
-
 /**
  * release_slot - free up the memory used by a slot
  * @hotplug_slot: slot to free
@@ -236,17 +143,6 @@ static int init_slots(struct controller *ctrl)
 		get_attention_status(hotplug_slot, &info->attention_status);
 		get_latch_status(hotplug_slot, &info->latch_status);
 		get_adapter_status(hotplug_slot, &info->adapter_status);
-		/* create additional sysfs entries */
-		if (EMI(ctrl)) {
-			retval = sysfs_create_file(&hotplug_slot->pci_slot->kobj,
-				&hotplug_slot_attr_lock.attr);
-			if (retval) {
-				pci_hp_deregister(hotplug_slot);
-				ctrl_err(ctrl, "Cannot create additional sysfs "
-					 "entries\n");
-				goto error_info;
-			}
-		}
 	}
 
 	return 0;
@@ -261,13 +157,8 @@ error:
 static void cleanup_slots(struct controller *ctrl)
 {
 	struct slot *slot;
-
-	list_for_each_entry(slot, &ctrl->slot_list, slot_list) {
-		if (EMI(ctrl))
-			sysfs_remove_file(&slot->hotplug_slot->pci_slot->kobj,
-				&hotplug_slot_attr_lock.attr);
+	list_for_each_entry(slot, &ctrl->slot_list, slot_list)
 		pci_hp_deregister(slot->hotplug_slot);
-	}
 }
 
 /*
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 07bd32151146..52813257e5bf 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -422,35 +422,6 @@ static int hpc_query_power_fault(struct slot *slot)
 	return !!(slot_status & PCI_EXP_SLTSTA_PFD);
 }
 
-static int hpc_get_emi_status(struct slot *slot, u8 *status)
-{
-	struct controller *ctrl = slot->ctrl;
-	u16 slot_status;
-	int retval;
-
-	retval = pciehp_readw(ctrl, PCI_EXP_SLTSTA, &slot_status);
-	if (retval) {
-		ctrl_err(ctrl, "Cannot check EMI status\n");
-		return retval;
-	}
-	*status = !!(slot_status & PCI_EXP_SLTSTA_EIS);
-	return retval;
-}
-
-static int hpc_toggle_emi(struct slot *slot)
-{
-	u16 slot_cmd;
-	u16 cmd_mask;
-	int rc;
-
-	slot_cmd = PCI_EXP_SLTCTL_EIC;
-	cmd_mask = PCI_EXP_SLTCTL_EIC;
-	rc = pcie_write_cmd(slot->ctrl, slot_cmd, cmd_mask);
-	slot->last_emi_toggle = get_seconds();
-
-	return rc;
-}
-
 static int hpc_set_attention_status(struct slot *slot, u8 value)
 {
 	struct controller *ctrl = slot->ctrl;
@@ -874,8 +845,6 @@ static struct hpc_ops pciehp_hpc_ops = {
 	.get_attention_status		= hpc_get_attention_status,
 	.get_latch_status		= hpc_get_latch_status,
 	.get_adapter_status		= hpc_get_adapter_status,
-	.get_emi_status			= hpc_get_emi_status,
-	.toggle_emi			= hpc_toggle_emi,
 
 	.get_max_bus_speed		= hpc_get_max_lnk_speed,
 	.get_cur_bus_speed		= hpc_get_cur_lnk_speed,
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 20998746518e..11936fd0b56d 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -66,14 +66,6 @@ enum pcie_link_speed {
 	PCIE_LNK_SPEED_UNKNOWN	= 0xFF,
 };
 
-struct hotplug_slot;
-struct hotplug_slot_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct hotplug_slot *, char *);
-	ssize_t (*store)(struct hotplug_slot *, const char *, size_t);
-};
-#define to_hotplug_attr(n) container_of(n, struct hotplug_slot_attribute, attr);
-
 /**
  * struct hotplug_slot_ops -the callbacks that the hotplug pci core can use
  * @owner: The module owner of this structure
-- 
cgit v1.2.3-71-gd317


From c825bc94c8c1908750ab20413eb639c6be029e2d Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 16 Jun 2009 11:01:25 +0900
Subject: PCI hotplug: create symlink to hotplug driver module

Create a symbolic link to the hotplug driver module in the PCI slot
directory (/sys/bus/pci/slots/<SLOT#>). In the past, we needed to load
hotplug drivers one by one to identify the hotplug driver that handles
the slot, which was very inconvenient, especially for troubleshooting.
With this change, we can easily identify the hotplug driver.

Signed-off-by: Taku Izumi <izumi.taku@jp.fujitsu.com>
Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Reviewed-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci |  7 ++++++
 drivers/pci/hotplug/pci_hotplug_core.c  | 23 +++++++++++++------
 drivers/pci/slot.c                      | 39 +++++++++++++++++++++++++++++++++
 include/linux/pci.h                     |  5 +++++
 include/linux/pci_hotplug.h             | 15 +++++++++++--
 5 files changed, 80 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 97ad190e13af..6bf68053e4b8 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -122,3 +122,10 @@ Description:
 		This symbolic link appears when a device is a Virtual Function.
 		The symbolic link points to the PCI device sysfs entry of the
 		Physical Function this device associates with.
+
+What:		/sys/bus/pci/slots/.../module
+Date:		June 2009
+Contact:	linux-pci@vger.kernel.org
+Description:
+		This symbolic link points to the PCI hotplug controller driver
+		module that manages the hotplug slot.
diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c
index ff32c6b4ae13..844580489d4d 100644
--- a/drivers/pci/hotplug/pci_hotplug_core.c
+++ b/drivers/pci/hotplug/pci_hotplug_core.c
@@ -424,6 +424,9 @@ static int fs_add_slot(struct pci_slot *slot)
 {
 	int retval = 0;
 
+	/* Create symbolic link to the hotplug driver module */
+	pci_hp_create_module_link(slot);
+
 	if (has_power_file(slot)) {
 		retval = sysfs_create_file(&slot->kobj,
 					   &hotplug_slot_attr_power.attr);
@@ -498,6 +501,7 @@ exit_attention:
 	if (has_power_file(slot))
 		sysfs_remove_file(&slot->kobj, &hotplug_slot_attr_power.attr);
 exit_power:
+	pci_hp_remove_module_link(slot);
 exit:
 	return retval;
 }
@@ -528,6 +532,8 @@ static void fs_remove_slot(struct pci_slot *slot)
 
 	if (has_test_file(slot))
 		sysfs_remove_file(&slot->kobj, &hotplug_slot_attr_test.attr);
+
+	pci_hp_remove_module_link(slot);
 }
 
 static struct hotplug_slot *get_slot_from_name (const char *name)
@@ -544,10 +550,10 @@ static struct hotplug_slot *get_slot_from_name (const char *name)
 }
 
 /**
- * pci_hp_register - register a hotplug_slot with the PCI hotplug subsystem
+ * __pci_hp_register - register a hotplug_slot with the PCI hotplug subsystem
  * @bus: bus this slot is on
  * @slot: pointer to the &struct hotplug_slot to register
- * @slot_nr: slot number
+ * @devnr: device number
  * @name: name registered with kobject core
  *
  * Registers a hotplug slot with the pci hotplug subsystem, which will allow
@@ -555,8 +561,9 @@ static struct hotplug_slot *get_slot_from_name (const char *name)
  *
  * Returns 0 if successful, anything else for an error.
  */
-int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr,
-			const char *name)
+int __pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus,
+		      int devnr, const char *name,
+		      struct module *owner, const char *mod_name)
 {
 	int result;
 	struct pci_slot *pci_slot;
@@ -571,14 +578,16 @@ int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr,
 		return -EINVAL;
 	}
 
-	mutex_lock(&pci_hp_mutex);
+	slot->ops->owner = owner;
+	slot->ops->mod_name = mod_name;
 
+	mutex_lock(&pci_hp_mutex);
 	/*
 	 * No problems if we call this interface from both ACPI_PCI_SLOT
 	 * driver and call it here again. If we've already created the
 	 * pci_slot, the interface will simply bump the refcount.
 	 */
-	pci_slot = pci_create_slot(bus, slot_nr, name, slot);
+	pci_slot = pci_create_slot(bus, devnr, name, slot);
 	if (IS_ERR(pci_slot)) {
 		result = PTR_ERR(pci_slot);
 		goto out;
@@ -688,6 +697,6 @@ MODULE_LICENSE("GPL");
 module_param(debug, bool, 0644);
 MODULE_PARM_DESC(debug, "Debugging mode enabled or not");
 
-EXPORT_SYMBOL_GPL(pci_hp_register);
+EXPORT_SYMBOL_GPL(__pci_hp_register);
 EXPORT_SYMBOL_GPL(pci_hp_deregister);
 EXPORT_SYMBOL_GPL(pci_hp_change_slot_info);
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index fe95ce20bcbd..eddb0748b0ea 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -307,6 +307,45 @@ void pci_destroy_slot(struct pci_slot *slot)
 }
 EXPORT_SYMBOL_GPL(pci_destroy_slot);
 
+#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
+#include <linux/pci_hotplug.h>
+/**
+ * pci_hp_create_module_link - create symbolic link to hotplug driver module
+ * @pci_slot: struct pci_slot
+ *
+ * Helper function for pci_hotplug_core.c to create symbolic link to
+ * the hotplug driver module.
+ */
+void pci_hp_create_module_link(struct pci_slot *pci_slot)
+{
+	struct hotplug_slot *slot = pci_slot->hotplug;
+	struct kobject *kobj = NULL;
+	int no_warn;
+
+	if (!slot || !slot->ops)
+		return;
+	kobj = kset_find_obj(module_kset, slot->ops->mod_name);
+	if (!kobj)
+		return;
+	no_warn = sysfs_create_link(&pci_slot->kobj, kobj, "module");
+	kobject_put(kobj);
+}
+EXPORT_SYMBOL_GPL(pci_hp_create_module_link);
+
+/**
+ * pci_hp_remove_module_link - remove symbolic link to hotplug driver module
+ * @pci_slot: struct pci_slot
+ *
+ * Helper function for pci_hotplug_core.c to remove symbolic link to
+ * the hotplug driver module.
+ */
+void pci_hp_remove_module_link(struct pci_slot *pci_slot)
+{
+	sysfs_remove_link(&pci_slot->kobj, "module");
+}
+EXPORT_SYMBOL_GPL(pci_hp_remove_module_link);
+#endif
+
 static int pci_slot_init(void)
 {
 	struct kset *pci_bus_kset;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ea2a153a9126..6a1800ecd95d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1261,5 +1261,10 @@ static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev)
 }
 #endif
 
+#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
+extern void pci_hp_create_module_link(struct pci_slot *pci_slot);
+extern void pci_hp_remove_module_link(struct pci_slot *pci_slot);
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 11936fd0b56d..b3646cd7fd5a 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -69,6 +69,7 @@ enum pcie_link_speed {
 /**
  * struct hotplug_slot_ops -the callbacks that the hotplug pci core can use
  * @owner: The module owner of this structure
+ * @mod_name: The module name (KBUILD_MODNAME) of this structure
  * @enable_slot: Called when the user wants to enable a specific pci slot
  * @disable_slot: Called when the user wants to disable a specific pci slot
  * @set_attention_status: Called to set the specific slot's attention LED to
@@ -101,6 +102,7 @@ enum pcie_link_speed {
  */
 struct hotplug_slot_ops {
 	struct module *owner;
+	const char *mod_name;
 	int (*enable_slot)		(struct hotplug_slot *slot);
 	int (*disable_slot)		(struct hotplug_slot *slot);
 	int (*set_attention_status)	(struct hotplug_slot *slot, u8 value);
@@ -159,12 +161,21 @@ static inline const char *hotplug_slot_name(const struct hotplug_slot *slot)
 	return pci_slot_name(slot->pci_slot);
 }
 
-extern int pci_hp_register(struct hotplug_slot *, struct pci_bus *, int nr,
-			   const char *name);
+extern int __pci_hp_register(struct hotplug_slot *slot, struct pci_bus *pbus,
+			     int nr, const char *name,
+			     struct module *owner, const char *mod_name);
 extern int pci_hp_deregister(struct hotplug_slot *slot);
 extern int __must_check pci_hp_change_slot_info	(struct hotplug_slot *slot,
 						 struct hotplug_slot_info *info);
 
+static inline int pci_hp_register(struct hotplug_slot *slot,
+				  struct pci_bus *pbus,
+				  int devnr, const char *name)
+{
+	return __pci_hp_register(slot, pbus, devnr, name,
+				 THIS_MODULE, KBUILD_MODNAME);
+}
+
 /* PCI Setting Record (Type 0) */
 struct hpp_type0 {
 	u32 revision;
-- 
cgit v1.2.3-71-gd317


From 70298c6e6c1ba68346336b4ea54bd5c0abbf73c8 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Tue, 16 Jun 2009 13:34:38 +0800
Subject: PCI AER: support Multiple Error Received and no error source id

Based on PCI Express AER specs, a root port might receive multiple
TLP errors while it could only save a correctable error source id
and an uncorrectable error source id at the same time. In addition,
some root port hardware might be unable to provide a correct source
id, i.e., the source id, or the bus id part of the source id provided
by root port might be equal to 0.

The patchset implements the support in kernel by searching the device
tree under the root port.

Patch 1 changes parameter cb of function pci_walk_bus to return a value.
When cb returns non-zero, pci_walk_bus stops searching the device
tree.

Reviewed-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/powerpc/platforms/pseries/eeh_driver.c | 38 ++++++++++++++++++-----------
 drivers/pci/bus.c                           | 11 +++++++--
 drivers/pci/pcie/aer/aerdrv_core.c          | 30 ++++++++++++-----------
 include/linux/pci.h                         |  2 +-
 4 files changed, 50 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 9a2a6e32f00f..0e8db6771252 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -122,7 +122,7 @@ static void eeh_enable_irq(struct pci_dev *dev)
  * passed back in "userdata".
  */
 
-static void eeh_report_error(struct pci_dev *dev, void *userdata)
+static int eeh_report_error(struct pci_dev *dev, void *userdata)
 {
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver = dev->driver;
@@ -130,19 +130,21 @@ static void eeh_report_error(struct pci_dev *dev, void *userdata)
 	dev->error_state = pci_channel_io_frozen;
 
 	if (!driver)
-		return;
+		return 0;
 
 	eeh_disable_irq(dev);
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->error_detected)
-		return;
+		return 0;
 
 	rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen);
 
 	/* A driver that needs a reset trumps all others */
 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	return 0;
 }
 
 /**
@@ -153,7 +155,7 @@ static void eeh_report_error(struct pci_dev *dev, void *userdata)
  * Cumulative response passed back in "userdata".
  */
 
-static void eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
+static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
 {
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver = dev->driver;
@@ -161,26 +163,28 @@ static void eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
 	if (!driver ||
 	    !driver->err_handler ||
 	    !driver->err_handler->mmio_enabled)
-		return;
+		return 0;
 
 	rc = driver->err_handler->mmio_enabled (dev);
 
 	/* A driver that needs a reset trumps all others */
 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	return 0;
 }
 
 /**
  * eeh_report_reset - tell device that slot has been reset
  */
 
-static void eeh_report_reset(struct pci_dev *dev, void *userdata)
+static int eeh_report_reset(struct pci_dev *dev, void *userdata)
 {
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver = dev->driver;
 
 	if (!driver)
-		return;
+		return 0;
 
 	dev->error_state = pci_channel_io_normal;
 
@@ -188,35 +192,39 @@ static void eeh_report_reset(struct pci_dev *dev, void *userdata)
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->slot_reset)
-		return;
+		return 0;
 
 	rc = driver->err_handler->slot_reset(dev);
 	if ((*res == PCI_ERS_RESULT_NONE) ||
 	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
 	if (*res == PCI_ERS_RESULT_DISCONNECT &&
 	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+
+	return 0;
 }
 
 /**
  * eeh_report_resume - tell device to resume normal operations
  */
 
-static void eeh_report_resume(struct pci_dev *dev, void *userdata)
+static int eeh_report_resume(struct pci_dev *dev, void *userdata)
 {
 	struct pci_driver *driver = dev->driver;
 
 	dev->error_state = pci_channel_io_normal;
 
 	if (!driver)
-		return;
+		return 0;
 
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->resume)
-		return;
+		return 0;
 
 	driver->err_handler->resume(dev);
+
+	return 0;
 }
 
 /**
@@ -226,22 +234,24 @@ static void eeh_report_resume(struct pci_dev *dev, void *userdata)
  * dead, and that no further recovery attempts will be made on it.
  */
 
-static void eeh_report_failure(struct pci_dev *dev, void *userdata)
+static int eeh_report_failure(struct pci_dev *dev, void *userdata)
 {
 	struct pci_driver *driver = dev->driver;
 
 	dev->error_state = pci_channel_io_perm_failure;
 
 	if (!driver)
-		return;
+		return 0;
 
 	eeh_disable_irq(dev);
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->error_detected)
-		return;
+		return 0;
 
 	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
+
+	return 0;
 }
 
 /* ------------------------------------------------------- */
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 40af27f31043..cef28a79103f 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -206,13 +206,18 @@ void pci_enable_bridges(struct pci_bus *bus)
  *  Walk the given bus, including any bridged devices
  *  on buses under this bus.  Call the provided callback
  *  on each device found.
+ *
+ *  We check the return of @cb each time. If it returns anything
+ *  other than 0, we break out.
+ *
  */
-void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
+void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 		  void *userdata)
 {
 	struct pci_dev *dev;
 	struct pci_bus *bus;
 	struct list_head *next;
+	int retval;
 
 	bus = top;
 	down_read(&pci_bus_sem);
@@ -236,8 +241,10 @@ void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
 
 		/* Run device routines with the device locked */
 		down(&dev->dev.sem);
-		cb(dev, userdata);
+		retval = cb(dev, userdata);
 		up(&dev->dev.sem);
+		if (retval)
+			break;
 	}
 	up_read(&pci_bus_sem);
 }
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index dd3829e68e3f..a7a3919904bb 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -109,7 +109,7 @@ int pci_cleanup_aer_correct_error_status(struct pci_dev *dev)
 #endif  /*  0  */
 
 
-static void set_device_error_reporting(struct pci_dev *dev, void *data)
+static int set_device_error_reporting(struct pci_dev *dev, void *data)
 {
 	bool enable = *((bool *)data);
 
@@ -124,6 +124,8 @@ static void set_device_error_reporting(struct pci_dev *dev, void *data)
 
 	if (enable)
 		pcie_set_ecrc_checking(dev);
+
+	return 0;
 }
 
 /**
@@ -207,7 +209,7 @@ static struct device* find_source_device(struct pci_dev *parent, u16 id)
 	return NULL;
 }
 
-static void report_error_detected(struct pci_dev *dev, void *data)
+static int report_error_detected(struct pci_dev *dev, void *data)
 {
 	pci_ers_result_t vote;
 	struct pci_error_handlers *err_handler;
@@ -232,16 +234,16 @@ static void report_error_detected(struct pci_dev *dev, void *data)
 				   dev->driver ?
 				   "no AER-aware driver" : "no driver");
 		}
-		return;
+		return 0;
 	}
 
 	err_handler = dev->driver->err_handler;
 	vote = err_handler->error_detected(dev, result_data->state);
 	result_data->result = merge_result(result_data->result, vote);
-	return;
+	return 0;
 }
 
-static void report_mmio_enabled(struct pci_dev *dev, void *data)
+static int report_mmio_enabled(struct pci_dev *dev, void *data)
 {
 	pci_ers_result_t vote;
 	struct pci_error_handlers *err_handler;
@@ -251,15 +253,15 @@ static void report_mmio_enabled(struct pci_dev *dev, void *data)
 	if (!dev->driver ||
 		!dev->driver->err_handler ||
 		!dev->driver->err_handler->mmio_enabled)
-		return;
+		return 0;
 
 	err_handler = dev->driver->err_handler;
 	vote = err_handler->mmio_enabled(dev);
 	result_data->result = merge_result(result_data->result, vote);
-	return;
+	return 0;
 }
 
-static void report_slot_reset(struct pci_dev *dev, void *data)
+static int report_slot_reset(struct pci_dev *dev, void *data)
 {
 	pci_ers_result_t vote;
 	struct pci_error_handlers *err_handler;
@@ -269,15 +271,15 @@ static void report_slot_reset(struct pci_dev *dev, void *data)
 	if (!dev->driver ||
 		!dev->driver->err_handler ||
 		!dev->driver->err_handler->slot_reset)
-		return;
+		return 0;
 
 	err_handler = dev->driver->err_handler;
 	vote = err_handler->slot_reset(dev);
 	result_data->result = merge_result(result_data->result, vote);
-	return;
+	return 0;
 }
 
-static void report_resume(struct pci_dev *dev, void *data)
+static int report_resume(struct pci_dev *dev, void *data)
 {
 	struct pci_error_handlers *err_handler;
 
@@ -286,11 +288,11 @@ static void report_resume(struct pci_dev *dev, void *data)
 	if (!dev->driver ||
 		!dev->driver->err_handler ||
 		!dev->driver->err_handler->resume)
-		return;
+		return 0;
 
 	err_handler = dev->driver->err_handler;
 	err_handler->resume(dev);
-	return;
+	return 0;
 }
 
 /**
@@ -307,7 +309,7 @@ static void report_resume(struct pci_dev *dev, void *data)
 static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
 	enum pci_channel_state state,
 	char *error_mesg,
-	void (*cb)(struct pci_dev *, void *))
+	int (*cb)(struct pci_dev *, void *))
 {
 	struct aer_broadcast_data result_data;
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 6a1800ecd95d..61d9b790d21c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -789,7 +789,7 @@ const struct pci_device_id *pci_match_id(const struct pci_device_id *ids,
 int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
 		    int pass);
 
-void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
+void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 		  void *userdata);
 int pci_cfg_space_size_ext(struct pci_dev *dev);
 int pci_cfg_space_size(struct pci_dev *dev);
-- 
cgit v1.2.3-71-gd317


From 8c1c699fec9e9021bf6ff0285dee086bb27aec90 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Sat, 13 Jun 2009 15:52:13 +0800
Subject: PCI: cleanup Function Level Reset

This patch enhances the FLR functions:
  1) remove disable_irq() so the shared IRQ won't be disabled.
  2) replace the 1s wait with 100, 200 and 400ms wait intervals
     for the Pending Transaction.
  3) replace mdelay() with msleep().
  4) add might_sleep().
  5) lock the device to prevent PM suspend from accessing the CSRs
     during the reset.
  6) coding style fixes.

Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/iov.c   |   4 +-
 drivers/pci/pci.c   | 166 ++++++++++++++++++++++++++--------------------------
 include/linux/pci.h |   2 +-
 3 files changed, 87 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index e87fe95da814..03c7706c0a09 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -110,7 +110,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
 	}
 
 	if (reset)
-		pci_execute_reset_function(virtfn);
+		__pci_reset_function(virtfn);
 
 	pci_device_add(virtfn, virtfn->bus);
 	mutex_unlock(&iov->dev->sriov->lock);
@@ -164,7 +164,7 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
 
 	if (reset) {
 		device_release_driver(&virtfn->dev);
-		pci_execute_reset_function(virtfn);
+		__pci_reset_function(virtfn);
 	}
 
 	sprintf(buf, "virtfn%u", id);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8ea911e55722..6a052ada3fe8 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2055,111 +2055,112 @@ int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask)
 EXPORT_SYMBOL(pci_set_dma_seg_boundary);
 #endif
 
-static int __pcie_flr(struct pci_dev *dev, int probe)
+static int pcie_flr(struct pci_dev *dev, int probe)
 {
-	u16 status;
+	int i;
+	int pos;
 	u32 cap;
-	int exppos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	u16 status;
 
-	if (!exppos)
+	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	if (!pos)
 		return -ENOTTY;
-	pci_read_config_dword(dev, exppos + PCI_EXP_DEVCAP, &cap);
+
+	pci_read_config_dword(dev, pos + PCI_EXP_DEVCAP, &cap);
 	if (!(cap & PCI_EXP_DEVCAP_FLR))
 		return -ENOTTY;
 
 	if (probe)
 		return 0;
 
-	pci_block_user_cfg_access(dev);
-
 	/* Wait for Transaction Pending bit clean */
-	pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
-	if (!(status & PCI_EXP_DEVSTA_TRPND))
-		goto transaction_done;
+	for (i = 0; i < 4; i++) {
+		if (i)
+			msleep((1 << (i - 1)) * 100);
 
-	msleep(100);
-	pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
-	if (!(status & PCI_EXP_DEVSTA_TRPND))
-		goto transaction_done;
-
-	dev_info(&dev->dev, "Busy after 100ms while trying to reset; "
-			"sleeping for 1 second\n");
-	ssleep(1);
-	pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
-	if (status & PCI_EXP_DEVSTA_TRPND)
-		dev_info(&dev->dev, "Still busy after 1s; "
-				"proceeding with reset anyway\n");
-
-transaction_done:
-	pci_write_config_word(dev, exppos + PCI_EXP_DEVCTL,
+		pci_read_config_word(dev, pos + PCI_EXP_DEVSTA, &status);
+		if (!(status & PCI_EXP_DEVSTA_TRPND))
+			goto clear;
+	}
+
+	dev_err(&dev->dev, "transaction is not cleared; "
+			"proceeding with reset anyway\n");
+
+clear:
+	pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
 				PCI_EXP_DEVCTL_BCR_FLR);
-	mdelay(100);
+	msleep(100);
 
-	pci_unblock_user_cfg_access(dev);
 	return 0;
 }
 
-static int __pci_af_flr(struct pci_dev *dev, int probe)
+static int pci_af_flr(struct pci_dev *dev, int probe)
 {
-	int cappos = pci_find_capability(dev, PCI_CAP_ID_AF);
-	u8 status;
+	int i;
+	int pos;
 	u8 cap;
+	u8 status;
 
-	if (!cappos)
+	pos = pci_find_capability(dev, PCI_CAP_ID_AF);
+	if (!pos)
 		return -ENOTTY;
-	pci_read_config_byte(dev, cappos + PCI_AF_CAP, &cap);
+
+	pci_read_config_byte(dev, pos + PCI_AF_CAP, &cap);
 	if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR))
 		return -ENOTTY;
 
 	if (probe)
 		return 0;
 
-	pci_block_user_cfg_access(dev);
-
 	/* Wait for Transaction Pending bit clean */
-	pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status);
-	if (!(status & PCI_AF_STATUS_TP))
-		goto transaction_done;
+	for (i = 0; i < 4; i++) {
+		if (i)
+			msleep((1 << (i - 1)) * 100);
+
+		pci_read_config_byte(dev, pos + PCI_AF_STATUS, &status);
+		if (!(status & PCI_AF_STATUS_TP))
+			goto clear;
+	}
+
+	dev_err(&dev->dev, "transaction is not cleared; "
+			"proceeding with reset anyway\n");
 
+clear:
+	pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR);
 	msleep(100);
-	pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status);
-	if (!(status & PCI_AF_STATUS_TP))
-		goto transaction_done;
-
-	dev_info(&dev->dev, "Busy after 100ms while trying to"
-			" reset; sleeping for 1 second\n");
-	ssleep(1);
-	pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status);
-	if (status & PCI_AF_STATUS_TP)
-		dev_info(&dev->dev, "Still busy after 1s; "
-				"proceeding with reset anyway\n");
-
-transaction_done:
-	pci_write_config_byte(dev, cappos + PCI_AF_CTRL, PCI_AF_CTRL_FLR);
-	mdelay(100);
-
-	pci_unblock_user_cfg_access(dev);
+
 	return 0;
 }
 
-static int __pci_reset_function(struct pci_dev *pdev, int probe)
+static int pci_dev_reset(struct pci_dev *dev, int probe)
 {
-	int res;
+	int rc;
+
+	might_sleep();
+
+	if (!probe) {
+		pci_block_user_cfg_access(dev);
+		/* block PM suspend, driver probe, etc. */
+		down(&dev->dev.sem);
+	}
 
-	res = __pcie_flr(pdev, probe);
-	if (res != -ENOTTY)
-		return res;
+	rc = pcie_flr(dev, probe);
+	if (rc != -ENOTTY)
+		goto done;
 
-	res = __pci_af_flr(pdev, probe);
-	if (res != -ENOTTY)
-		return res;
+	rc = pci_af_flr(dev, probe);
+done:
+	if (!probe) {
+		up(&dev->dev.sem);
+		pci_unblock_user_cfg_access(dev);
+	}
 
-	return res;
+	return rc;
 }
 
 /**
- * pci_execute_reset_function() - Reset a PCI device function
- * @dev: Device function to reset
+ * __pci_reset_function - reset a PCI device function
+ * @dev: PCI device to reset
  *
  * Some devices allow an individual function to be reset without affecting
  * other functions in the same device.  The PCI device must be responsive
@@ -2171,18 +2172,18 @@ static int __pci_reset_function(struct pci_dev *pdev, int probe)
  * device including MSI, bus mastering, BARs, decoding IO and memory spaces,
  * etc.
  *
- * Returns 0 if the device function was successfully reset or -ENOTTY if the
+ * Returns 0 if the device function was successfully reset or negative if the
  * device doesn't support resetting a single function.
  */
-int pci_execute_reset_function(struct pci_dev *dev)
+int __pci_reset_function(struct pci_dev *dev)
 {
-	return __pci_reset_function(dev, 0);
+	return pci_dev_reset(dev, 0);
 }
-EXPORT_SYMBOL_GPL(pci_execute_reset_function);
+EXPORT_SYMBOL_GPL(__pci_reset_function);
 
 /**
- * pci_reset_function() - quiesce and reset a PCI device function
- * @dev: Device function to reset
+ * pci_reset_function - quiesce and reset a PCI device function
+ * @dev: PCI device to reset
  *
  * Some devices allow an individual function to be reset without affecting
  * other functions in the same device.  The PCI device must be responsive
@@ -2190,32 +2191,33 @@ EXPORT_SYMBOL_GPL(pci_execute_reset_function);
  *
  * This function does not just reset the PCI portion of a device, but
  * clears all the state associated with the device.  This function differs
- * from pci_execute_reset_function in that it saves and restores device state
+ * from __pci_reset_function in that it saves and restores device state
  * over the reset.
  *
- * Returns 0 if the device function was successfully reset or -ENOTTY if the
+ * Returns 0 if the device function was successfully reset or negative if the
  * device doesn't support resetting a single function.
  */
 int pci_reset_function(struct pci_dev *dev)
 {
-	int r = __pci_reset_function(dev, 1);
+	int rc;
 
-	if (r < 0)
-		return r;
+	rc = pci_dev_reset(dev, 1);
+	if (rc)
+		return rc;
 
-	if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0)
-		disable_irq(dev->irq);
 	pci_save_state(dev);
 
+	/*
+	 * both INTx and MSI are disabled after the Interrupt Disable bit
+	 * is set and the Bus Master bit is cleared.
+	 */
 	pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
 
-	r = pci_execute_reset_function(dev);
+	rc = pci_dev_reset(dev, 0);
 
 	pci_restore_state(dev);
-	if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0)
-		enable_irq(dev->irq);
 
-	return r;
+	return rc;
 }
 EXPORT_SYMBOL_GPL(pci_reset_function);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 61d9b790d21c..91b06be2f01e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -702,8 +702,8 @@ int pcix_get_mmrbc(struct pci_dev *dev);
 int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
 int pcie_get_readrq(struct pci_dev *dev);
 int pcie_set_readrq(struct pci_dev *dev, int rq);
+int __pci_reset_function(struct pci_dev *dev);
 int pci_reset_function(struct pci_dev *dev);
-int pci_execute_reset_function(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
-- 
cgit v1.2.3-71-gd317


From 7d9a73f6dcf4390d256bf19330c81e91523a26d5 Mon Sep 17 00:00:00 2001
From: Frans Pop <elendil@planet.nl>
Date: Wed, 17 Jun 2009 00:16:15 +0200
Subject: PCI PM: consistently use type bool for wake enable variable

Other functions use type bool, so use that for pci_enable_wake as well.

Signed-off-by: Frans Pop <elendil@planet.nl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c   | 2 +-
 include/linux/pci.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7b59fd7c9575..ccc0a0ccbef9 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1205,7 +1205,7 @@ void pci_pme_active(struct pci_dev *dev, bool enable)
  * Error code depending on the platform is returned if both the platform and
  * the native mechanism fail to enable the generation of wake-up events
  */
-int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable)
+int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable)
 {
 	int error = 0;
 	bool pme_done = false;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 91b06be2f01e..62e8452c2ec6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -723,7 +723,7 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state);
 pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
 bool pci_pme_capable(struct pci_dev *dev, pci_power_t state);
 void pci_pme_active(struct pci_dev *dev, bool enable);
-int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
+int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable);
 int pci_wake_from_d3(struct pci_dev *dev, bool enable);
 pci_power_t pci_target_state(struct pci_dev *dev);
 int pci_prepare_to_sleep(struct pci_dev *dev);
-- 
cgit v1.2.3-71-gd317


From 44549dff82753b6a5ffabcefeead34be63e95d96 Mon Sep 17 00:00:00 2001
From: Mike Sager <sager@netapp.com>
Date: Wed, 1 Apr 2009 09:21:47 -0400
Subject: nfs41: define NFS4_MAX_MINOR_VERSION based on CONFIG_NFS_V4_1

If 4.1 isn't supported, NFS4_MAX_MINOR_VERSION will be 0.

Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs4.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e3f0cbcbd0db..7c36fcf2dfb7 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -462,6 +462,13 @@ enum lock_type4 {
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
 #define NFS4_MINOR_VERSION 0
+
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MINOR_VERSION 1
+#else
+#define NFS4_MAX_MINOR_VERSION 0
+#endif /* CONFIG_NFS_V4_1 */
+
 #define NFS4_DEBUG 1
 
 /* Index of predefined Linux client operations */
-- 
cgit v1.2.3-71-gd317


From 94a417f3d7a02478a2d7842e693a61339fb54ea4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:49 -0400
Subject: nfs41: nfs_client.cl_minorversion

This field is set to the nfsv4 minor version for this mount.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>

Note: This patch sets the referral to the same minorversion as the
current mount. Revisit in future patch.

Signed-off-by: Andy Adamson <andros@netapp.com>
[removed cl_minorversion assignment in nfs_set_client]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[always define nfs_client.cl_minorversion]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 9 ++++++---
 include/linux/nfs_fs_sb.h | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 75c9cd2aa119..0efcb55c2caa 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1101,7 +1101,8 @@ static int nfs4_set_client(struct nfs_server *server,
 		const size_t addrlen,
 		const char *ip_addr,
 		rpc_authflavor_t authflavour,
-		int proto, const struct rpc_timeout *timeparms)
+		int proto, const struct rpc_timeout *timeparms,
+		u32 minorversion)
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
@@ -1164,7 +1165,8 @@ static int nfs4_init_server(struct nfs_server *server,
 			data->client_address,
 			data->auth_flavors[0],
 			data->nfs_server.protocol,
-			&timeparms);
+			&timeparms,
+			data->minorversion);
 	if (error < 0)
 		goto error;
 
@@ -1282,7 +1284,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				parent_client->cl_ipaddr,
 				data->authflavor,
 				parent_server->client->cl_xprt->prot,
-				parent_server->client->cl_timeout);
+				parent_server->client->cl_timeout,
+				parent_client->cl_minorversion);
 	if (error < 0)
 		goto error;
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 6ad75948cbf7..e9a51fe46aa3 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -32,6 +32,7 @@ struct nfs_client {
 	const struct nfs_rpc_ops *rpc_ops;	/* NFS protocol vector */
 	int			cl_proto;	/* Network transport protocol */
 
+	u32			cl_minorversion;/* NFSv4 minorversion */
 	struct rpc_cred		*cl_machine_cred;
 
 #ifdef CONFIG_NFS_V4
@@ -63,7 +64,7 @@ struct nfs_client {
 	 */
 	char			cl_ipaddr[48];
 	unsigned char		cl_id_uniquifier;
-#endif
+#endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
-- 
cgit v1.2.3-71-gd317


From 9ff71c3a9827b99699510076dffa0bbe7c36bfd4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:52 -0400
Subject: nfs41: client xdr definitions

Define stubs for sequence args and res data structures and embed
them in all other nfs4 and nfs41 xdr types.  They are needed for
sending any op in a nfs41 compound rpc.

Signed-off-by: Andy Adamson <andros@netapp.com>
[moved new args/res definitions away, to where they're first used]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_xdr.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b89c34e40bc2..f2c5700c7b6e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -145,6 +145,15 @@ struct nfs4_change_info {
 };
 
 struct nfs_seqid;
+
+struct nfs4_sequence_args {
+	/* stub */
+};
+
+struct nfs4_sequence_res {
+	/* stub */
+};
+
 /*
  * Arguments to the open call.
  */
@@ -165,6 +174,7 @@ struct nfs_openargs {
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const u32 *		bitmask;
 	__u32			claim;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_openres {
@@ -181,6 +191,7 @@ struct nfs_openres {
 	__u32			do_recall;
 	__u64			maxsize;
 	__u32			attrset[NFS4_BITMAP_SIZE];
+	struct nfs4_sequence_res	seq_res;
 };
 
 /*
@@ -206,6 +217,7 @@ struct nfs_closeargs {
 	struct nfs_seqid *	seqid;
 	fmode_t			fmode;
 	const u32 *		bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_closeres {
@@ -213,6 +225,7 @@ struct nfs_closeres {
 	struct nfs_fattr *	fattr;
 	struct nfs_seqid *	seqid;
 	const struct nfs_server *server;
+	struct nfs4_sequence_res	seq_res;
 };
 /*
  *  * Arguments to the lock,lockt, and locku call.
@@ -233,12 +246,14 @@ struct nfs_lock_args {
 	unsigned char		block : 1;
 	unsigned char		reclaim : 1;
 	unsigned char		new_lock_owner : 1;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_lock_res {
 	nfs4_stateid		stateid;
 	struct nfs_seqid *	lock_seqid;
 	struct nfs_seqid *	open_seqid;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs_locku_args {
@@ -246,32 +261,38 @@ struct nfs_locku_args {
 	struct file_lock *	fl;
 	struct nfs_seqid *	seqid;
 	nfs4_stateid *		stateid;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_locku_res {
 	nfs4_stateid		stateid;
 	struct nfs_seqid *	seqid;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs_lockt_args {
 	struct nfs_fh *		fh;
 	struct file_lock *	fl;
 	struct nfs_lowner	lock_owner;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_lockt_res {
 	struct file_lock *	denied; /* LOCK, LOCKT failed */
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs4_delegreturnargs {
 	const struct nfs_fh *fhandle;
 	const nfs4_stateid *stateid;
 	const u32 * bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_delegreturnres {
 	struct nfs_fattr * fattr;
 	const struct nfs_server *server;
+	struct nfs4_sequence_res	seq_res;
 };
 
 /*
@@ -284,12 +305,14 @@ struct nfs_readargs {
 	__u32			count;
 	unsigned int		pgbase;
 	struct page **		pages;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_readres {
 	struct nfs_fattr *	fattr;
 	__u32			count;
 	int                     eof;
+	struct nfs4_sequence_res	seq_res;
 };
 
 /*
@@ -304,6 +327,7 @@ struct nfs_writeargs {
 	unsigned int		pgbase;
 	struct page **		pages;
 	const u32 *		bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_writeverf {
@@ -316,6 +340,7 @@ struct nfs_writeres {
 	struct nfs_writeverf *	verf;
 	__u32			count;
 	const struct nfs_server *server;
+	struct nfs4_sequence_res	seq_res;
 };
 
 /*
@@ -325,12 +350,14 @@ struct nfs_removeargs {
 	const struct nfs_fh	*fh;
 	struct qstr		name;
 	const u32 *		bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_removeres {
 	const struct nfs_server *server;
 	struct nfs4_change_info	cinfo;
 	struct nfs_fattr	dir_attr;
+	struct nfs4_sequence_res 	seq_res;
 };
 
 /*
@@ -383,6 +410,7 @@ struct nfs_setattrargs {
 	struct iattr *                  iap;
 	const struct nfs_server *	server; /* Needed for name mapping */
 	const u32 *			bitmask;
+	struct nfs4_sequence_args 	seq_args;
 };
 
 struct nfs_setaclargs {
@@ -390,6 +418,7 @@ struct nfs_setaclargs {
 	size_t				acl_len;
 	unsigned int			acl_pgbase;
 	struct page **			acl_pages;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs_getaclargs {
@@ -397,11 +426,13 @@ struct nfs_getaclargs {
 	size_t				acl_len;
 	unsigned int			acl_pgbase;
 	struct page **			acl_pages;
+	struct nfs4_sequence_args 	seq_args;
 };
 
 struct nfs_setattrres {
 	struct nfs_fattr *              fattr;
 	const struct nfs_server *	server;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs_linkargs {
@@ -583,6 +614,7 @@ struct nfs4_accessargs {
 	const struct nfs_fh *		fh;
 	const u32 *			bitmask;
 	u32				access;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_accessres {
@@ -590,6 +622,7 @@ struct nfs4_accessres {
 	struct nfs_fattr *		fattr;
 	u32				supported;
 	u32				access;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs4_create_arg {
@@ -609,6 +642,7 @@ struct nfs4_create_arg {
 	const struct iattr *		attrs;
 	const struct nfs_fh *		dir_fh;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args 	seq_args;
 };
 
 struct nfs4_create_res {
@@ -617,21 +651,25 @@ struct nfs4_create_res {
 	struct nfs_fattr *		fattr;
 	struct nfs4_change_info		dir_cinfo;
 	struct nfs_fattr *		dir_fattr;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs4_fsinfo_arg {
 	const struct nfs_fh *		fh;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_getattr_arg {
 	const struct nfs_fh *		fh;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_getattr_res {
 	const struct nfs_server *	server;
 	struct nfs_fattr *		fattr;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs4_link_arg {
@@ -639,6 +677,7 @@ struct nfs4_link_arg {
 	const struct nfs_fh *		dir_fh;
 	const struct qstr *		name;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args 	seq_args;
 };
 
 struct nfs4_link_res {
@@ -646,6 +685,7 @@ struct nfs4_link_res {
 	struct nfs_fattr *		fattr;
 	struct nfs4_change_info		cinfo;
 	struct nfs_fattr *		dir_attr;
+	struct nfs4_sequence_res	seq_res;
 };
 
 
@@ -653,21 +693,25 @@ struct nfs4_lookup_arg {
 	const struct nfs_fh *		dir_fh;
 	const struct qstr *		name;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_lookup_res {
 	const struct nfs_server *	server;
 	struct nfs_fattr *		fattr;
 	struct nfs_fh *			fh;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs4_lookup_root_arg {
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_pathconf_arg {
 	const struct nfs_fh *		fh;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_readdir_arg {
@@ -678,11 +722,13 @@ struct nfs4_readdir_arg {
 	struct page **			pages;	/* zero-copy data */
 	unsigned int			pgbase;	/* zero-copy data */
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_readdir_res {
 	nfs4_verifier			verifier;
 	unsigned int			pgbase;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs4_readlink {
@@ -690,6 +736,7 @@ struct nfs4_readlink {
 	unsigned int			pgbase;
 	unsigned int			pglen;   /* zero-copy data */
 	struct page **			pages;   /* zero-copy data */
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_rename_arg {
@@ -698,6 +745,7 @@ struct nfs4_rename_arg {
 	const struct qstr *		old_name;
 	const struct qstr *		new_name;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_rename_res {
@@ -706,6 +754,7 @@ struct nfs4_rename_res {
 	struct nfs_fattr *		old_fattr;
 	struct nfs4_change_info		new_cinfo;
 	struct nfs_fattr *		new_fattr;
+	struct nfs4_sequence_res	seq_res;
 };
 
 #define NFS4_SETCLIENTID_NAMELEN	(127)
@@ -724,6 +773,7 @@ struct nfs4_setclientid {
 struct nfs4_statfs_arg {
 	const struct nfs_fh *		fh;
 	const u32 *			bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 struct nfs4_server_caps_res {
@@ -731,6 +781,7 @@ struct nfs4_server_caps_res {
 	u32				acl_bitmask;
 	u32				has_links;
 	u32				has_symlinks;
+	struct nfs4_sequence_res	seq_res;
 };
 
 struct nfs4_string {
@@ -765,6 +816,7 @@ struct nfs4_fs_locations_arg {
 	const struct qstr *name;
 	struct page *page;
 	const u32 *bitmask;
+	struct nfs4_sequence_args	seq_args;
 };
 
 #endif /* CONFIG_NFS_V4 */
-- 
cgit v1.2.3-71-gd317


From 557134a39c8d2ab79d8b8d53438e03e29feb5ec4 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:21:53 -0400
Subject: nfs41: sessions client infrastructure

NFSv4.1 Sessions basic data types, initialization, and destruction.

The session is always associated with a struct nfs_client that holds
the exchange_id results.

Signed-off-by: Rahul Iyer <iyer@netapp.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[remove extraneous rpc_clnt pointer, use the struct nfs_client cl_rpcclient.
remove the rpc_clnt parameter from nfs4 nfs4_init_session]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Use the presence of a session to determine behaviour instead of the
minorversion number.]
Signed-off-by: Andy Adamson <andros@netapp.com>
[constified nfs4_has_session's struct nfs_client parameter]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Rename nfs4_put_session() to nfs4_destroy_session() and call it from nfs4_free_client() not nfs4_free_server().
Also get rid of nfs4_get_session() and the ref_count in nfs4_session struct as keeping track of nfs_client should be sufficient]
Signed-off-by: Alexandros Batsakis <Alexandros.Batsakis@netapp.com>
[nfs41: pass rsize and wsize into nfs4_init_session]
Signed-off-by: Andy Adamson <andros@netapp.com>
[separated out removal of rpc_clnt parameter from nfs4_init_session to a
 patch of its own]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Pass the nfs_client pointer into nfs4_alloc_session]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: don't assign to session->clp->cl_session in nfs4_destroy_session]
[nfs41: fixup nfs4_clear_client_minor_version]
[introduce nfs4_clear_client_minor_version() in this patch]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Refactor nfs4_init_session]
    Moved session allocation into nfs4_init_client_minor_version, called from
    nfs4_init_client.
    Leave rsize and wsize initialization in nfs4_init_session, called from
    nfs4_init_server.
    Reverted moving of nfs_fsid definition to nfs_fs_sb.h
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: Move NFS4_MAX_SLOT_TABLE define from under CONFIG_NFS_V4_1]
[Fix compile error when CONFIG_NFS_V4_1 is not set.]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[moved nfs4_init_slot_table definition to "create_session operation"]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: alloc session with GFP_KERNEL]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/internal.h         | 12 ++++++++++
 fs/nfs/nfs4_fs.h          |  4 ++++
 fs/nfs/nfs4proc.c         | 34 +++++++++++++++++++++++++++
 include/linux/nfs_fs_sb.h | 49 ++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_xdr.h   | 15 ++++++++++++
 6 files changed, 174 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a736160046c3..f1506f148521 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -183,6 +183,20 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 #endif
 }
 
+/*
+ * Clears/puts all minor version specific parts from an nfs_client struct
+ * reverting it to minorversion 0.
+ */
+static void nfs4_clear_client_minor_version(struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+	if (nfs4_has_session(clp)) {
+		nfs4_destroy_session(clp->cl_session);
+		clp->cl_session = NULL;
+	}
+#endif /* CONFIG_NFS_V4_1 */
+}
+
 /*
  * Destroy a shared client record
  */
@@ -190,6 +204,7 @@ static void nfs_free_client(struct nfs_client *clp)
 {
 	dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
 
+	nfs4_clear_client_minor_version(clp);
 	nfs4_shutdown_client(clp);
 
 	nfs_fscache_release_client_cookie(clp);
@@ -1053,6 +1068,30 @@ error:
 }
 
 #ifdef CONFIG_NFS_V4
+/*
+ * Initialize the minor version specific parts of an NFS4 client record
+ */
+static int nfs4_init_client_minor_version(struct nfs_client *clp)
+{
+#if defined(CONFIG_NFS_V4_1)
+	if (clp->cl_minorversion) {
+		struct nfs4_session *session = NULL;
+		/*
+		 * Create the session and mark it expired.
+		 * When a SEQUENCE operation encounters the expired session
+		 * it will do session recovery to initialize it.
+		 */
+		session = nfs4_alloc_session(clp);
+		if (!session)
+			return -ENOMEM;
+
+		clp->cl_session = session;
+	}
+#endif /* CONFIG_NFS_V4_1 */
+
+	return 0;
+}
+
 /*
  * Initialise an NFS4 client record
  */
@@ -1087,6 +1126,10 @@ static int nfs4_init_client(struct nfs_client *clp,
 	}
 	__set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
 
+	error = nfs4_init_client_minor_version(clp);
+	if (error < 0)
+		goto error;
+
 	nfs_mark_client_ready(clp, NFS_CS_READY);
 	return 0;
 
@@ -1143,6 +1186,21 @@ error:
 	return error;
 }
 
+/*
+ * Initialize a session.
+ * Note: save the mount rsize and wsize for create_server negotiation.
+ */
+static void nfs4_init_session(struct nfs_client *clp,
+			      unsigned int wsize, unsigned int rsize)
+{
+#if defined(CONFIG_NFS_V4_1)
+	if (nfs4_has_session(clp)) {
+		clp->cl_session->fc_attrs.max_rqst_sz = wsize;
+		clp->cl_session->fc_attrs.max_resp_sz = rsize;
+	}
+#endif /* CONFIG_NFS_V4_1 */
+}
+
 /*
  * Create a version 4 volume record
  */
@@ -1221,6 +1279,8 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	BUG_ON(!server->nfs_client->rpc_ops);
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
+	nfs4_init_session(server->nfs_client, server->wsize, server->rsize);
+
 	/* Probe the root fh to retrieve its FSID */
 	error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
 	if (error < 0)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ffa6bd54d439..7cef45db9257 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -206,6 +206,18 @@ extern int nfs4_path_walk(struct nfs_server *server,
 			  const char *path);
 #endif
 
+/*
+ * Determine if sessions are in use.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+	if (clp->cl_session)
+		return 1;
+#endif /* CONFIG_NFS_V4_1 */
+	return 0;
+}
+
 /*
  * Determine the device name as a string
  */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 84345deab26f..acac6f8c3d39 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -202,6 +202,10 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
+#if defined(CONFIG_NFS_V4_1)
+extern void nfs4_destroy_session(struct nfs4_session *session);
+extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+#endif /* CONFIG_NFS_V4_1 */
 
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4674f8092da8..cdd8e74c47d0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3723,6 +3723,40 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 	return status;
 }
 
+#ifdef CONFIG_NFS_V4_1
+/* Destroy the slot table */
+static void nfs4_destroy_slot_table(struct nfs4_session *session)
+{
+	if (session->fc_slot_table.slots == NULL)
+		return;
+	kfree(session->fc_slot_table.slots);
+	session->fc_slot_table.slots = NULL;
+	return;
+}
+
+struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
+{
+	struct nfs4_session *session;
+	struct nfs4_slot_table *tbl;
+
+	session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
+	if (!session)
+		return NULL;
+	tbl = &session->fc_slot_table;
+	spin_lock_init(&tbl->slot_tbl_lock);
+	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "Slot table");
+	session->clp = clp;
+	return session;
+}
+
+void nfs4_destroy_session(struct nfs4_session *session)
+{
+	nfs4_destroy_slot_table(session);
+	kfree(session);
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
 	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index e9a51fe46aa3..b47c0fc55d42 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -4,9 +4,12 @@
 #include <linux/list.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/nfs_xdr.h>
+#include <linux/sunrpc/xprt.h>
 
 #include <asm/atomic.h>
 
+struct nfs4_session;
 struct nfs_iostats;
 struct nlm_host;
 
@@ -66,6 +69,10 @@ struct nfs_client {
 	unsigned char		cl_id_uniquifier;
 #endif /* CONFIG_NFS_V4 */
 
+#ifdef CONFIG_NFS_V4_1
+	struct nfs4_session	*cl_session; 	/* shared session */
+#endif /* CONFIG_NFS_V4_1 */
+
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
 #endif
@@ -146,4 +153,46 @@ struct nfs_server {
 #define NFS_CAP_ACLS		(1U << 3)
 #define NFS_CAP_ATOMIC_OPEN	(1U << 4)
 
+
+/* maximum number of slots to use */
+#define NFS4_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE
+
+#if defined(CONFIG_NFS_V4_1)
+
+/* Sessions */
+#define SLOT_TABLE_SZ (NFS4_MAX_SLOT_TABLE/(8*sizeof(long)))
+struct nfs4_slot_table {
+	struct nfs4_slot *slots;		/* seqid per slot */
+	unsigned long   used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
+	spinlock_t	slot_tbl_lock;
+	struct rpc_wait_queue	slot_tbl_waitq;	/* allocators may wait here */
+	int		max_slots;		/* # slots in table */
+	int		highest_used_slotid;	/* sent to server on each SEQ.
+						 * op for dynamic resizing */
+};
+
+static inline int slot_idx(struct nfs4_slot_table *tbl, struct nfs4_slot *sp)
+{
+	return sp - tbl->slots;
+}
+
+/*
+ * Session related parameters
+ */
+struct nfs4_session {
+	struct nfs4_sessionid		sess_id;
+	u32				flags;
+	unsigned long			session_state;
+	u32				hash_alg;
+	u32				ssv_len;
+
+	/* The fore and back channel */
+	struct nfs4_channel_attrs	fc_attrs;
+	struct nfs4_slot_table		fc_slot_table;
+	struct nfs4_channel_attrs	bc_attrs;
+					/* back channel has one slot */
+	struct nfs_client		*clp;
+};
+
+#endif /* CONFIG_NFS_V4_1 */
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f2c5700c7b6e..8f8c026c558e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -146,6 +146,21 @@ struct nfs4_change_info {
 
 struct nfs_seqid;
 
+/* nfs41 sessions channel attributes */
+struct nfs4_channel_attrs {
+	u32			headerpadsz;
+	u32			max_rqst_sz;
+	u32			max_resp_sz;
+	u32			max_resp_sz_cached;
+	u32			max_ops;
+	u32			max_reqs;
+};
+
+/* nfs41 sessions slot seqid */
+struct nfs4_slot {
+	u32		 	seq_nr;
+};
+
 struct nfs4_sequence_args {
 	/* stub */
 };
-- 
cgit v1.2.3-71-gd317


From 43652ad55342d9146d8035932101a5814b22315a Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:54 -0400
Subject: nfs41: use nfs4_server_caps_arg

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
[define nfs4_server_caps_arg]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 5 ++++-
 fs/nfs/nfs4xdr.c        | 5 +++--
 include/linux/nfs_xdr.h | 5 +++++
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cdd8e74c47d0..5ef1022b7e6f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1584,10 +1584,13 @@ void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
+	struct nfs4_server_caps_arg args = {
+		.fhandle = fhandle,
+	};
 	struct nfs4_server_caps_res res = {};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS],
-		.rpc_argp = fhandle,
+		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
 	int status;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1690f0e44b91..91305861037a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1900,7 +1900,8 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
+static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
+				    struct nfs4_server_caps_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1909,7 +1910,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, &hdr);
-	encode_putfh(&xdr, fhandle, &hdr);
+	encode_putfh(&xdr, args->fhandle, &hdr);
 	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
 			   FATTR4_WORD0_LINK_SUPPORT|
 			   FATTR4_WORD0_SYMLINK_SUPPORT|
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8f8c026c558e..a7b7f2a059cc 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -791,6 +791,11 @@ struct nfs4_statfs_arg {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs4_server_caps_arg {
+	struct nfs_fh		       *fhandle;
+	struct nfs4_sequence_args	seq_args;
+};
+
 struct nfs4_server_caps_res {
 	u32				attr_bitmask[2];
 	u32				acl_bitmask;
-- 
cgit v1.2.3-71-gd317


From f50c7000817e7cb4e676ac5d911a82c0f3fd226f Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:55 -0400
Subject: nfs41: use nfs4_readlink_res

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
[define nfs4_readlink_res]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 3 ++-
 fs/nfs/nfs4xdr.c        | 3 ++-
 include/linux/nfs_xdr.h | 4 ++++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5ef1022b7e6f..b0ec8ff96eb7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1960,10 +1960,11 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
 		.pglen    = pglen,
 		.pages    = &page,
 	};
+	struct nfs4_readlink_res res;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK],
 		.rpc_argp = &args,
-		.rpc_resp = NULL,
+		.rpc_resp = &res,
 	};
 
 	return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 91305861037a..1e41420916ad 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4235,7 +4235,8 @@ out:
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+				 struct nfs4_readlink_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index a7b7f2a059cc..f71260aeb803 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -754,6 +754,10 @@ struct nfs4_readlink {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs4_readlink_res {
+	struct nfs4_sequence_res	seq_res;
+};
+
 struct nfs4_rename_arg {
 	const struct nfs_fh *		old_dir;
 	const struct nfs_fh *		new_dir;
-- 
cgit v1.2.3-71-gd317


From 24ad148a0ff74b1e703a8bc5b3e0793dc7d4e3a9 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:56 -0400
Subject: nfs41: use nfs4_statfs_res

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
[define nfs4_statfs_res]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 5 ++++-
 fs/nfs/nfs4xdr.c        | 5 +++--
 include/linux/nfs_xdr.h | 5 +++++
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b0ec8ff96eb7..b715f6057611 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2426,10 +2426,13 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 		.fh = fhandle,
 		.bitmask = server->attr_bitmask,
 	};
+	struct nfs4_statfs_res res = {
+		.fsstat = fsstat,
+	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS],
 		.rpc_argp = &args,
-		.rpc_resp = fsstat,
+		.rpc_resp = &res,
 	};
 
 	nfs_fattr_init(fsstat->fattr);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1e41420916ad..b7871ad82aac 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4387,7 +4387,8 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pat
 /*
  * STATFS request
  */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+			       struct nfs4_statfs_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4398,7 +4399,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fssta
 	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_statfs(&xdr, fsstat);
+		status = decode_statfs(&xdr, res->fsstat);
 	return status;
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f71260aeb803..4dac59ef6f4f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -795,6 +795,11 @@ struct nfs4_statfs_arg {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs4_statfs_res {
+	struct nfs_fsstat	       *fsstat;
+	struct nfs4_sequence_res	seq_res;
+};
+
 struct nfs4_server_caps_arg {
 	struct nfs_fh		       *fhandle;
 	struct nfs4_sequence_args	seq_args;
-- 
cgit v1.2.3-71-gd317


From 3dda5e434721f942870ee30bc6103761618d410f Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:57 -0400
Subject: nfs41: use nfs4_fsinfo_res

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
[define nfs4_fsinfo_res]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 5 ++++-
 fs/nfs/nfs4xdr.c        | 5 +++--
 include/linux/nfs_xdr.h | 5 +++++
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b715f6057611..b8915ef533da 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2458,10 +2458,13 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 		.fh = fhandle,
 		.bitmask = server->attr_bitmask,
 	};
+	struct nfs4_fsinfo_res res = {
+		.fsinfo = fsinfo,
+	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO],
 		.rpc_argp = &args,
-		.rpc_resp = fsinfo,
+		.rpc_resp = &res,
 	};
 
 	return rpc_call_sync(server->client, &msg, 0);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b7871ad82aac..d9ab8209c286 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4351,7 +4351,8 @@ out:
 /*
  * FSINFO request
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+			       struct nfs4_fsinfo_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4362,7 +4363,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
 	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, fsinfo);
+		status = decode_fsinfo(&xdr, res->fsinfo);
 	return status;
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 4dac59ef6f4f..7d64913cbb1b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -675,6 +675,11 @@ struct nfs4_fsinfo_arg {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs4_fsinfo_res {
+	struct nfs_fsinfo	       *fsinfo;
+	struct nfs4_sequence_res	seq_res;
+};
+
 struct nfs4_getattr_arg {
 	const struct nfs_fh *		fh;
 	const u32 *			bitmask;
-- 
cgit v1.2.3-71-gd317


From d45b2989a7956ae9e71d584ceac942278c0371c7 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:58 -0400
Subject: nfs41: use nfs4_pathconf_res

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
[define nfs4_pathconf_res]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 5 ++++-
 fs/nfs/nfs4xdr.c        | 5 +++--
 include/linux/nfs_xdr.h | 5 +++++
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b8915ef533da..aea2e83d3939 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2496,10 +2496,13 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
 		.fh = fhandle,
 		.bitmask = server->attr_bitmask,
 	};
+	struct nfs4_pathconf_res res = {
+		.pathconf = pathconf,
+	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF],
 		.rpc_argp = &args,
-		.rpc_resp = pathconf,
+		.rpc_resp = &res,
 	};
 
 	/* None of the pathconf attributes are mandatory to implement */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d9ab8209c286..a77ee3dd0b3d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4370,7 +4370,8 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
 /*
  * PATHCONF request
  */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+				 struct nfs4_pathconf_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4381,7 +4382,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pat
 	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_pathconf(&xdr, pathconf);
+		status = decode_pathconf(&xdr, res->pathconf);
 	return status;
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7d64913cbb1b..56523319e14c 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -734,6 +734,11 @@ struct nfs4_pathconf_arg {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs4_pathconf_res {
+	struct nfs_pathconf	       *pathconf;
+	struct nfs4_sequence_res	seq_res;
+};
+
 struct nfs4_readdir_arg {
 	const struct nfs_fh *		fh;
 	u64				cookie;
-- 
cgit v1.2.3-71-gd317


From 663c79b3cd8f5fe21fe7d7565fec0072e3234ddc Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:21:59 -0400
Subject: nfs41: use nfs4_getaclres

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: embed resp_len in nfs_getaclres]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 20 +++++++++++---------
 fs/nfs/nfs4xdr.c        |  5 +++--
 include/linux/nfs_xdr.h |  5 +++++
 3 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index aea2e83d3939..20c9acf689fd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2755,12 +2755,14 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 		.acl_pages = pages,
 		.acl_len = buflen,
 	};
-	size_t resp_len = buflen;
+	struct nfs_getaclres res = {
+		.acl_len = buflen,
+	};
 	void *resp_buf;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
 		.rpc_argp = &args,
-		.rpc_resp = &resp_len,
+		.rpc_resp = &res,
 	};
 	struct page *localpage = NULL;
 	int ret;
@@ -2774,7 +2776,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 			return -ENOMEM;
 		args.acl_pages[0] = localpage;
 		args.acl_pgbase = 0;
-		resp_len = args.acl_len = PAGE_SIZE;
+		args.acl_len = PAGE_SIZE;
 	} else {
 		resp_buf = buf;
 		buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
@@ -2782,18 +2784,18 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	if (ret)
 		goto out_free;
-	if (resp_len > args.acl_len)
-		nfs4_write_cached_acl(inode, NULL, resp_len);
+	if (res.acl_len > args.acl_len)
+		nfs4_write_cached_acl(inode, NULL, res.acl_len);
 	else
-		nfs4_write_cached_acl(inode, resp_buf, resp_len);
+		nfs4_write_cached_acl(inode, resp_buf, res.acl_len);
 	if (buf) {
 		ret = -ERANGE;
-		if (resp_len > buflen)
+		if (res.acl_len > buflen)
 			goto out_free;
 		if (localpage)
-			memcpy(buf, resp_buf, resp_len);
+			memcpy(buf, resp_buf, res.acl_len);
 	}
-	ret = resp_len;
+	ret = res.acl_len;
 out_free:
 	if (localpage)
 		__free_page(localpage);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a77ee3dd0b3d..3e777893e2b0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4018,7 +4018,8 @@ out:
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+		    struct nfs_getaclres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4031,7 +4032,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(&xdr, rqstp, acl_len);
+	status = decode_getacl(&xdr, rqstp, &res->acl_len);
 
 out:
 	return status;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 56523319e14c..6e9ee2848606 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -444,6 +444,11 @@ struct nfs_getaclargs {
 	struct nfs4_sequence_args 	seq_args;
 };
 
+struct nfs_getaclres {
+	size_t				acl_len;
+	struct nfs4_sequence_res	seq_res;
+};
+
 struct nfs_setattrres {
 	struct nfs_fattr *              fattr;
 	const struct nfs_server *	server;
-- 
cgit v1.2.3-71-gd317


From 73c403a9a93743b068103c13c05ed136dc687d05 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:22:01 -0400
Subject: nfs41: use nfs4_setaclres

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
[define nfs_setaclres]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 3 ++-
 fs/nfs/nfs4xdr.c        | 3 ++-
 include/linux/nfs_xdr.h | 4 ++++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 20c9acf689fd..62bbe25d9423 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2842,10 +2842,11 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 		.acl_pages	= pages,
 		.acl_len	= buflen,
 	};
+	struct nfs_setaclres res;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_SETACL],
 		.rpc_argp	= &arg,
-		.rpc_resp	= NULL,
+		.rpc_resp	= &res,
 	};
 	int ret;
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 27dd25d9ad42..aa350d5bf207 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3996,7 +3996,8 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+		    struct nfs_setaclres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6e9ee2848606..0f2dc8f4cc36 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -436,6 +436,10 @@ struct nfs_setaclargs {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs_setaclres {
+	struct nfs4_sequence_res	seq_res;
+};
+
 struct nfs_getaclargs {
 	struct nfs_fh *			fh;
 	size_t				acl_len;
-- 
cgit v1.2.3-71-gd317


From 22958463d5dca8548e19430779f379e66fd6e4a4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:22:02 -0400
Subject: nfs41: use nfs4_fs_locations_res

In preparation for nfs41 sequence processing.

Signed-off-by: Andy Adamson <andros@netapp.com>
[define nfs4_fs_locations_res]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 5 ++++-
 fs/nfs/nfs4xdr.c        | 6 ++++--
 include/linux/nfs_xdr.h | 5 +++++
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 62bbe25d9423..e08edc99faac 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3722,10 +3722,13 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		.page = page,
 		.bitmask = bitmask,
 	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations = fs_locations,
+	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
 		.rpc_argp = &args,
-		.rpc_resp = fs_locations,
+		.rpc_resp = &res,
 	};
 	int status;
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index aa350d5bf207..e448e33b4d05 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4504,7 +4504,8 @@ out:
 /*
  * FS_LOCATIONS request
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+				     struct nfs4_fs_locations_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4519,7 +4520,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 	if ((status = decode_lookup(&xdr)) != 0)
 		goto out;
 	xdr_enter_page(&xdr, PAGE_SIZE);
-	status = decode_getfattr(&xdr, &res->fattr, res->server);
+	status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+				 res->fs_locations->server);
 out:
 	return status;
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0f2dc8f4cc36..d837f10c49ef 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -867,6 +867,11 @@ struct nfs4_fs_locations_arg {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs4_fs_locations_res {
+	struct nfs4_fs_locations       *fs_locations;
+	struct nfs4_sequence_res	seq_res;
+};
+
 #endif /* CONFIG_NFS_V4 */
 
 struct nfs_page;
-- 
cgit v1.2.3-71-gd317


From cccef3b96a4759ae0790452280c00ea505412157 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:22:03 -0400
Subject: nfs41: introduce nfs4_call_sync

Use nfs4_call_sync rather than rpc_call_sync to provide
for a nfs41 sessions-enabled interface for sessions manipulation.

The nfs41 rpc logic uses the rpc_call_prepare method to
recover and create the session, as well as selecting a free slot id
and the rpc_call_done to free the slot and update slot table
related metadata.

In the coming patches we'll add rpc prepare and done routines
for setting up the sequence op and processing the sequence result.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: nfs4_call_sync]
As per 11-14-08 review.
Squash into "nfs41: introduce nfs4_call_sync" and "nfs41: nfs4_setup_sequence"
Define two functions, one for v4 and one for v4.1, and add a pointer
in struct nfs_client to the correct one.
Signed-off-by: Andy Adamson <andros@netapp.com>
[added BUG() in _nfs4_call_sync_session if !CONFIG_NFS_V4_1]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: check for session not minorversion]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[group minorversion specific stuff together]
Signed-off-by: Alexandros Batsakis <Alexandros.Batsakis@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
[nfs41: fixup nfs4_clear_client_minor_version]
[introduce nfs4_init_client_minor_version() in this patch]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[cleaned-up patch: got rid of nfs_call_sync_t, dprintks, cosmetics, extra server defs]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |  5 ++++
 fs/nfs/internal.h         | 12 +++++++++
 fs/nfs/nfs4proc.c         | 67 +++++++++++++++++++++++++++++++++--------------
 include/linux/nfs_fs_sb.h |  8 ++++++
 4 files changed, 73 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f1506f148521..a9828baaa445 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -194,6 +194,8 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 		nfs4_destroy_session(clp->cl_session);
 		clp->cl_session = NULL;
 	}
+
+	clp->cl_call_sync = _nfs4_call_sync;
 #endif /* CONFIG_NFS_V4_1 */
 }
 
@@ -1073,6 +1075,8 @@ error:
  */
 static int nfs4_init_client_minor_version(struct nfs_client *clp)
 {
+	clp->cl_call_sync = _nfs4_call_sync;
+
 #if defined(CONFIG_NFS_V4_1)
 	if (clp->cl_minorversion) {
 		struct nfs4_session *session = NULL;
@@ -1086,6 +1090,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 			return -ENOMEM;
 
 		clp->cl_session = session;
+		clp->cl_call_sync = _nfs4_call_sync_session;
 	}
 #endif /* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7cef45db9257..8d67c2865dc3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -206,6 +206,18 @@ extern int nfs4_path_walk(struct nfs_server *server,
 			  const char *path);
 #endif
 
+/* nfs4proc.c */
+extern int _nfs4_call_sync(struct nfs_server *server,
+			   struct rpc_message *msg,
+			   struct nfs4_sequence_args *args,
+			   struct nfs4_sequence_res *res,
+			   int cache_reply);
+extern int _nfs4_call_sync_session(struct nfs_server *server,
+				   struct rpc_message *msg,
+				   struct nfs4_sequence_args *args,
+				   struct nfs4_sequence_res *res,
+				   int cache_reply);
+
 /*
  * Determine if sessions are in use.
  */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e08edc99faac..4fc5b385f61e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -271,6 +271,33 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 	spin_unlock(&clp->cl_lock);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
+int _nfs4_call_sync_session(struct nfs_server *server,
+			    struct rpc_message *msg,
+			    struct nfs4_sequence_args *args,
+			    struct nfs4_sequence_res *res,
+			    int cache_reply)
+{
+	/* in preparation for setting up the sequence op */
+	return rpc_call_sync(server->client, msg, 0);
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+int _nfs4_call_sync(struct nfs_server *server,
+		    struct rpc_message *msg,
+		    struct nfs4_sequence_args *args,
+		    struct nfs4_sequence_res *res,
+		    int cache_reply)
+{
+	return rpc_call_sync(server->client, msg, 0);
+}
+
+#define nfs4_call_sync(server, msg, args, res, cache_reply) \
+	(server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \
+			&(res)->seq_res, (cache_reply))
+
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
@@ -1269,7 +1296,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	} else
 		memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
 
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (status == 0 && state != NULL)
 		renew_lease(server, timestamp);
 	return status;
@@ -1595,7 +1622,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	};
 	int status;
 
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	if (status == 0) {
 		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
@@ -1609,6 +1636,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->acl_bitmask = res.acl_bitmask;
 	}
+
 	return status;
 }
 
@@ -1641,7 +1669,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_resp = &res,
 	};
 	nfs_fattr_init(info->fattr);
-	return rpc_call_sync(server->client, &msg, 0);
+	return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 
 static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -1731,7 +1759,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 	};
 	
 	nfs_fattr_init(fattr);
-	return rpc_call_sync(server->client, &msg, 0);
+	return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 
 static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
@@ -1815,7 +1843,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d
 	nfs_fattr_init(fattr);
 
 	dprintk("NFS call  lookupfh %s\n", name->name);
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	dprintk("NFS reply lookupfh: %d\n", status);
 	return status;
 }
@@ -1901,7 +1929,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 			args.access |= NFS4_ACCESS_EXECUTE;
 	}
 	nfs_fattr_init(&fattr);
-	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	if (!status) {
 		entry->mask = 0;
 		if (res.access & NFS4_ACCESS_READ)
@@ -1967,7 +1995,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
 		.rpc_resp = &res,
 	};
 
-	return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
 }
 
 static int nfs4_proc_readlink(struct inode *inode, struct page *page,
@@ -2061,7 +2089,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 	int			status;
 
 	nfs_fattr_init(&res.dir_attr);
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &args, &res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, &res.dir_attr);
@@ -2129,7 +2157,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 	
 	nfs_fattr_init(res.old_fattr);
 	nfs_fattr_init(res.new_fattr);
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
@@ -2178,7 +2206,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 
 	nfs_fattr_init(res.fattr);
 	nfs_fattr_init(res.dir_attr);
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2239,7 +2267,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 
 static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
 {
-	int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+	int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg,
+				    &data->arg, &data->res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &data->res.dir_cinfo);
 		nfs_post_op_update_inode(dir, data->res.dir_fattr);
@@ -2348,7 +2377,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 			(unsigned long long)cookie);
 	nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
 	res.pgbase = args.pgbase;
-	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
 	if (status == 0)
 		memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
 
@@ -2436,7 +2465,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 	};
 
 	nfs_fattr_init(fsstat->fattr);
-	return rpc_call_sync(server->client, &msg, 0);
+	return  nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 
 static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
@@ -2467,7 +2496,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_resp = &res,
 	};
 
-	return rpc_call_sync(server->client, &msg, 0);
+	return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 
 static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
@@ -2512,7 +2541,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
 	}
 
 	nfs_fattr_init(pathconf->fattr);
-	return rpc_call_sync(server->client, &msg, 0);
+	return nfs4_call_sync(server, &msg, &args, &res, 0);
 }
 
 static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -2781,7 +2810,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 		resp_buf = buf;
 		buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
 	}
-	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
 	if (ret)
 		goto out_free;
 	if (res.acl_len > args.acl_len)
@@ -2854,7 +2883,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 		return -EOPNOTSUPP;
 	nfs_inode_return_delegation(inode);
 	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
-	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	nfs_access_zap_cache(inode);
 	nfs_zap_acl_cache(inode);
 	return ret;
@@ -3143,7 +3172,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
 	arg.lock_owner.id = lsp->ls_id.id;
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	switch (status) {
 		case 0:
 			request->fl_type = F_UNLCK;
@@ -3736,7 +3765,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 	nfs_fattr_init(&fs_locations->fattr);
 	fs_locations->server = server;
 	fs_locations->nlocations = 0;
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	nfs_fixup_referral_attributes(&fs_locations->fattr);
 	dprintk("%s: returned status = %d\n", __func__, status);
 	return status;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b47c0fc55d42..206485e5082f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -12,6 +12,9 @@
 struct nfs4_session;
 struct nfs_iostats;
 struct nlm_host;
+struct nfs4_sequence_args;
+struct nfs4_sequence_res;
+struct nfs_server;
 
 /*
  * The nfs_client identifies our client state to the server.
@@ -67,6 +70,11 @@ struct nfs_client {
 	 */
 	char			cl_ipaddr[48];
 	unsigned char		cl_id_uniquifier;
+	int		     (* cl_call_sync)(struct nfs_server *server,
+					      struct rpc_message *msg,
+					      struct nfs4_sequence_args *args,
+					      struct nfs4_sequence_res *res,
+					      int cache_reply);
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_V4_1
-- 
cgit v1.2.3-71-gd317


From f3752975caa716709c5ea0b0820b86111d921df4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:22:04 -0400
Subject: nfs41: pass *session in seq_args and seq_res

To be used for getting the rpc's minorversion and for nfs41 xdr
{en,de}coding of the sequence operation.
Reset the seq session ptrs for minorversion=0 rpc calls.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 1 +
 include/linux/nfs_xdr.h | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4fc5b385f61e..49cf969417c0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -291,6 +291,7 @@ int _nfs4_call_sync(struct nfs_server *server,
 		    struct nfs4_sequence_res *res,
 		    int cache_reply)
 {
+	args->sa_session = res->sr_session = NULL;
 	return rpc_call_sync(server->client, msg, 0);
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d837f10c49ef..f5675063f951 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -162,11 +162,11 @@ struct nfs4_slot {
 };
 
 struct nfs4_sequence_args {
-	/* stub */
+	struct nfs4_session	*sa_session;
 };
 
 struct nfs4_sequence_res {
-	/* stub */
+	struct nfs4_session	*sr_session;
 };
 
 /*
-- 
cgit v1.2.3-71-gd317


From 5f7dbd5c752d88310d8fe1feedefd5c6496eff48 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:22:05 -0400
Subject: nfs41: set up seq_res.sr_slotid

Initialize nfs4_sequence_res sr_slotid to NFS4_MAX_SLOT_TABLE.

[was nfs41: sequence res use slotid]
Signed-off-by: Andy Adamson <andros@netapp.com>
[pulled definition of struct nfs4_sequence_res.sr_slotid to here]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 5 +++++
 fs/nfs/read.c           | 1 +
 fs/nfs/unlink.c         | 1 +
 fs/nfs/write.c          | 2 ++
 include/linux/nfs_xdr.h | 1 +
 5 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 49cf969417c0..5878930b4c3b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -371,6 +371,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
+	p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	if (flags & O_EXCL) {
 		u32 *s = (u32 *) p->o_arg.u.verifier.data;
 		s[0] = jiffies;
@@ -1463,6 +1464,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
+	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	calldata->path.mnt = mntget(path->mnt);
 	calldata->path.dentry = dget(path->dentry);
 
@@ -3088,6 +3090,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	memcpy(&data->stateid, stateid, sizeof(data->stateid));
 	data->res.fattr = &data->fattr;
 	data->res.server = server;
+	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	nfs_fattr_init(data->res.fattr);
 	data->timestamp = jiffies;
 	data->rpc_status = 0;
@@ -3240,6 +3243,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	p->arg.fl = &p->fl;
 	p->arg.seqid = seqid;
 	p->res.seqid = seqid;
+	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	p->arg.stateid = &lsp->ls_stateid;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
@@ -3412,6 +3416,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_id.id;
 	p->res.lock_seqid = p->arg.lock_seqid;
+	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
 	p->ctx = get_nfs_open_context(ctx);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4ace3c50a8eb..70ba2b4cb9a4 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -46,6 +46,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
+		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index ecc295347775..83d0ce2600ab 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -241,6 +241,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		status = PTR_ERR(data->cred);
 		goto out_free;
 	}
+	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e560a78995a3..035e6fb9f57e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -52,6 +52,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
+		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	}
 	return p;
 }
@@ -71,6 +72,7 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
+		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f5675063f951..db0d1236aae7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -167,6 +167,7 @@ struct nfs4_sequence_args {
 
 struct nfs4_sequence_res {
 	struct nfs4_session	*sr_session;
+	u8			sr_slotid;	/* slot used to send request */
 };
 
 /*
-- 
cgit v1.2.3-71-gd317


From fbcd4abcb3841f85578985c09c6df85aa41b0ae8 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:22:15 -0400
Subject: nfs41: setup_sequence method

Allocate a slot in the session slot table and set the sequence op arguments.

Called at the rpc prepare stage.

Add a status to nfs4_sequence_res, initialize it to one so that we catch
rpc level failures which do not go through decode_sequence, which sets
the new status field.

Note that upon an rpc level failure, we don't know if the server processed the
sequence operation or not. Proceed as if the server did process the sequence
operation.

Signed-off-by: Rahul Iyer <iyer@netapp.com>
[nfs41: sequence args use slotid]
[nfs41: find slot return slotid]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: remove SEQ4_STATUS_USE_TK_STATUS]
As per 11-14-08 review
[move extern declaration from nfs41: sequence setup/done support]
[removed sa_session definition, changed sa_cache_this into a u8 to reduce footprint]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: rpc_sleep_on slot_tbl_waitq must be called under slot_tbl_lock]
    Otherwise there's a race (which we've hit) with nfs4_free_slot:
    nfs41_setup_sequence sees a full slot table and unlocks slot_tbl_lock,
    nfs4_free_slot then runs concurrently and calls rpc_wake_up_next
    while there's nobody to wake up yet, and control returns to
    nfs41_setup_sequence, which goes to sleep even though the slot table
    is actually empty now and there's no one left to wake it up.
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        | 10 ++++++++++
 fs/nfs/nfs4proc.c       | 34 ++++++++++++++++++++++++++++++++--
 include/linux/nfs_xdr.h |  4 ++++
 3 files changed, 46 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index acac6f8c3d39..eccf4e93e7d7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -203,8 +203,18 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
 #if defined(CONFIG_NFS_V4_1)
+extern int nfs4_setup_sequence(struct nfs_client *clp,
+		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+#else /* CONFIG_NFS_v4_1 */
+static inline int nfs4_setup_sequence(struct nfs_client *clp,
+		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+		int cache_reply, struct rpc_task *task)
+{
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 extern const u32 nfs4_fattr_bitmap[2];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a0946a0d116e..c9618080317e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -280,6 +280,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * If found, we mark the slot as used, update the highest_used_slotid,
  * and respectively set up the sequence operation args.
  * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise.
+ *
+	 * Note: must be called under the slot_tbl_lock.
  */
 static u8
 nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
@@ -288,7 +290,6 @@ nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
 	u8 ret_id = NFS4_MAX_SLOT_TABLE;
 	BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
 
-	spin_lock(&tbl->slot_tbl_lock);
 	dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
 		__func__, tbl->used_slots[0], tbl->highest_used_slotid,
 		tbl->max_slots);
@@ -302,7 +303,6 @@ nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
 out:
 	dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n",
 		__func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id);
-	spin_unlock(&tbl->slot_tbl_lock);
 	return ret_id;
 }
 
@@ -312,12 +312,42 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 				int cache_reply,
 				struct rpc_task *task)
 {
+	struct nfs4_slot *slot;
+	struct nfs4_slot_table *tbl;
+	u8 slotid;
+
+	dprintk("--> %s\n", __func__);
 	/* slot already allocated? */
 	if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
 		return 0;
 
 	memset(res, 0, sizeof(*res));
 	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+	tbl = &session->fc_slot_table;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	slotid = nfs4_find_slot(tbl, task);
+	if (slotid == NFS4_MAX_SLOT_TABLE) {
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+		spin_unlock(&tbl->slot_tbl_lock);
+		dprintk("<-- %s: no free slots\n", __func__);
+		return -EAGAIN;
+	}
+	spin_unlock(&tbl->slot_tbl_lock);
+
+	slot = tbl->slots + slotid;
+	args->sa_slotid = slotid;
+	args->sa_cache_this = cache_reply;
+
+	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
+
+	res->sr_slotid = slotid;
+	res->sr_renewal_time = jiffies;
+	/*
+	 * sr_status is only set in decode_sequence, and so will remain
+	 * set to 1 if an rpc level failure occurs.
+	 */
+	res->sr_status = 1;
 	return 0;
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index db0d1236aae7..4ac14b40efc9 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -163,11 +163,15 @@ struct nfs4_slot {
 
 struct nfs4_sequence_args {
 	struct nfs4_session	*sa_session;
+	u8			sa_slotid;
+	u8			sa_cache_this;
 };
 
 struct nfs4_sequence_res {
 	struct nfs4_session	*sr_session;
 	u8			sr_slotid;	/* slot used to send request */
+	unsigned long		sr_renewal_time;
+	int			sr_status;	/* sequence operation status */
 };
 
 /*
-- 
cgit v1.2.3-71-gd317


From 99fe60d062cfecf382c036065b3278b82b6c5eff Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:22:29 -0400
Subject: nfs41: exchange_id operation

Implement the exchange_id operation conforming to
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26

Unlike NFSv4.0, NFSv4.1 requires machine credentials. RPC_AUTH_GSS machine
credentials will be passed into the kernel at mount time to be available for
the exchange_id operation.

RPC_AUTH_UNIX root mounts can use the UNIX root credential. Store the root
credential in the nfs_client struct.

Without a credential, NFSv4.1 state renewal fails.

[nfs41: establish clientid via exchange id only if cred != NULL]
Signed-off-by: Andy Adamson <andros@umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: move nfstime4 from under CONFIG_NFS_V4_1]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: do not wait a lease time in exchange id]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: pass *session in seq_args and seq_res]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
[nfs41: Ignoring impid in decode_exchange_id is missing a READ_BUF]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: fix Xcode_exchange_id's xdr Xcoding pointer type]
[nfs41: get rid of unused struct nfs41_exchange_id_res members]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4proc.c          |  61 ++++++++++++++++++++
 fs/nfs/nfs4xdr.c           | 139 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs4.h       |   1 +
 include/linux/nfs_fs_sb.h  |   6 ++
 include/linux/nfs_xdr.h    |  40 +++++++++++++
 include/linux/nfsd/state.h |   1 -
 6 files changed, 247 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dc0feb5837b1..6f384e290753 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -48,6 +48,7 @@
 #include <linux/smp_lock.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/module.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -433,11 +434,13 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	spin_unlock(&tbl->slot_tbl_lock);
 
 	slot = tbl->slots + slotid;
+	args->sa_session = session;
 	args->sa_slotid = slotid;
 	args->sa_cache_this = cache_reply;
 
 	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
 
+	res->sr_session = session;
 	res->sr_slotid = slotid;
 	res->sr_renewal_time = jiffies;
 	/*
@@ -4128,6 +4131,64 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 }
 
 #ifdef CONFIG_NFS_V4_1
+/*
+ * nfs4_proc_exchange_id()
+ *
+ * Since the clientid has expired, all compounds using sessions
+ * associated with the stale clientid will be returning
+ * NFS4ERR_BADSESSION in the sequence operation, and will therefore
+ * be in some phase of session reset.
+ */
+static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	nfs4_verifier verifier;
+	struct nfs41_exchange_id_args args = {
+		.client = clp,
+		.flags = clp->cl_exchange_flags,
+	};
+	struct nfs41_exchange_id_res res = {
+		.client = clp,
+	};
+	int status;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	__be32 *p;
+
+	dprintk("--> %s\n", __func__);
+	BUG_ON(clp == NULL);
+	p = (u32 *)verifier.data;
+	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
+	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
+	args.verifier = &verifier;
+
+	while (1) {
+		args.id_len = scnprintf(args.id, sizeof(args.id),
+					"%s/%s %u",
+					clp->cl_ipaddr,
+					rpc_peeraddr2str(clp->cl_rpcclient,
+							 RPC_DISPLAY_ADDR),
+					clp->cl_id_uniquifier);
+
+		status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+
+		if (status != NFS4ERR_CLID_INUSE)
+			break;
+
+		if (signalled())
+			break;
+
+		if (++clp->cl_id_uniquifier == 0)
+			break;
+	}
+
+	dprintk("<-- %s status= %d\n", __func__, status);
+	return status;
+}
+
 /* Destroy the slot table */
 static void nfs4_destroy_slot_table(struct nfs4_session *session)
 {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5b944cd57218..783c4214dccd 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -246,6 +246,27 @@ static int nfs4_stat_to_errno(int);
 				(0)
 
 #if defined(CONFIG_NFS_V4_1)
+#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
+				encode_verifier_maxsz + \
+				1 /* co_ownerid.len */ + \
+				XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+				1 /* flags */ + \
+				1 /* spa_how */ + \
+				0 /* SP4_NONE (for now) */ + \
+				1 /* zero implemetation id array */)
+#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
+				2 /* eir_clientid */ + \
+				1 /* eir_sequenceid */ + \
+				1 /* eir_flags */ + \
+				1 /* spr_how */ + \
+				0 /* SP4_NONE (for now) */ + \
+				2 /* eir_server_owner.so_minor_id */ + \
+				/* eir_server_owner.so_major_id<> */ \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+				/* eir_server_scope<> */ \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+				1 /* eir_server_impl_id array length */ + \
+				0 /* ignored eir_server_impl_id contents */)
 #define encode_sequence_maxsz	0 /* stub */
 #define decode_sequence_maxsz	0 /* stub */
 #else /* CONFIG_NFS_V4_1 */
@@ -594,6 +615,14 @@ static int nfs4_stat_to_errno(int);
 				 decode_putfh_maxsz + \
 				 decode_lookup_maxsz + \
 				 decode_fs_locations_maxsz)
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_exchange_id_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_exchange_id_maxsz)
+#define NFS4_dec_exchange_id_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_exchange_id_maxsz)
+#endif /* CONFIG_NFS_V4_1 */
 
 static const umode_t nfs_type2fmt[] = {
 	[NF4BAD] = 0,
@@ -1455,7 +1484,29 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 	hdr->replen += decode_delegreturn_maxsz;
 }
 
+#if defined(CONFIG_NFS_V4_1)
 /* NFSv4.1 operations */
+static void encode_exchange_id(struct xdr_stream *xdr,
+			       struct nfs41_exchange_id_args *args,
+			       struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	RESERVE_SPACE(4 + sizeof(args->verifier->data));
+	WRITE32(OP_EXCHANGE_ID);
+	WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
+
+	encode_string(xdr, args->id_len, args->id);
+
+	RESERVE_SPACE(12);
+	WRITE32(args->flags);
+	WRITE32(0);	/* zero length state_protect4_a */
+	WRITE32(0);	/* zero length implementation id array */
+	hdr->nops++;
+	hdr->replen += decode_exchange_id_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static void encode_sequence(struct xdr_stream *xdr,
 			    const struct nfs4_sequence_args *args,
 			    struct compound_hdr *hdr)
@@ -2162,6 +2213,26 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 	return 0;
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * EXCHANGE_ID request
+ */
+static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
+				    struct nfs41_exchange_id_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_exchange_id(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * START OF "GENERIC" DECODE ROUTINES.
  *   These may look a little ugly since they are imported from a "generic"
@@ -3877,6 +3948,52 @@ static int decode_delegreturn(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_DELEGRETURN);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+static int decode_exchange_id(struct xdr_stream *xdr,
+			      struct nfs41_exchange_id_res *res)
+{
+	__be32 *p;
+	uint32_t dummy;
+	int status;
+	struct nfs_client *clp = res->client;
+
+	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
+	if (status)
+		return status;
+
+	READ_BUF(8);
+	READ64(clp->cl_ex_clid);
+	READ_BUF(12);
+	READ32(clp->cl_seqid);
+	READ32(clp->cl_exchange_flags);
+
+	/* We ask for SP4_NONE */
+	READ32(dummy);
+	if (dummy != SP4_NONE)
+		return -EIO;
+
+	/* Throw away minor_id */
+	READ_BUF(8);
+
+	/* Throw away Major id */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	/* Throw away server_scope */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	/* Throw away Implementation id array */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static int decode_sequence(struct xdr_stream *xdr,
 			   struct nfs4_sequence_res *res,
 			   struct rpc_rqst *rqstp)
@@ -4774,6 +4891,25 @@ out:
 	return status;
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * EXCHANGE_ID request
+ */
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+				    void *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_exchange_id(&xdr, res);
+	return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
@@ -4943,6 +5079,9 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(GETACL,		enc_getacl,	dec_getacl),
   PROC(SETACL,		enc_setacl,	dec_setacl),
   PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
+#if defined(CONFIG_NFS_V4_1)
+  PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
+#endif /* CONFIG_NFS_V4_1 */
 };
 
 struct rpc_version		nfs_version4 = {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 7c36fcf2dfb7..ad65709ed8d3 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -21,6 +21,7 @@
 #define NFS4_FHSIZE		128
 #define NFS4_MAXPATHLEN		PATH_MAX
 #define NFS4_MAXNAMLEN		NAME_MAX
+#define NFS4_OPAQUE_LIMIT	1024
 #define NFS4_MAX_SESSIONID_LEN	16
 
 #define NFS4_ACCESS_READ        0x0001
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 206485e5082f..435ed556efb5 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -78,6 +78,12 @@ struct nfs_client {
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_V4_1
+	/* clientid returned from EXCHANGE_ID, used by session operations */
+	u64			cl_ex_clid;
+	/* The sequence id to use for the next CREATE_SESSION */
+	u32			cl_seqid;
+	/* The flags used for obtaining the clientid during EXCHANGE_ID */
+	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
 #endif /* CONFIG_NFS_V4_1 */
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 4ac14b40efc9..5d70b924af5e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -879,6 +879,46 @@ struct nfs4_fs_locations_res {
 
 #endif /* CONFIG_NFS_V4 */
 
+struct nfstime4 {
+	u64	seconds;
+	u32	nseconds;
+};
+
+#ifdef CONFIG_NFS_V4_1
+struct nfs_impl_id4 {
+	u32		domain_len;
+	char		*domain;
+	u32		name_len;
+	char		*name;
+	struct nfstime4	date;
+};
+
+#define NFS4_EXCHANGE_ID_LEN	(48)
+struct nfs41_exchange_id_args {
+	struct nfs_client		*client;
+	nfs4_verifier			*verifier;
+	unsigned int 			id_len;
+	char 				id[NFS4_EXCHANGE_ID_LEN];
+	u32				flags;
+};
+
+struct server_owner {
+	uint64_t			minor_id;
+	uint32_t			major_id_sz;
+	char				major_id[NFS4_OPAQUE_LIMIT];
+};
+
+struct server_scope {
+	uint32_t			server_scope_sz;
+	char 				server_scope[NFS4_OPAQUE_LIMIT];
+};
+
+struct nfs41_exchange_id_res {
+	struct nfs_client		*client;
+	u32				flags;
+};
+#endif /* CONFIG_NFS_V4_1 */
+
 struct nfs_page;
 
 #define NFS_PAGEVEC_SIZE	(8U)
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 4d61c873feed..7ef4b7ad1214 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -41,7 +41,6 @@
 #include <linux/kref.h>
 #include <linux/sunrpc/clnt.h>
 
-#define NFS4_OPAQUE_LIMIT 1024
 typedef struct {
 	u32             cl_boot;
 	u32             cl_id;
-- 
cgit v1.2.3-71-gd317


From 2050f0cc0703aab7cee798b3cb47037754f368bc Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:22:30 -0400
Subject: nfs41: get_lease_time

get_lease_time uses the FSINFO rpc operation to
get the lease time attribute.

nfs4_proc_get_lease_time() is only called from the state manager on session
setup, so don't recover from clientid or sequence level errors.

We do need to recover from NFS4ERR_DELAY or NFS4ERR_GRACE.
Use NFS4_POLL_RETRY_MIN - the Linux server returns NFS4ERR_DELAY when an
upcall is needed to resolve an uncached export referenced by a file handle.

[nfs41: sequence res use slotid]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: remove extraneous rpc_clnt pointer]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: have get_lease_time work on nfs_client]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: get_lease_time recover from NFS4ERR_DELAY]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: pass *session in seq_args and seq_res]
[define nfs4_get_lease_time_{args,res}]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4xdr.c        | 51 +++++++++++++++++++++++++++
 include/linux/nfs_xdr.h |  9 +++++
 3 files changed, 154 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6f384e290753..eafc99afd356 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4189,6 +4189,100 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	return status;
 }
 
+struct nfs4_get_lease_time_data {
+	struct nfs4_get_lease_time_args *args;
+	struct nfs4_get_lease_time_res *res;
+	struct nfs_client *clp;
+};
+
+static void nfs4_get_lease_time_prepare(struct rpc_task *task,
+					void *calldata)
+{
+	int ret;
+	struct nfs4_get_lease_time_data *data =
+			(struct nfs4_get_lease_time_data *)calldata;
+
+	dprintk("--> %s\n", __func__);
+	/* just setup sequence, do not trigger session recovery
+	   since we're invoked within one */
+	ret = nfs41_setup_sequence(data->clp->cl_session,
+					&data->args->la_seq_args,
+					&data->res->lr_seq_res, 0, task);
+
+	BUG_ON(ret == -EAGAIN);
+	rpc_call_start(task);
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * Called from nfs4_state_manager thread for session setup, so don't recover
+ * from sequence operation or clientid errors.
+ */
+static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_get_lease_time_data *data =
+			(struct nfs4_get_lease_time_data *)calldata;
+
+	dprintk("--> %s\n", __func__);
+	nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
+		rpc_delay(task, NFS4_POLL_RETRY_MIN);
+		task->tk_status = 0;
+		rpc_restart_call(task);
+		return;
+	}
+	nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res);
+	dprintk("<-- %s\n", __func__);
+}
+
+struct rpc_call_ops nfs4_get_lease_time_ops = {
+	.rpc_call_prepare = nfs4_get_lease_time_prepare,
+	.rpc_call_done = nfs4_get_lease_time_done,
+};
+
+int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
+{
+	struct rpc_task *task;
+	struct nfs4_get_lease_time_args args;
+	struct nfs4_get_lease_time_res res = {
+		.lr_fsinfo = fsinfo,
+	};
+	struct nfs4_get_lease_time_data data = {
+		.args = &args,
+		.res = &res,
+		.clp = clp,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_get_lease_time_ops,
+		.callback_data = &data
+	};
+	int status;
+
+	res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	dprintk("--> %s\n", __func__);
+	task = rpc_run_task(&task_setup);
+
+	if (IS_ERR(task))
+		status = PTR_ERR(task);
+	else {
+		status = task->tk_status;
+		rpc_put_task(task);
+	}
+	dprintk("<-- %s return %d\n", __func__, status);
+
+	return status;
+}
+
 /* Destroy the slot table */
 static void nfs4_destroy_slot_table(struct nfs4_session *session)
 {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 783c4214dccd..85ee1d17a461 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -622,6 +622,14 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_exchange_id_sz \
 				(compound_decode_hdr_maxsz + \
 				 decode_exchange_id_maxsz)
+#define NFS4_enc_get_lease_time_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putrootfh_maxsz + \
+					 encode_fsinfo_maxsz)
+#define NFS4_dec_get_lease_time_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putrootfh_maxsz + \
+					 decode_fsinfo_maxsz)
 #endif /* CONFIG_NFS_V4_1 */
 
 static const umode_t nfs_type2fmt[] = {
@@ -2231,6 +2239,27 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
 	encode_nops(&hdr);
 	return 0;
 }
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
+				       struct nfs4_get_lease_time_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+	};
+	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(&xdr, &hdr);
+	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -4908,6 +4937,27 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
 		status = decode_exchange_id(&xdr, res);
 	return status;
 }
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+				       struct nfs4_get_lease_time_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+	if (!status)
+		status = decode_putrootfh(&xdr);
+	if (!status)
+		status = decode_fsinfo(&xdr, res->lr_fsinfo);
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
@@ -5081,6 +5131,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
 #if defined(CONFIG_NFS_V4_1)
   PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
+  PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5d70b924af5e..ca643aa87d46 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -174,6 +174,15 @@ struct nfs4_sequence_res {
 	int			sr_status;	/* sequence operation status */
 };
 
+struct nfs4_get_lease_time_args {
+	struct nfs4_sequence_args	la_seq_args;
+};
+
+struct nfs4_get_lease_time_res {
+	struct nfs_fsinfo	       *lr_fsinfo;
+	struct nfs4_sequence_res	lr_seq_res;
+};
+
 /*
  * Arguments to the open call.
  */
-- 
cgit v1.2.3-71-gd317


From fc931582c260e53ca5ca23bd70ccc9b2265cca9f Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:22:31 -0400
Subject: nfs41: create_session operation

Implement the create_session operation conforming to
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26

Set the real fore channel max operations to preserve server resources.
Note: If the server returns < NFS4_MAX_OPS, the client will very soon
get an NFS4ERR_TOO_MANY_OPS. A later patch will handle this.

Set the max_rqst_sz and max_resp_sz to PAGE_SIZE - we preallocate the buffers.

Set the back channel max_resp_sz_cached to zero to force the client to
always set csa_cachethis to FALSE because the current implementation
of the back channel DRC only supports caching the CB_SEQUENCE operation.

The client back channel server supports one slot, and desires 2 operations
per compound.

Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Andy Adamson<andros@umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: remove extraneous rpc_clnt pointer]
Use the struct nfs_client cl_rpcclient.
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: nfs4_init_channel_attrs, just use nfs41_create_session_args]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: use rsize and wsize for session channel attributes]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: set channel max operations]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: set back channel attributes]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: obliterate nfs4_adjust_channel_attrs]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: have create_session work on nfs_client]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: move CONFIG_NFS_V4_1 endif]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: pass *session in seq_args and seq_res]
[moved nfs4_init_slot_table definition here]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: use kcalloc to allocate slot table]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
[nfs41: fix Xcode_create_session's xdr Xcoding pointer type]
[nfs41: refactor decoding of channel attributes]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4proc.c       | 172 ++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4xdr.c        | 185 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs4.h    |  10 +++
 include/linux/nfs_xdr.h |  12 ++++
 4 files changed, 379 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index eafc99afd356..7d81d6e57533 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -54,6 +54,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
+#include "callback.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
@@ -4283,6 +4284,50 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 	return status;
 }
 
+/*
+ * Initialize slot table
+ */
+static int nfs4_init_slot_table(struct nfs4_session *session)
+{
+	struct nfs4_slot_table *tbl = &session->fc_slot_table;
+	int i, max_slots = session->fc_attrs.max_reqs;
+	struct nfs4_slot *slot;
+	int ret = -ENOMEM;
+
+	BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
+
+	dprintk("--> %s: max_reqs=%u\n", __func__,
+		session->fc_attrs.max_reqs);
+
+	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
+	if (!slot)
+		goto out;
+	for (i = 0; i < max_slots; ++i)
+		slot[i].seq_nr = 1;
+	ret = 0;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	if (tbl->slots != NULL) {
+		spin_unlock(&tbl->slot_tbl_lock);
+		dprintk("%s: slot table already initialized. tbl=%p slots=%p\n",
+			__func__, tbl, tbl->slots);
+		WARN_ON(1);
+		goto out_free;
+	}
+	tbl->max_slots = max_slots;
+	tbl->slots = slot;
+	tbl->highest_used_slotid = -1;  /* no slot is currently used */
+	spin_unlock(&tbl->slot_tbl_lock);
+	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
+		tbl, tbl->slots, tbl->max_slots);
+out:
+	dprintk("<-- %s: return %d\n", __func__, ret);
+	return ret;
+out_free:
+	kfree(slot);
+	goto out;
+}
+
 /* Destroy the slot table */
 static void nfs4_destroy_slot_table(struct nfs4_session *session)
 {
@@ -4314,6 +4359,133 @@ void nfs4_destroy_session(struct nfs4_session *session)
 	kfree(session);
 }
 
+/*
+ * Initialize the values to be used by the client in CREATE_SESSION
+ * If nfs4_init_session set the fore channel request and response sizes,
+ * use them.
+ *
+ * Set the back channel max_resp_sz_cached to zero to force the client to
+ * always set csa_cachethis to FALSE because the current implementation
+ * of the back channel DRC only supports caching the CB_SEQUENCE operation.
+ */
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+{
+	struct nfs4_session *session = args->client->cl_session;
+	unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz,
+		     mxresp_sz = session->fc_attrs.max_resp_sz;
+
+	if (mxrqst_sz == 0)
+		mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
+	if (mxresp_sz == 0)
+		mxresp_sz = NFS_MAX_FILE_IO_SIZE;
+	/* Fore channel attributes */
+	args->fc_attrs.headerpadsz = 0;
+	args->fc_attrs.max_rqst_sz = mxrqst_sz;
+	args->fc_attrs.max_resp_sz = mxresp_sz;
+	args->fc_attrs.max_resp_sz_cached = mxresp_sz;
+	args->fc_attrs.max_ops = NFS4_MAX_OPS;
+	args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
+
+	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
+		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+		__func__,
+		args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
+		args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops,
+		args->fc_attrs.max_reqs);
+
+	/* Back channel attributes */
+	args->bc_attrs.headerpadsz = 0;
+	args->bc_attrs.max_rqst_sz = PAGE_SIZE;
+	args->bc_attrs.max_resp_sz = PAGE_SIZE;
+	args->bc_attrs.max_resp_sz_cached = 0;
+	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
+	args->bc_attrs.max_reqs = 1;
+
+	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
+		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+		__func__,
+		args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz,
+		args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops,
+		args->bc_attrs.max_reqs);
+}
+
+static int _nfs4_proc_create_session(struct nfs_client *clp)
+{
+	struct nfs4_session *session = clp->cl_session;
+	struct nfs41_create_session_args args = {
+		.client = clp,
+		.cb_program = NFS4_CALLBACK,
+	};
+	struct nfs41_create_session_res res = {
+		.client = clp,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	nfs4_init_channel_attrs(&args);
+	args.flags = (SESSION4_PERSIST);
+
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
+
+	/* Set the negotiated values in the session's channel_attrs struct */
+
+	if (!status) {
+		/* Increment the clientid slot sequence id */
+		clp->cl_seqid++;
+	}
+
+	return status;
+}
+
+/*
+ * Issues a CREATE_SESSION operation to the server.
+ * It is the responsibility of the caller to verify the session is
+ * expired before calling this routine.
+ */
+int nfs4_proc_create_session(struct nfs_client *clp)
+{
+	int status;
+	unsigned *ptr;
+	struct nfs_fsinfo fsinfo;
+	struct nfs4_session *session = clp->cl_session;
+
+	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
+
+	status = _nfs4_proc_create_session(clp);
+	if (status)
+		goto out;
+
+	/* Init the fore channel */
+	status = nfs4_init_slot_table(session);
+	dprintk("fore channel slot table initialization returned %d\n", status);
+	if (status)
+		goto out;
+
+	ptr = (unsigned *)&session->sess_id.data[0];
+	dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
+		clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
+
+	/* Get the lease time */
+	status = nfs4_proc_get_lease_time(clp, &fsinfo);
+	if (status == 0) {
+		/* Update lease time and schedule renewal */
+		spin_lock(&clp->cl_lock);
+		clp->cl_lease_time = fsinfo.lease_time * HZ;
+		clp->cl_last_renewal = jiffies;
+		clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+		spin_unlock(&clp->cl_lock);
+
+		nfs4_schedule_state_renewal(clp);
+	}
+out:
+	dprintk("<-- %s\n", __func__);
+	return status;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 85ee1d17a461..7a243a2cf0be 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -246,6 +246,8 @@ static int nfs4_stat_to_errno(int);
 				(0)
 
 #if defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MACHINE_NAME_LEN (64)
+
 #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
 				encode_verifier_maxsz + \
 				1 /* co_ownerid.len */ + \
@@ -267,6 +269,31 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
 				1 /* eir_server_impl_id array length */ + \
 				0 /* ignored eir_server_impl_id contents */)
+#define encode_channel_attrs_maxsz  (6 + 1 /* ca_rdma_ird.len (0) */)
+#define decode_channel_attrs_maxsz  (6 + \
+				     1 /* ca_rdma_ird.len */ + \
+				     1 /* ca_rdma_ird */)
+#define encode_create_session_maxsz  (op_encode_hdr_maxsz + \
+				     2 /* csa_clientid */ + \
+				     1 /* csa_sequence */ + \
+				     1 /* csa_flags */ + \
+				     encode_channel_attrs_maxsz + \
+				     encode_channel_attrs_maxsz + \
+				     1 /* csa_cb_program */ + \
+				     1 /* csa_sec_parms.len (1) */ + \
+				     1 /* cb_secflavor (AUTH_SYS) */ + \
+				     1 /* stamp */ + \
+				     1 /* machinename.len */ + \
+				     XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \
+				     1 /* uid */ + \
+				     1 /* gid */ + \
+				     1 /* gids.len (0) */)
+#define decode_create_session_maxsz  (op_decode_hdr_maxsz +	\
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* csr_sequence */ + \
+				     1 /* csr_flags */ + \
+				     decode_channel_attrs_maxsz + \
+				     decode_channel_attrs_maxsz)
 #define encode_sequence_maxsz	0 /* stub */
 #define decode_sequence_maxsz	0 /* stub */
 #else /* CONFIG_NFS_V4_1 */
@@ -622,6 +649,12 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_exchange_id_sz \
 				(compound_decode_hdr_maxsz + \
 				 decode_exchange_id_maxsz)
+#define NFS4_enc_create_session_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_create_session_maxsz)
+#define NFS4_dec_create_session_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_create_session_maxsz)
 #define NFS4_enc_get_lease_time_sz	(compound_encode_hdr_maxsz + \
 					 encode_sequence_maxsz + \
 					 encode_putrootfh_maxsz + \
@@ -712,6 +745,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 static void encode_nops(struct compound_hdr *hdr)
 {
+	BUG_ON(hdr->nops > NFS4_MAX_OPS);
 	*hdr->nops_p = htonl(hdr->nops);
 }
 
@@ -1513,6 +1547,68 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	hdr->nops++;
 	hdr->replen += decode_exchange_id_maxsz;
 }
+
+static void encode_create_session(struct xdr_stream *xdr,
+				  struct nfs41_create_session_args *args,
+				  struct compound_hdr *hdr)
+{
+	__be32 *p;
+	char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
+	uint32_t len;
+	struct nfs_client *clp = args->client;
+
+	RESERVE_SPACE(4);
+	WRITE32(OP_CREATE_SESSION);
+
+	RESERVE_SPACE(8);
+	WRITE64(clp->cl_ex_clid);		/* csa_clientid */
+
+	RESERVE_SPACE(8);
+	WRITE32(clp->cl_seqid);			/* csa_sequence */
+	WRITE32(args->flags);			/* csa_flags */
+
+	RESERVE_SPACE(2*28);			/* 2 channel_attrs */
+	/* Fore Channel */
+	WRITE32(args->fc_attrs.headerpadsz);	/* header padding size */
+	WRITE32(args->fc_attrs.max_rqst_sz);	/* max req size */
+	WRITE32(args->fc_attrs.max_resp_sz);	/* max resp size */
+	WRITE32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	WRITE32(args->fc_attrs.max_ops);	/* max operations */
+	WRITE32(args->fc_attrs.max_reqs);	/* max requests */
+	WRITE32(0);				/* rdmachannel_attrs */
+
+	/* Back Channel (every field, padding included, is from bc_attrs) */
+	WRITE32(args->bc_attrs.headerpadsz);	/* header padding size */
+	WRITE32(args->bc_attrs.max_rqst_sz);	/* max req size */
+	WRITE32(args->bc_attrs.max_resp_sz);	/* max resp size */
+	WRITE32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	WRITE32(args->bc_attrs.max_ops);	/* max operations */
+	WRITE32(args->bc_attrs.max_reqs);	/* max requests */
+	WRITE32(0);				/* rdmachannel_attrs */
+
+	RESERVE_SPACE(4);
+	WRITE32(args->cb_program);		/* cb_program */
+
+	RESERVE_SPACE(4);			/* # of security flavors */
+	WRITE32(1);
+
+	RESERVE_SPACE(4);
+	WRITE32(RPC_AUTH_UNIX);			/* auth_sys */
+
+	/* authsys_parms rfc1831 */
+	RESERVE_SPACE(4);
+	WRITE32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
+	len = scnprintf(machine_name, sizeof(machine_name), "%s",
+			clp->cl_ipaddr);
+	RESERVE_SPACE(16 + len);		/* 4 words + machine name */
+	WRITE32(len);
+	WRITEMEM(machine_name, len);
+	WRITE32(0);				/* UID */
+	WRITE32(0);				/* GID */
+	WRITE32(0);				/* No more gids */
+	hdr->nops++;
+	hdr->replen += decode_create_session_maxsz;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void encode_sequence(struct xdr_stream *xdr,
@@ -2240,6 +2336,24 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
 	return 0;
 }
 
+/*
+ * a CREATE_SESSION request
+ */
+static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
+				       struct nfs41_create_session_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_create_session(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
 /*
  * a GET_LEASE_TIME request
  */
@@ -4021,6 +4135,59 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 
 	return 0;
 }
+
+static int decode_chan_attrs(struct xdr_stream *xdr,
+			     struct nfs4_channel_attrs *attrs)
+{
+	__be32 *p;
+	u32 nr_attrs;
+
+	READ_BUF(28);
+	READ32(attrs->headerpadsz);
+	READ32(attrs->max_rqst_sz);
+	READ32(attrs->max_resp_sz);
+	READ32(attrs->max_resp_sz_cached);
+	READ32(attrs->max_ops);
+	READ32(attrs->max_reqs);
+	READ32(nr_attrs);
+	if (unlikely(nr_attrs > 1)) {
+		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
+			__func__, nr_attrs);
+		return -EINVAL;
+	}
+	if (nr_attrs == 1)
+		READ_BUF(4); /* skip rdma_attrs */
+	return 0;
+}
+
+static int decode_create_session(struct xdr_stream *xdr,
+				 struct nfs41_create_session_res *res)
+{
+	__be32 *p;
+	int status;
+	struct nfs_client *clp = res->client;
+	struct nfs4_session *session = clp->cl_session;
+
+	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
+
+	if (status)
+		return status;
+
+	/* sessionid */
+	READ_BUF(NFS4_MAX_SESSIONID_LEN);
+	COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
+
+	/* seqid, flags */
+	READ_BUF(8);
+	READ32(clp->cl_seqid);
+	READ32(session->flags);
+
+	/* Channel attributes */
+	status = decode_chan_attrs(xdr, &session->fc_attrs);
+	if (!status)
+		status = decode_chan_attrs(xdr, &session->bc_attrs);
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static int decode_sequence(struct xdr_stream *xdr,
@@ -4938,6 +5105,23 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
 	return status;
 }
 
+/*
+ * a CREATE_SESSION request
+ */
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+				       struct nfs41_create_session_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_create_session(&xdr, res);
+	return status;
+}
+
 /*
  * a GET_LEASE_TIME request
  */
@@ -5131,6 +5315,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
 #if defined(CONFIG_NFS_V4_1)
   PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
+  PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
   PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index ad65709ed8d3..bd2eba530667 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -131,6 +131,16 @@
 
 #define NFS4_MAX_UINT64	(~(u64)0)
 
+/* An NFS4 sessions server must support at least NFS4_MAX_OPS operations.
+ * If a compound requires more operations, adjust NFS4_MAX_OPS accordingly.
+ */
+#define NFS4_MAX_OPS   8
+
+/* Our NFS4 client back channel server only wants the cb_sequene and the
+ * actual operation per compound
+ */
+#define NFS4_MAX_BACK_CHANNEL_OPS 2
+
 enum nfs4_acl_whotype {
 	NFS4_ACL_WHO_NAMED = 0,
 	NFS4_ACL_WHO_OWNER,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ca643aa87d46..62f63fb0c4c8 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -926,6 +926,18 @@ struct nfs41_exchange_id_res {
 	struct nfs_client		*client;
 	u32				flags;
 };
+
+struct nfs41_create_session_args {
+	struct nfs_client	       *client;
+	uint32_t			flags;
+	uint32_t			cb_program;
+	struct nfs4_channel_attrs	fc_attrs;	/* Fore Channel */
+	struct nfs4_channel_attrs	bc_attrs;	/* Back Channel */
+};
+
+struct nfs41_create_session_res {
+	struct nfs_client	       *client;
+};
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs_page;
-- 
cgit v1.2.3-71-gd317


From 76db6d9500caeaa774a3e32a997eba30bbdc176b Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:22:38 -0400
Subject: nfs41: add session setup to the state manager

At mount, nfs_alloc_client sets the cl_state NFS4CLNT_LEASE_EXPIRED bit
and nfs4_alloc_session sets the NFS4CLNT_SESSION_SETUP bit, so both bits are
set when nfs4_lookup_root calls nfs4_recover_expired_lease which schedules
the nfs4_state_manager and waits for it to complete.

Place the session setup after the clientid establishment in nfs4_state_manager
so that the session is set up right after the clientid has been established
without rescheduling the state manager.

Unlike nfsv4.0, the nfs_client struct is not ready to use until the session
has been established.  Postpone marking the nfs_client struct to NFS_CS_READY
until after a successful CREATE_SESSION call so that other threads cannot use
the client until the session is established.

If the EXCHANGE_ID call fails and the session has not been set up (the
NFS4CLNT_SESSION_SETUP bit is set), mark the client with the error and return.

If the session setup CREATE_SESSION call fails with NFS4ERR_STALE_CLIENTID,
which could occur due to a server reboot or network partition in between the
EXCHANGE_ID and CREATE_SESSION calls, reset the NFS4CLNT_LEASE_EXPIRED and
NFS4CLNT_SESSION_SETUP bits and try again.

If the CREATE_SESSION call fails with other errors, mark the client with
the error and return.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>

[nfs41: NFS_CS_SESSION_SETUP cl_cons_state for back channel setup]
  On session setup, the CREATE_SESSION reply races with the server back channel
  probe which needs to succeed to setup the back channel. Set a new
  cl_cons_state NFS_CS_SESSION_SETUP just prior to the CREATE_SESSION call
  and add it as a valid state to nfs_find_client so that the client back channel
  can find the nfs_client struct and won't drop the server backchannel probe.
  Use a new cl_cons_state so that NFSv4.0 back channel behaviour which only
  sets NFS_CS_READY is unchanged.
  Adjust waiting on the nfs_client_active_wq accordingly.
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>

[nfs41: rename NFS_CS_SESSION_SETUP to NFS_CS_SESSION_INITING]
Signed-off-by: Andy Adamson <andros@netapp.com>
[nfs41: set NFS_CL_SESSION_INITING in alloc_session]
Signed-off-by: Andy Adamson <andros@netapp.com>
[nfs41: move session setup into a function]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[moved nfs4_proc_create_session declaration here]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 10 ++++++----
 fs/nfs/internal.h         |  1 +
 fs/nfs/nfs4_fs.h          |  2 ++
 fs/nfs/nfs4proc.c         | 10 ++++++++++
 fs/nfs/nfs4state.c        | 35 ++++++++++++++++++++++++++++++++++-
 include/linux/nfs_fs_sb.h |  1 +
 6 files changed, 54 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bb7432d83b5a..d28a987f569e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -366,7 +366,8 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
 		struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
 
 		/* Don't match clients that failed to initialise properly */
-		if (clp->cl_cons_state != NFS_CS_READY)
+		if (!(clp->cl_cons_state == NFS_CS_READY ||
+		      clp->cl_cons_state == NFS_CS_SESSION_INITING))
 			continue;
 
 		/* Different NFS versions cannot share the same nfs_client */
@@ -499,7 +500,7 @@ found_client:
 		nfs_free_client(new);
 
 	error = wait_event_killable(nfs_client_active_wq,
-				clp->cl_cons_state != NFS_CS_INITING);
+				clp->cl_cons_state < NFS_CS_INITING);
 	if (error < 0) {
 		nfs_put_client(clp);
 		return ERR_PTR(-ERESTARTSYS);
@@ -520,7 +521,7 @@ found_client:
 /*
  * Mark a server as ready or failed
  */
-static void nfs_mark_client_ready(struct nfs_client *clp, int state)
+void nfs_mark_client_ready(struct nfs_client *clp, int state)
 {
 	clp->cl_cons_state = state;
 	wake_up_all(&nfs_client_active_wq);
@@ -1135,7 +1136,8 @@ static int nfs4_init_client(struct nfs_client *clp,
 	if (error < 0)
 		goto error;
 
-	nfs_mark_client_ready(clp, NFS_CS_READY);
+	if (!nfs4_has_session(clp))
+		nfs_mark_client_ready(clp, NFS_CS_READY);
 	return 0;
 
 error:
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f62bc5226155..f3b310e8ea03 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -100,6 +100,7 @@ extern void nfs_free_server(struct nfs_server *server);
 extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fh *,
 					   struct nfs_fattr *);
+extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index eccf4e93e7d7..288717abaddc 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
 	NFS4CLNT_RECLAIM_REBOOT,
 	NFS4CLNT_RECLAIM_NOGRACE,
 	NFS4CLNT_DELEGRETURN,
+	NFS4CLNT_SESSION_SETUP,
 };
 
 /*
@@ -208,6 +209,7 @@ extern int nfs4_setup_sequence(struct nfs_client *clp,
 		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+extern int nfs4_proc_create_session(struct nfs_client *, int reset);
 #else /* CONFIG_NFS_v4_1 */
 static inline int nfs4_setup_sequence(struct nfs_client *clp,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0b1214740248..7fc0c9c8f5e3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4382,6 +4382,16 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
 	if (!session)
 		return NULL;
+
+	set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+	/*
+	 * The create session reply races with the server back
+	 * channel probe. Mark the client NFS_CS_SESSION_INITING
+	 * so that the client back channel can find the
+	 * nfs_client struct
+	 */
+	clp->cl_cons_state = NFS_CS_SESSION_INITING;
+
 	tbl = &session->fc_slot_table;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "Slot table");
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index bc683ed477e1..df5b4807daa7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1113,6 +1113,27 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 	return status;
 }
 
+#ifdef CONFIG_NFS_V4_1
+
+static int nfs4_initialize_session(struct nfs_client *clp)
+{
+	int status;
+
+	status = nfs4_proc_create_session(clp, 0);
+	if (!status) {
+		nfs_mark_client_ready(clp, NFS_CS_READY);
+	} else if (status == -NFS4ERR_STALE_CLIENTID) {
+		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+		set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+	} else {
+		nfs_mark_client_ready(clp, status);
+	}
+	return status;
+}
+#else /* CONFIG_NFS_V4_1 */
+static int nfs4_initialize_session(struct nfs_client *clp) { return 0; }
+#endif /* CONFIG_NFS_V4_1 */
+
 static void nfs4_state_manager(struct nfs_client *clp)
 {
 	int status = 0;
@@ -1126,6 +1147,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
 				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 				if (status == -EAGAIN)
 					continue;
+				if (clp->cl_cons_state ==
+							NFS_CS_SESSION_INITING)
+					nfs_mark_client_ready(clp, status);
 				goto out_error;
 			}
 			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
@@ -1136,7 +1160,16 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			if (status != 0)
 				continue;
 		}
-
+		/* Setup the session */
+		if (nfs4_has_session(clp) &&
+		   test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
+			status = nfs4_initialize_session(clp);
+			if (status) {
+				if (status == -NFS4ERR_STALE_CLIENTID)
+					continue;
+				goto out_error;
+			}
+		}
 		/* First recover reboot state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 435ed556efb5..d0902ccec9ce 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -24,6 +24,7 @@ struct nfs_client {
 	int			cl_cons_state;	/* current construction state (-ve: init error) */
 #define NFS_CS_READY		0		/* ready to be used */
 #define NFS_CS_INITING		1		/* busy initialising */
+#define NFS_CS_SESSION_INITING	2		/* busy initialising  session */
 	unsigned long		cl_res_state;	/* NFS resources state */
 #define NFS_CS_CALLBACK		1		/* - callback started */
 #define NFS_CS_IDMAP		2		/* - idmap started */
-- 
cgit v1.2.3-71-gd317


From aae2006e9b0c294114915c13022fa348e1a88023 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:22:40 -0400
Subject: nfs41: sunrpc: Export the call prepare state for session reset

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h  |  1 +
 include/linux/sunrpc/sched.h |  1 +
 net/sunrpc/clnt.c            | 13 +++++++++++++
 net/sunrpc/sched.c           |  2 +-
 4 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index c39a21040dcb..37881f1a0bd7 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -143,6 +143,7 @@ int		rpc_call_sync(struct rpc_clnt *clnt,
 			      const struct rpc_message *msg, int flags);
 struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred,
 			       int flags);
+void		rpc_restart_call_prepare(struct rpc_task *);
 void		rpc_restart_call(struct rpc_task *);
 void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 size_t		rpc_max_payload(struct rpc_clnt *);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 64981a2f1cae..177376880fab 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -237,6 +237,7 @@ void		rpc_show_tasks(void);
 int		rpc_init_mempool(void);
 void		rpc_destroy_mempool(void);
 extern struct workqueue_struct *rpciod_workqueue;
+void		rpc_prepare_task(struct rpc_task *task);
 
 static inline void rpc_exit(struct rpc_task *task, int status)
 {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 5abab094441f..d00e8135f866 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -694,6 +694,19 @@ void rpc_force_rebind(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_force_rebind);
 
+/*
+ * Restart an (async) RPC call from the call_prepare state.
+ * Usually called from within the exit handler.
+ */
+void
+rpc_restart_call_prepare(struct rpc_task *task)
+{
+	if (RPC_ASSASSINATED(task))
+		return;
+	task->tk_action = rpc_prepare_task;
+}
+EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
+
 /*
  * Restart an (async) RPC call. Usually called from within the
  * exit handler.
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index ff50a0546865..1102ce1251f7 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -569,7 +569,7 @@ EXPORT_SYMBOL_GPL(rpc_delay);
 /*
  * Helper to call task->tk_ops->rpc_call_prepare
  */
-static void rpc_prepare_task(struct rpc_task *task)
+void rpc_prepare_task(struct rpc_task *task)
 {
 	task->tk_ops->rpc_call_prepare(task, task->tk_calldata);
 }
-- 
cgit v1.2.3-71-gd317


From 56632b5bff5af10eb12d7e9499b5ffcadcb7a7b2 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:22:58 -0400
Subject: nfs41: client callback structures

Adds a new list of preallocated rpc_rqst structures to struct rpc_xprt,
and a spinlock to protect the list.  The list is used to preallocate
resources for the backchannel during backchannel requests.  Callbacks are
not expected to cause significant latency, so only one callback will
be allowed at this time.

It also adds a pointer to the NFS callback service so that
requests can be directed to it for processing.

New callback members added to svc_serv. The NFSv4.1 callback service will
sleep on the svc_serv->sv_cb_waitq until new callback requests arrive.
The request will be queued in svc_serv->sv_cb_list. This patch adds this
list, the sleep queue and spinlock to svc_serv.

[nfs41: NFSv4.1 callback support]
Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/svc.h  |  8 ++++++++
 include/linux/sunrpc/xprt.h | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 2a30775959e9..4a8afbd62007 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -96,6 +96,14 @@ struct svc_serv {
 	svc_thread_fn		sv_function;	/* main function for threads */
 	unsigned int		sv_drc_max_pages; /* Total pages for DRC */
 	unsigned int		sv_drc_pages_used;/* DRC pages used */
+#if defined(CONFIG_NFS_V4_1)
+	struct list_head	sv_cb_list;	/* queue for callback requests
+						 * that arrive over the same
+						 * connection */
+	spinlock_t		sv_cb_lock;	/* protects the svc_cb_list */
+	wait_queue_head_t	sv_cb_waitq;	/* sleep here if there are no
+						 * entries in the svc_cb_list */
+#endif /* CONFIG_NFS_V4_1 */
 };
 
 /*
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 08afe43118f4..703af7ebf6cf 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -97,6 +97,12 @@ struct rpc_rqst {
 
 	unsigned long		rq_xtime;	/* when transmitted */
 	int			rq_ntrans;
+
+#if defined(CONFIG_NFS_V4_1)
+	struct list_head	rq_bc_list;	/* Callback service list */
+	unsigned long		rq_bc_pa_state;	/* Backchannel prealloc state */
+	struct list_head	rq_bc_pa_list;	/* Backchannel prealloc list */
+#endif /* CONFIG_NFS_V4_1 */
 };
 #define rq_svec			rq_snd_buf.head
 #define rq_slen			rq_snd_buf.len
@@ -174,6 +180,14 @@ struct rpc_xprt {
 	spinlock_t		reserve_lock;	/* lock slot table */
 	u32			xid;		/* Next XID value to use */
 	struct rpc_task *	snd_task;	/* Task blocked in send */
+#if defined(CONFIG_NFS_V4_1)
+	struct svc_serv		*bc_serv;       /* The RPC service which will */
+						/* process the callback */
+	spinlock_t		bc_pa_lock;	/* Protects the preallocated
+						 * items */
+	struct list_head	bc_pa_list;	/* List of preallocated
+						 * backchannel rpc_rqst's */
+#endif /* CONFIG_NFS_V4_1 */
 	struct list_head	recv;
 
 	struct {
@@ -192,6 +206,14 @@ struct rpc_xprt {
 	const char		*address_strings[RPC_DISPLAY_MAX];
 };
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Backchannel flags
+ */
+#define	RPC_BC_PA_IN_USE	0x0001		/* Preallocated backchannel */
+						/* buffer in use */
+#endif /* CONFIG_NFS_V4_1 */
+
 struct xprt_create {
 	int			ident;		/* XPRT_TRANSPORT identifier */
 	struct sockaddr *	srcaddr;	/* optional local address */
-- 
cgit v1.2.3-71-gd317


From fb7a0b9addbdbbb13b7bc02abf55ee524ea19ce1 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:23:00 -0400
Subject: nfs41: New backchannel helper routines

This patch introduces support to set up the callback xprt on the client side.
It allocates/destroys the preallocated memory structures used to process
backchannel requests.

At setup time, xprt_setup_backchannel() is invoked to allocate one or
more rpc_rqst structures and substructures.  This ensures that they
are available when an RPC callback arrives.  The rpc_rqst structures
are maintained in a linked list attached to the rpc_xprt structure.
We keep track of the number of allocations so that they can be correctly
removed when the channel is destroyed.

When an RPC callback arrives, xprt_alloc_bc_request() is invoked to
obtain a preallocated rpc_rqst structure.  An rpc_rqst structure is
returned, and its RPC_BC_PA_IN_USE bit is set in
rpc_rqst->rq_bc_pa_state.  The structure is removed from the list
since it is now in use, and it will be later added back when its
user is done with it.

After the RPC callback replies, the rpc_rqst structure is returned
by invoking xprt_free_bc_request().  This clears the
RPC_BC_PA_IN_USE bit and adds it back to the list, allowing it
to be reused by a subsequent RPC callback request.

To be consistent with the reception of RPC messages, the backchannel requests
should be placed into the 'struct rpc_rqst' rq_rcv_buf, which is then in turn
copied to the 'struct rpc_rqst' rq_private_buf.

[nfs41: Preallocate rpc_rqst receive buffer for handling callbacks]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Update copyright notice and explain page allocation]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/xprt.h   |   1 +
 net/sunrpc/Makefile           |   1 +
 net/sunrpc/backchannel_rqst.c | 278 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 280 insertions(+)
 create mode 100644 net/sunrpc/backchannel_rqst.c

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 703af7ebf6cf..beae030e80b5 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -183,6 +183,7 @@ struct rpc_xprt {
 #if defined(CONFIG_NFS_V4_1)
 	struct svc_serv		*bc_serv;       /* The RPC service which will */
 						/* process the callback */
+	unsigned int		bc_alloc_count;	/* Total number of preallocs */
 	spinlock_t		bc_pa_lock;	/* Protects the preallocated
 						 * items */
 	struct list_head	bc_pa_list;	/* List of preallocated
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 5369aa369b35..4a01f9684b85 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -13,5 +13,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    rpcb_clnt.o timer.o xdr.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
 	    svc_xprt.o
+sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
new file mode 100644
index 000000000000..f56e18a23498
--- /dev/null
+++ b/net/sunrpc/backchannel_rqst.c
@@ -0,0 +1,278 @@
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc.  All Rights Reserved.
+(c) 2009 NetApp.  All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+#include <linux/tcp.h>
+#include <linux/sunrpc/xprt.h>
+
+#ifdef RPC_DEBUG
+#define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Helper routines that track the number of preallocation elements
+ * on the transport.
+ */
+static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
+{
+	return xprt->bc_alloc_count > 0;
+}
+
+static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+	xprt->bc_alloc_count += n;
+}
+
+static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+	return xprt->bc_alloc_count -= n;
+}
+
+/*
+ * Free the preallocated rpc_rqst structure and the memory
+ * buffers hanging off of it.
+ */
+static void xprt_free_allocation(struct rpc_rqst *req)
+{
+	struct xdr_buf *xbufp;
+
+	dprintk("RPC:        free allocations for req= %p\n", req);
+	BUG_ON(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+	xbufp = &req->rq_private_buf;
+	free_page((unsigned long)xbufp->head[0].iov_base);
+	xbufp = &req->rq_snd_buf;
+	free_page((unsigned long)xbufp->head[0].iov_base);
+	list_del(&req->rq_bc_pa_list);
+	kfree(req);
+}
+
+/*
+ * Preallocate up to min_reqs structures and related buffers for use
+ * by the backchannel.  This function can be called multiple times
+ * when creating new sessions that use the same rpc_xprt.  The
+ * preallocated buffers are added to the pool of resources used by
+ * the rpc_xprt.  Anyone of these resources may be used used by an
+ * incoming callback request.  It's up to the higher levels in the
+ * stack to enforce that the maximum number of session slots is not
+ * being exceeded.
+ *
+ * Some callback arguments can be large.  For example, a pNFS server
+ * using multiple deviceids.  The list can be unbound, but the client
+ * has the ability to tell the server the maximum size of the callback
+ * requests.  Each deviceID is 16 bytes, so allocate one page
+ * for the arguments to have enough room to receive a number of these
+ * deviceIDs.  The NFS client indicates to the pNFS server that its
+ * callback requests can be up to 4096 bytes in size.
+ */
+int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
+{
+	struct page *page_rcv = NULL, *page_snd = NULL;
+	struct xdr_buf *xbufp = NULL;
+	struct rpc_rqst *req, *tmp;
+	struct list_head tmp_list;
+	int i;
+
+	dprintk("RPC:       setup backchannel transport\n");
+
+	/*
+	 * We use a temporary list to keep track of the preallocated
+	 * buffers.  Once we're done building the list we splice it
+	 * into the backchannel preallocation list off of the rpc_xprt
+	 * struct.  This helps minimize the amount of time the list
+	 * lock is held on the rpc_xprt struct.  It also makes cleanup
+	 * easier in case of memory allocation errors.
+	 */
+	INIT_LIST_HEAD(&tmp_list);
+	for (i = 0; i < min_reqs; i++) {
+		/* Pre-allocate one backchannel rpc_rqst */
+		req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+		if (req == NULL) {
+			printk(KERN_ERR "Failed to create bc rpc_rqst\n");
+			goto out_free;
+		}
+
+		/* Add the allocated buffer to the tmp list */
+		dprintk("RPC:       adding req= %p\n", req);
+		list_add(&req->rq_bc_pa_list, &tmp_list);
+
+		req->rq_xprt = xprt;
+		INIT_LIST_HEAD(&req->rq_list);
+		INIT_LIST_HEAD(&req->rq_bc_list);
+
+		/* Preallocate one XDR receive buffer */
+		page_rcv = alloc_page(GFP_KERNEL);
+		if (page_rcv == NULL) {
+			printk(KERN_ERR "Failed to create bc receive xbuf\n");
+			goto out_free;
+		}
+		xbufp = &req->rq_rcv_buf;
+		xbufp->head[0].iov_base = page_address(page_rcv);
+		xbufp->head[0].iov_len = PAGE_SIZE;
+		xbufp->tail[0].iov_base = NULL;
+		xbufp->tail[0].iov_len = 0;
+		xbufp->page_len = 0;
+		xbufp->len = PAGE_SIZE;
+		xbufp->buflen = PAGE_SIZE;
+
+		/* Preallocate one XDR send buffer */
+		page_snd = alloc_page(GFP_KERNEL);
+		if (page_snd == NULL) {
+			printk(KERN_ERR "Failed to create bc snd xbuf\n");
+			goto out_free;
+		}
+
+		xbufp = &req->rq_snd_buf;
+		xbufp->head[0].iov_base = page_address(page_snd);
+		xbufp->head[0].iov_len = 0;
+		xbufp->tail[0].iov_base = NULL;
+		xbufp->tail[0].iov_len = 0;
+		xbufp->page_len = 0;
+		xbufp->len = 0;
+		xbufp->buflen = PAGE_SIZE;
+	}
+
+	/*
+	 * Add the temporary list to the backchannel preallocation list
+	 */
+	spin_lock_bh(&xprt->bc_pa_lock);
+	list_splice(&tmp_list, &xprt->bc_pa_list);
+	xprt_inc_alloc_count(xprt, min_reqs);
+	spin_unlock_bh(&xprt->bc_pa_lock);
+
+	dprintk("RPC:       setup backchannel transport done\n");
+	return 0;
+
+out_free:
+	/*
+	 * Memory allocation failed, free the temporary list
+	 */
+	list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list)
+		xprt_free_allocation(req);
+
+	dprintk("RPC:       setup backchannel transport failed\n");
+	return -1;
+}
+EXPORT_SYMBOL(xprt_setup_backchannel);
+
+/*
+ * Destroys the backchannel preallocated structures.
+ * Since these structures may have been allocated by multiple calls
+ * to xprt_setup_backchannel, we only destroy up to the maximum number
+ * of reqs specified by the caller.
+ * @xprt:	the transport holding the preallocated strucures
+ * @max_reqs	the maximum number of preallocated structures to destroy
+ */
+void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
+{
+	struct rpc_rqst *req = NULL, *tmp = NULL;
+
+	dprintk("RPC:        destroy backchannel transport\n");
+
+	BUG_ON(max_reqs == 0);
+	spin_lock_bh(&xprt->bc_pa_lock);
+	xprt_dec_alloc_count(xprt, max_reqs);
+	list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+		dprintk("RPC:        req=%p\n", req);
+		xprt_free_allocation(req);
+		if (--max_reqs == 0)
+			break;
+	}
+	spin_unlock_bh(&xprt->bc_pa_lock);
+
+	dprintk("RPC:        backchannel list empty= %s\n",
+		list_empty(&xprt->bc_pa_list) ? "true" : "false");
+}
+EXPORT_SYMBOL(xprt_destroy_backchannel);
+
+/*
+ * One or more rpc_rqst structure have been preallocated during the
+ * backchannel setup.  Buffer space for the send and private XDR buffers
+ * has been preallocated as well.  Use xprt_alloc_bc_request to allocate
+ * to this request.  Use xprt_free_bc_request to return it.
+ *
+ * Return an available rpc_rqst, otherwise NULL if non are available.
+ */
+struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
+{
+	struct rpc_rqst *req;
+
+	dprintk("RPC:       allocate a backchannel request\n");
+	spin_lock_bh(&xprt->bc_pa_lock);
+	if (!list_empty(&xprt->bc_pa_list)) {
+		req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
+				rq_bc_pa_list);
+		list_del(&req->rq_bc_pa_list);
+	} else {
+		req = NULL;
+	}
+	spin_unlock_bh(&xprt->bc_pa_lock);
+
+	if (req != NULL) {
+		set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+		req->rq_received = 0;
+		req->rq_bytes_sent = 0;
+		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+			sizeof(req->rq_private_buf));
+	}
+	dprintk("RPC:       backchannel req=%p\n", req);
+	return req;
+}
+
+/*
+ * Return the preallocated rpc_rqst structure and XDR buffers
+ * associated with this rpc_task.
+ */
+void xprt_free_bc_request(struct rpc_rqst *req)
+{
+	struct rpc_xprt *xprt = req->rq_xprt;
+
+	dprintk("RPC:       free backchannel req=%p\n", req);
+
+	smp_mb__before_clear_bit();
+	BUG_ON(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+	clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+	smp_mb__after_clear_bit();
+
+	if (!xprt_need_to_requeue(xprt)) {
+		/*
+		 * The last remaining session was destroyed while this
+		 * entry was in use.  Free the entry and don't attempt
+		 * to add back to the list because there is no need to
+		 * have anymore preallocated entries.
+		 */
+		dprintk("RPC:       Last session removed req=%p\n", req);
+		xprt_free_allocation(req);
+		return;
+	}
+
+	/*
+	 * Return it to the list of preallocations so that it
+	 * may be reused by a new callback request.
+	 */
+	spin_lock_bh(&xprt->bc_pa_lock);
+	list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+	spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+#endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.3-71-gd317


From 4a8d70bfef01f8e6b27785e2625e88e9a80924a5 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:23:01 -0400
Subject: nfs41: New include/linux/sunrpc/bc_xprt.h

Contains prototypes for backchannel helper routines.

Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: xprt_setup_backchannel v4.0 only inline]
    Fix compile error when CONFIG_NFS_V4_1 is not set.
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Update Copyright notice and fix formatting]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/bc_xprt.h | 46 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 include/linux/sunrpc/bc_xprt.h

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
new file mode 100644
index 000000000000..5965ae4f902d
--- /dev/null
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -0,0 +1,46 @@
+/******************************************************************************
+
+(c) 2008 NetApp.  All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * Functions to create and manage the backchannel
+ */
+
+#ifndef _LINUX_SUNRPC_BC_XPRT_H
+#define _LINUX_SUNRPC_BC_XPRT_H
+
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/xprt.h>
+
+#ifdef CONFIG_NFS_V4_1
+struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt);
+void xprt_free_bc_request(struct rpc_rqst *req);
+int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
+void xprt_destroy_backchannel(struct rpc_xprt *, int max_reqs);
+#else /* CONFIG_NFS_V4_1 */
+static inline int xprt_setup_backchannel(struct rpc_xprt *xprt,
+					 unsigned int min_reqs)
+{
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+#endif /* _LINUX_SUNRPC_BC_XPRT_H */
+
-- 
cgit v1.2.3-71-gd317


From 55ae1aabfb108106dd095de2578ceef1c755a8b8 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:23:03 -0400
Subject: nfs41: Add backchannel processing support to RPC state machine

Adds rpc_run_bc_task() which is called by the NFS callback service to
process backchannel requests.  It performs similar work to rpc_run_task()
though "schedules" the backchannel task to be executed starting at the
call_bc_transmit state in the RPC state machine.

It also introduces some miscellaneous updates to the argument validation,
call_transmit, and transport cleanup functions to take into account
that there are now forechannel and backchannel tasks.

Backchannel requests do not carry an RPC message structure, since the
payload has already been XDR encoded using the existing NFSv4 callback
mechanism.

Introduce a new transmit state for the client to reply to backchannel
requests.  This new state simply reserves the transport and issues the
reply.  In case of a connection related error, it disconnects the transport
and drops the reply.  It requires the forechannel to re-establish the
connection and the server to retransmit the request, as stated in NFSv4.1
section 2.9.2 "Client and Server Transport Behavior".

Note: There is no need to loop attempting to reserve the transport.  If EAGAIN
is returned by xprt_prepare_transmit(), return with tk_status == 0,
setting tk_action to call_bc_transmit.  rpc_execute() will invoke it again
after the task is taken off the sleep queue.

[nfs41: rpc_run_bc_task() need not be exported outside RPC module]
[nfs41: New call_bc_transmit RPC state]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: Backchannel: No need to loop in call_bc_transmit()]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[rpc_count_iostats incorrectly exits early]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Convert rpc_reply_expected() to inline function]
[Remove unnecessary BUG_ON()]
[Rename variable]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/sched.h |   2 +
 include/linux/sunrpc/xprt.h  |  12 +++++
 net/sunrpc/clnt.c            | 117 ++++++++++++++++++++++++++++++++++++++++++-
 net/sunrpc/stats.c           |   6 ++-
 net/sunrpc/sunrpc.h          |  37 ++++++++++++++
 net/sunrpc/xprt.c            |  38 +++++++++++---
 6 files changed, 203 insertions(+), 9 deletions(-)
 create mode 100644 net/sunrpc/sunrpc.h

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 177376880fab..401097781fc0 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -210,6 +210,8 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+				const struct rpc_call_ops *ops);
 void		rpc_put_task(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
 void		rpc_release_calldata(const struct rpc_call_ops *, void *);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index beae030e80b5..55c6c37e249e 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -215,6 +215,18 @@ struct rpc_xprt {
 						/* buffer in use */
 #endif /* CONFIG_NFS_V4_1 */
 
+#if defined(CONFIG_NFS_V4_1)
+static inline int bc_prealloc(struct rpc_rqst *req)
+{
+	return test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+}
+#else
+static inline int bc_prealloc(struct rpc_rqst *req)
+{
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 struct xprt_create {
 	int			ident;		/* XPRT_TRANSPORT identifier */
 	struct sockaddr *	srcaddr;	/* optional local address */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index aca3ab6fc140..f3e93b8eb90f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -36,7 +36,9 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/bc_xprt.h>
 
+#include "sunrpc.h"
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_CALL
@@ -63,6 +65,9 @@ static void	call_decode(struct rpc_task *task);
 static void	call_bind(struct rpc_task *task);
 static void	call_bind_status(struct rpc_task *task);
 static void	call_transmit(struct rpc_task *task);
+#if defined(CONFIG_NFS_V4_1)
+static void	call_bc_transmit(struct rpc_task *task);
+#endif /* CONFIG_NFS_V4_1 */
 static void	call_status(struct rpc_task *task);
 static void	call_transmit_status(struct rpc_task *task);
 static void	call_refresh(struct rpc_task *task);
@@ -613,6 +618,50 @@ rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags,
 }
 EXPORT_SYMBOL_GPL(rpc_call_async);
 
+#if defined(CONFIG_NFS_V4_1)
+/**
+ * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
+ * rpc_execute against it
+ * @ops: RPC call ops
+ */
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+					const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_task *task;
+	struct xdr_buf *xbufp = &req->rq_snd_buf;
+	struct rpc_task_setup task_setup_data = {
+		.callback_ops = tk_ops,
+	};
+
+	dprintk("RPC: rpc_run_bc_task req= %p\n", req);
+	/*
+	 * Create an rpc_task to send the data
+	 */
+	task = rpc_new_task(&task_setup_data);
+	if (!task) {
+		xprt_free_bc_request(req);
+		goto out;
+	}
+	task->tk_rqstp = req;
+
+	/*
+	 * Set up the xdr_buf length.
+	 * This also indicates that the buffer is XDR encoded already.
+	 */
+	xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
+			xbufp->tail[0].iov_len;
+
+	task->tk_action = call_bc_transmit;
+	atomic_inc(&task->tk_count);
+	BUG_ON(atomic_read(&task->tk_count) != 2);
+	rpc_execute(task);
+
+out:
+	dprintk("RPC: rpc_run_bc_task: task= %p\n", task);
+	return task;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 void
 rpc_call_start(struct rpc_task *task)
 {
@@ -1098,7 +1147,7 @@ call_transmit(struct rpc_task *task)
 	 * in order to allow access to the socket to other RPC requests.
 	 */
 	call_transmit_status(task);
-	if (task->tk_msg.rpc_proc->p_decode != NULL)
+	if (rpc_reply_expected(task))
 		return;
 	task->tk_action = rpc_exit_task;
 	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
@@ -1133,6 +1182,72 @@ call_transmit_status(struct rpc_task *task)
 	}
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * 5b.	Send the backchannel RPC reply.  On error, drop the reply.  In
+ * addition, disconnect on connectivity errors.
+ */
+static void
+call_bc_transmit(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+
+	BUG_ON(task->tk_status != 0);
+	task->tk_status = xprt_prepare_transmit(task);
+	if (task->tk_status == -EAGAIN) {
+		/*
+		 * Could not reserve the transport. Try again after the
+		 * transport is released.
+		 */
+		task->tk_status = 0;
+		task->tk_action = call_bc_transmit;
+		return;
+	}
+
+	task->tk_action = rpc_exit_task;
+	if (task->tk_status < 0) {
+		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+			"error: %d\n", task->tk_status);
+		return;
+	}
+
+	xprt_transmit(task);
+	xprt_end_transmit(task);
+	dprint_status(task);
+	switch (task->tk_status) {
+	case 0:
+		/* Success */
+		break;
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -ETIMEDOUT:
+		/*
+		 * Problem reaching the server.  Disconnect and let the
+		 * forechannel reestablish the connection.  The server will
+		 * have to retransmit the backchannel request and we'll
+		 * reprocess it.  Since these ops are idempotent, there's no
+		 * need to cache our reply at this time.
+		 */
+		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+			"error: %d\n", task->tk_status);
+		xprt_conditional_disconnect(task->tk_xprt,
+			req->rq_connect_cookie);
+		break;
+	default:
+		/*
+		 * We were unable to reply and will have to drop the
+		 * request.  The server should reconnect and retransmit.
+		 */
+		BUG_ON(task->tk_status == -EAGAIN);
+		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+			"error: %d\n", task->tk_status);
+		break;
+	}
+	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * 6.	Sort out the RPC call status
  */
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 1ef6e46d9da2..8487aa0f1f5a 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -141,12 +141,14 @@ EXPORT_SYMBOL_GPL(rpc_free_iostats);
 void rpc_count_iostats(struct rpc_task *task)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
-	struct rpc_iostats *stats = task->tk_client->cl_metrics;
+	struct rpc_iostats *stats;
 	struct rpc_iostats *op_metrics;
 	long rtt, execute, queue;
 
-	if (!stats || !req)
+	if (!task->tk_client || !task->tk_client->cl_metrics || !req)
 		return;
+
+	stats = task->tk_client->cl_metrics;
 	op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx];
 
 	op_metrics->om_ops++;
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
new file mode 100644
index 000000000000..5d9dd742264b
--- /dev/null
+++ b/net/sunrpc/sunrpc.h
@@ -0,0 +1,37 @@
+/******************************************************************************
+
+(c) 2008 NetApp.  All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * Functions and macros used internally by RPC
+ */
+
+#ifndef _NET_SUNRPC_SUNRPC_H
+#define _NET_SUNRPC_SUNRPC_H
+
+static inline int rpc_reply_expected(struct rpc_task *task)
+{
+	return (task->tk_msg.rpc_proc != NULL) &&
+		(task->tk_msg.rpc_proc->p_decode != NULL);
+}
+
+#endif /* _NET_SUNRPC_SUNRPC_H */
+
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 52739f82df1e..0eea2bfe111b 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -12,8 +12,9 @@
  *  -	Next, the caller puts together the RPC message, stuffs it into
  *	the request struct, and calls xprt_transmit().
  *  -	xprt_transmit sends the message and installs the caller on the
- *	transport's wait list. At the same time, it installs a timer that
- *	is run after the packet's timeout has expired.
+ *	transport's wait list. At the same time, if a reply is expected,
+ *	it installs a timer that is run after the packet's timeout has
+ *	expired.
  *  -	When a packet arrives, the data_ready handler walks the list of
  *	pending requests for that transport. If a matching XID is found, the
  *	caller is woken up, and the timer removed.
@@ -46,6 +47,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/metrics.h>
 
+#include "sunrpc.h"
+
 /*
  * Local variables
  */
@@ -873,7 +876,10 @@ void xprt_transmit(struct rpc_task *task)
 	dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
 
 	if (!req->rq_received) {
-		if (list_empty(&req->rq_list)) {
+		if (list_empty(&req->rq_list) && rpc_reply_expected(task)) {
+			/*
+			 * Add to the list only if we're expecting a reply
+			 */
 			spin_lock_bh(&xprt->transport_lock);
 			/* Update the softirq receive buffer */
 			memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
@@ -908,8 +914,13 @@ void xprt_transmit(struct rpc_task *task)
 	/* Don't race with disconnect */
 	if (!xprt_connected(xprt))
 		task->tk_status = -ENOTCONN;
-	else if (!req->rq_received)
+	else if (!req->rq_received && rpc_reply_expected(task)) {
+		/*
+		 * Sleep on the pending queue since
+		 * we're expecting a reply.
+		 */
 		rpc_sleep_on(&xprt->pending, task, xprt_timer);
+	}
 	spin_unlock_bh(&xprt->transport_lock);
 }
 
@@ -982,11 +993,17 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
  */
 void xprt_release(struct rpc_task *task)
 {
-	struct rpc_xprt	*xprt = task->tk_xprt;
+	struct rpc_xprt	*xprt;
 	struct rpc_rqst	*req;
+	int is_bc_request;
 
 	if (!(req = task->tk_rqstp))
 		return;
+
+	/* Preallocated backchannel request? */
+	is_bc_request = bc_prealloc(req);
+
+	xprt = req->rq_xprt;
 	rpc_count_iostats(task);
 	spin_lock_bh(&xprt->transport_lock);
 	xprt->ops->release_xprt(xprt, task);
@@ -999,10 +1016,19 @@ void xprt_release(struct rpc_task *task)
 		mod_timer(&xprt->timer,
 				xprt->last_used + xprt->idle_timeout);
 	spin_unlock_bh(&xprt->transport_lock);
-	xprt->ops->buf_free(req->rq_buffer);
+	if (!bc_prealloc(req))
+		xprt->ops->buf_free(req->rq_buffer);
 	task->tk_rqstp = NULL;
 	if (req->rq_release_snd_buf)
 		req->rq_release_snd_buf(req);
+
+	/*
+	 * Early exit if this is a backchannel preallocated request.
+	 * There is no need to have it added to the RPC slot list.
+	 */
+	if (is_bc_request)
+		return;
+
 	memset(req, 0, sizeof(*req));	/* mark unused */
 
 	dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
-- 
cgit v1.2.3-71-gd317


From 0d90ba1cd416525c4825c111db862d8b15a02e9b Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:23:04 -0400
Subject: nfs41: Backchannel callback service helper routines

Executes the backchannel task on the RPC state machine using
the existing open connection previously established by the client.

Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>

nfs41: Add bc_svc.o to sunrpc Makefile.

[nfs41: bc_send() does not need to be exported outside RPC module]
[nfs41: xprt_free_bc_request() need not be exported outside RPC module]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Update copyright]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/bc_xprt.h |  3 ++
 net/sunrpc/Makefile            |  2 +-
 net/sunrpc/bc_svc.c            | 81 ++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprtsock.c          |  3 ++
 4 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 net/sunrpc/bc_svc.c

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index 5965ae4f902d..6508f0dc0eff 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -29,12 +29,15 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/sched.h>
 
 #ifdef CONFIG_NFS_V4_1
 struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, int max_reqs);
+void bc_release_request(struct rpc_task *);
+int bc_send(struct rpc_rqst *req);
 #else /* CONFIG_NFS_V4_1 */
 static inline int xprt_setup_backchannel(struct rpc_xprt *xprt,
 					 unsigned int min_reqs)
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 4a01f9684b85..db73fd2a3f0e 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -13,6 +13,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    rpcb_clnt.o timer.o xdr.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
 	    svc_xprt.o
-sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o
+sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
new file mode 100644
index 000000000000..13f214f53120
--- /dev/null
+++ b/net/sunrpc/bc_svc.c
@@ -0,0 +1,81 @@
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc.  All Rights Reserved.
+(c) 2009 NetApp.  All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * The NFSv4.1 callback service helper routines.
+ * They implement the transport level processing required to send the
+ * reply over an existing open connection previously established by the client.
+ */
+
+#if defined(CONFIG_NFS_V4_1)
+
+#include <linux/module.h>
+
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCDSP
+
+void bc_release_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+
+	dprintk("RPC:       bc_release_request: task= %p\n", task);
+
+	/*
+	 * Release this request only if it's a backchannel
+	 * preallocated request
+	 */
+	if (!bc_prealloc(req))
+		return;
+	xprt_free_bc_request(req);
+}
+
+/* Empty callback ops */
+static const struct rpc_call_ops nfs41_callback_ops = {
+};
+
+
+/*
+ * Send the callback reply; returns task->tk_status or PTR_ERR(task)
+ */
+int bc_send(struct rpc_rqst *req)
+{
+	struct rpc_task *task;
+	int ret;
+
+	dprintk("RPC:       bc_send req= %p\n", req);
+	task = rpc_run_bc_task(req, &nfs41_callback_ops);
+	if (IS_ERR(task))
+		ret = PTR_ERR(task);
+	else {
+		BUG_ON(atomic_read(&task->tk_count) != 1);
+		ret = task->tk_status;
+		rpc_put_task(task);
+	}
+	dprintk("RPC:       bc_send ret= %d \n", ret);
+	return ret;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e3e3a57116fb..8a721867b601 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2183,6 +2183,9 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.buf_free		= rpc_free,
 	.send_request		= xs_tcp_send_request,
 	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
+#if defined(CONFIG_NFS_V4_1)
+	.release_request	= bc_release_request,
+#endif /* CONFIG_NFS_V4_1 */
 	.close			= xs_tcp_close,
 	.destroy		= xs_destroy,
 	.print_stats		= xs_tcp_print_stats,
-- 
cgit v1.2.3-71-gd317


From 4d6bbb6233c9cf23822a2f66f8470c9f40854b77 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:23:07 -0400
Subject: nfs41: Backchannel bc_svc_process()

Implement the NFSv4.1 backchannel service.  Invokes the common callback
processing logic svc_process_common() to authenticate the call and
dispatch the appropriate NFSv4.1 XDR decoder and operation procedure.
It then invokes bc_send() to send the reply over the same connection.
bc_send() is implemented in a separate patch.

At this time there is no slot validation or reply cache handling.

[nfs41: Preallocate rpc_rqst receive buffer for handling callbacks]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Move bc_svc_process() declaration to correct patch]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/svc.h |  2 ++
 net/sunrpc/svc.c           | 49 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 4a8afbd62007..16043c4a8bf4 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -419,6 +419,8 @@ int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_destroy(struct svc_serv *);
 int		   svc_process(struct svc_rqst *);
+int		   bc_svc_process(struct svc_serv *, struct rpc_rqst *,
+			struct svc_rqst *);
 int		   svc_register(const struct svc_serv *, const int,
 				const unsigned short, const unsigned short);
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bfda66db2f4f..06b52e465f47 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -25,6 +25,7 @@
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/bc_xprt.h>
 
 #define RPCDBG_FACILITY	RPCDBG_SVCDSP
 
@@ -1239,6 +1240,54 @@ svc_process(struct svc_rqst *rqstp)
 	return svc_send(rqstp);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Process a backchannel RPC request that arrived over an existing
+ * outbound connection
+ */
+int
+bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
+	       struct svc_rqst *rqstp)
+{
+	struct kvec	*argv = &rqstp->rq_arg.head[0];
+	struct kvec	*resv = &rqstp->rq_res.head[0];
+	int 		error;
+
+	/* Build the svc_rqst used by the common processing routine */
+	rqstp->rq_xid = req->rq_xid;
+	rqstp->rq_prot = req->rq_xprt->prot;
+	rqstp->rq_server = serv;
+
+	rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
+	memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
+	memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg));
+	memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res));
+
+	/* reset result send buffer "put" position */
+	resv->iov_len = 0;
+
+	if (rqstp->rq_prot != IPPROTO_TCP) {
+		printk(KERN_ERR "No support for Non-TCP transports!\n");
+		BUG();
+	}
+
+	/*
+	 * Skip the next two words because they've already been
+	 * processed in the transport
+	 */
+	svc_getu32(argv);	/* XID */
+	svc_getnl(argv);	/* CALLDIR */
+
+	error = svc_process_common(rqstp, argv, resv);
+	if (error <= 0)
+		return error;
+
+	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+	return bc_send(req);
+}
+EXPORT_SYMBOL(bc_svc_process);
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * Return (transport-specific) limit on the rpc payload.
  */
-- 
cgit v1.2.3-71-gd317


From 7652e5a09ba319241607b22d9055ce93fd5b8039 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 1 Apr 2009 09:23:09 -0400
Subject: nfs41: sunrpc: provide functions to create and destroy a svc_xprt for
 backchannel use

For nfs41 callbacks we need an svc_xprt to process requests coming up the
backchannel socket as rpc_rqst's that are transformed into svc_rqst's that
need a rq_xprt to be processed.

The svc_{udp,tcp}_create methods are too heavy for this job as svc_create_socket
creates an actual socket to listen on while for nfs41 we're "reusing" the
fore channel's socket.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/svcsock.h |  2 ++
 net/sunrpc/svcsock.c           | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 483e10380aae..6bb1ec4ae310 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -42,6 +42,8 @@ int		svc_sock_names(char *buf, struct svc_serv *serv, char *toclose);
 int		svc_addsock(struct svc_serv *serv, int fd, char *name_return);
 void		svc_init_xprt_sock(void);
 void		svc_cleanup_xprt_sock(void);
+struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot);
+void		svc_sock_destroy(struct svc_xprt *);
 
 /*
  * svc_makesock socket characteristics
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 9d504234af4a..a2a03e500533 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1327,3 +1327,42 @@ static void svc_sock_free(struct svc_xprt *xprt)
 		sock_release(svsk->sk_sock);
 	kfree(svsk);
 }
+
+/*
+ * Create a svc_xprt.
+ *
+ * For internal use only (e.g. nfsv4.1 backchannel).
+ * Callers should typically use the xpo_create() method.
+ */
+struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot)
+{
+	struct svc_sock *svsk;
+	struct svc_xprt *xprt = NULL;
+
+	dprintk("svc: %s\n", __func__);
+	svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
+	if (!svsk)
+		goto out;
+
+	xprt = &svsk->sk_xprt;
+	if (prot == IPPROTO_TCP)
+		svc_xprt_init(&svc_tcp_class, xprt, serv);
+	else if (prot == IPPROTO_UDP)
+		svc_xprt_init(&svc_udp_class, xprt, serv);
+	else
+		BUG();
+out:
+	dprintk("svc: %s return %p\n", __func__, xprt);
+	return xprt;
+}
+EXPORT_SYMBOL_GPL(svc_sock_create);
+
+/*
+ * Destroy a svc_sock.
+ */
+void svc_sock_destroy(struct svc_xprt *xprt)
+{
+	if (xprt)
+		kfree(container_of(xprt, struct svc_sock, sk_xprt));
+}
+EXPORT_SYMBOL_GPL(svc_sock_destroy);
-- 
cgit v1.2.3-71-gd317


From 9c9f3f5fa62cc4959e4d4d1cf1ec74f2d6ac1197 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 1 Apr 2009 09:23:10 -0400
Subject: nfs41: sunrpc: add a struct svc_xprt pointer to struct svc_serv for
 backchannel use

This svc_xprt is passed on to the callback service thread to be later used
to process incoming svc_rqst's

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/svc.h | 1 +
 net/sunrpc/svc.c           | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 16043c4a8bf4..ea8009695c69 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -103,6 +103,7 @@ struct svc_serv {
 	spinlock_t		sv_cb_lock;	/* protects the svc_cb_list */
 	wait_queue_head_t	sv_cb_waitq;	/* sleep here if there are no
 						 * entries in the svc_cb_list */
+	struct svc_xprt		*bc_xprt;
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 06b52e465f47..b35048fabe22 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -487,6 +487,10 @@ svc_destroy(struct svc_serv *serv)
 	if (svc_serv_is_pooled(serv))
 		svc_pool_map_put();
 
+#if defined(CONFIG_NFS_V4_1)
+	svc_sock_destroy(serv->bc_xprt);
+#endif /* CONFIG_NFS_V4_1 */
+
 	svc_unregister(serv);
 	kfree(serv->sv_pools);
 	kfree(serv);
-- 
cgit v1.2.3-71-gd317


From dd2b63d049480979016b959abc2d141cdddb1389 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:23:28 -0400
Subject: nfs41: Rename rq_received to rq_reply_bytes_recvd

The 'rq_received' member of 'struct rpc_rqst' is used to track when we
have received a reply to our request.  With v4.1, the backchannel
can now accept callback requests over the existing connection.  Rename
this field to make it clear that it is only used for tracking reply bytes
and not all bytes received on the connection.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/xprt.h   |  3 ++-
 net/sunrpc/backchannel_rqst.c |  2 +-
 net/sunrpc/clnt.c             |  8 ++++----
 net/sunrpc/stats.c            |  2 +-
 net/sunrpc/xprt.c             | 15 ++++++++-------
 5 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 55c6c37e249e..1175d58efc2e 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -67,7 +67,8 @@ struct rpc_rqst {
 	struct rpc_task *	rq_task;	/* RPC task data */
 	__be32			rq_xid;		/* request XID */
 	int			rq_cong;	/* has incremented xprt->cong */
-	int			rq_received;	/* receive completed */
+	int			rq_reply_bytes_recvd;	/* number of reply */
+							/* bytes received */
 	u32			rq_seqno;	/* gss seq no. used on req. */
 	int			rq_enc_pages_num;
 	struct page		**rq_enc_pages;	/* scratch pages for use by
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index f56e18a23498..5a7d342e3087 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -230,7 +230,7 @@ struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
 
 	if (req != NULL) {
 		set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
-		req->rq_received = 0;
+		req->rq_reply_bytes_recvd = 0;
 		req->rq_bytes_sent = 0;
 		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
 			sizeof(req->rq_private_buf));
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f3e93b8eb90f..5bc2f45bddf0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1258,8 +1258,8 @@ call_status(struct rpc_task *task)
 	struct rpc_rqst	*req = task->tk_rqstp;
 	int		status;
 
-	if (req->rq_received > 0 && !req->rq_bytes_sent)
-		task->tk_status = req->rq_received;
+	if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
+		task->tk_status = req->rq_reply_bytes_recvd;
 
 	dprint_status(task);
 
@@ -1376,7 +1376,7 @@ call_decode(struct rpc_task *task)
 
 	/*
 	 * Ensure that we see all writes made by xprt_complete_rqst()
-	 * before it changed req->rq_received.
+	 * before it changed req->rq_reply_bytes_recvd.
 	 */
 	smp_rmb();
 	req->rq_rcv_buf.len = req->rq_private_buf.len;
@@ -1417,7 +1417,7 @@ out_retry:
 	task->tk_status = 0;
 	/* Note: rpc_verify_header() may have freed the RPC slot */
 	if (task->tk_rqstp == req) {
-		req->rq_received = req->rq_rcv_buf.len = 0;
+		req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
 		if (task->tk_client->cl_discrtry)
 			xprt_conditional_disconnect(task->tk_xprt,
 					req->rq_connect_cookie);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 8487aa0f1f5a..1b4e6791ecf3 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -156,7 +156,7 @@ void rpc_count_iostats(struct rpc_task *task)
 	op_metrics->om_timeouts += task->tk_timeouts;
 
 	op_metrics->om_bytes_sent += task->tk_bytes_sent;
-	op_metrics->om_bytes_recv += req->rq_received;
+	op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
 
 	queue = (long)req->rq_xtime - task->tk_start;
 	if (queue < 0)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c144611223fc..f412a852bc73 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -806,9 +806,10 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
 
 	list_del_init(&req->rq_list);
 	req->rq_private_buf.len = copied;
-	/* Ensure all writes are done before we update req->rq_received */
+	/* Ensure all writes are done before we update */
+	/* req->rq_reply_bytes_recvd */
 	smp_wmb();
-	req->rq_received = copied;
+	req->rq_reply_bytes_recvd = copied;
 	rpc_wake_up_queued_task(&xprt->pending, task);
 }
 EXPORT_SYMBOL_GPL(xprt_complete_rqst);
@@ -823,7 +824,7 @@ static void xprt_timer(struct rpc_task *task)
 	dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
 
 	spin_lock_bh(&xprt->transport_lock);
-	if (!req->rq_received) {
+	if (!req->rq_reply_bytes_recvd) {
 		if (xprt->ops->timer)
 			xprt->ops->timer(task);
 	} else
@@ -845,8 +846,8 @@ int xprt_prepare_transmit(struct rpc_task *task)
 	dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
 
 	spin_lock_bh(&xprt->transport_lock);
-	if (req->rq_received && !req->rq_bytes_sent) {
-		err = req->rq_received;
+	if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) {
+		err = req->rq_reply_bytes_recvd;
 		goto out_unlock;
 	}
 	if (!xprt->ops->reserve_xprt(task))
@@ -875,7 +876,7 @@ void xprt_transmit(struct rpc_task *task)
 
 	dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
 
-	if (!req->rq_received) {
+	if (!req->rq_reply_bytes_recvd) {
 		if (list_empty(&req->rq_list) && rpc_reply_expected(task)) {
 			/*
 			 * Add to the list only if we're expecting a reply
@@ -914,7 +915,7 @@ void xprt_transmit(struct rpc_task *task)
 	/* Don't race with disconnect */
 	if (!xprt_connected(xprt))
 		task->tk_status = -ENOTCONN;
-	else if (!req->rq_received && rpc_reply_expected(task)) {
+	else if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) {
 		/*
 		 * Sleep on the pending queue since
 		 * we're expecting a reply.
-- 
cgit v1.2.3-71-gd317


From f8625a6a4bb76207302be58453603d8e324df490 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 1 Apr 2009 09:23:33 -0400
Subject: nfs41: Backchannel: Add a backchannel slot table to the session

Defines a new 'struct nfs4_slot_table' in the 'struct nfs4_session'
for use by the backchannel.  Initializes, resets, and destroys the backchannel
slot table in the same manner the forechannel slot table is initialized,
reset, and destroyed.

The sequenceid for each slot in the backchannel slot table is initialized
to 0, whereas the forechannel slotid's sequenceid is set to 1.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4proc.c         | 48 +++++++++++++++++++++++++++++++++++------------
 include/linux/nfs_fs_sb.h |  2 +-
 2 files changed, 37 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c3019ad85893..57dabb8a048e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4414,9 +4414,30 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session)
 			session->fc_attrs.max_reqs,
 			session->fc_slot_table.max_slots,
 			1);
+	if (status)
+		return status;
+
+	status = nfs4_reset_slot_table(&session->bc_slot_table,
+			session->bc_attrs.max_reqs,
+			session->bc_slot_table.max_slots,
+			0);
 	return status;
 }
 
+/* Destroy the slot table */
+static void nfs4_destroy_slot_tables(struct nfs4_session *session)
+{
+	if (session->fc_slot_table.slots != NULL) {
+		kfree(session->fc_slot_table.slots);
+		session->fc_slot_table.slots = NULL;
+	}
+	if (session->bc_slot_table.slots != NULL) {
+		kfree(session->bc_slot_table.slots);
+		session->bc_slot_table.slots = NULL;
+	}
+	return;
+}
+
 /*
  * Initialize slot table
  */
@@ -4470,17 +4491,15 @@ static int nfs4_init_slot_tables(struct nfs4_session *session)
 
 	status = nfs4_init_slot_table(&session->fc_slot_table,
 			session->fc_attrs.max_reqs, 1);
-	return status;
-}
+	if (status)
+		return status;
 
-/* Destroy the slot table */
-static void nfs4_destroy_slot_table(struct nfs4_session *session)
-{
-	if (session->fc_slot_table.slots == NULL)
-		return;
-	kfree(session->fc_slot_table.slots);
-	session->fc_slot_table.slots = NULL;
-	return;
+	status = nfs4_init_slot_table(&session->bc_slot_table,
+			session->bc_attrs.max_reqs, 0);
+	if (status)
+		nfs4_destroy_slot_tables(session);
+
+	return status;
 }
 
 struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
@@ -4503,7 +4522,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 
 	tbl = &session->fc_slot_table;
 	spin_lock_init(&tbl->slot_tbl_lock);
-	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "Slot table");
+	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
+
+	tbl = &session->bc_slot_table;
+	spin_lock_init(&tbl->slot_tbl_lock);
+	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
+
 	session->clp = clp;
 	return session;
 }
@@ -4515,7 +4539,7 @@ void nfs4_destroy_session(struct nfs4_session *session)
 		__func__, session->clp->cl_rpcclient->cl_xprt);
 	xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt,
 				NFS41_BC_MIN_CALLBACKS);
-	nfs4_destroy_slot_table(session);
+	nfs4_destroy_slot_tables(session);
 	kfree(session);
 }
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d0902ccec9ce..19fe15d12042 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -205,7 +205,7 @@ struct nfs4_session {
 	struct nfs4_channel_attrs	fc_attrs;
 	struct nfs4_slot_table		fc_slot_table;
 	struct nfs4_channel_attrs	bc_attrs;
-					/* back channel has one slot */
+	struct nfs4_slot_table		bc_slot_table;
 	struct nfs_client		*clp;
 };
 
-- 
cgit v1.2.3-71-gd317


From 6c9dc4255108bab4ef5c177d369b99c3c23492a7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 17 Jun 2009 18:02:10 -0700
Subject: lockd: Update NSM state from SM_MON replies

When rpc.statd starts up in user space at boot time, it attempts to
write the latest NSM local state number into
/proc/sys/fs/nfs/nsm_local_state.

If lockd.ko isn't loaded yet (as is the case in most configurations),
that file doesn't exist, thus the kernel's NSM state remains set to
its initial value of zero during lockd operation.

This is a problem because rpc.statd and lockd use the NSM state number
to prevent repeated lock recovery on rebooted hosts.  If lockd sends
a zero NSM state, but then a delayed SM_NOTIFY with a real NSM state
number is received, there is no way for lockd or rpc.statd to
distinguish that stale SM_NOTIFY from an actual reboot.  Thus lock
recovery could be performed after the rebooted host has already
started reclaiming locks, and those locks will be lost.

We could change /etc/init.d/nfslock so it always modprobes lockd.ko
before starting rpc.statd.  However, if lockd.ko is ever unloaded
and reloaded, we are back at square one, since the NSM state is not
preserved across an unload/reload cycle.  This may happen frequently
on clients that use automounter.  A period of NFS inactivity causes
lockd.ko to be unloaded, and the kernel loses its NSM state setting.

Instead, let's use the fact that rpc.statd plants the local system's
NSM state in every SM_MON (and SM_UNMON) reply.  lockd performs a
synchronous SM_MON upcall to the local rpc.statd _before_ sending its
first NLM request to a new remote.  This would permit rpc.statd to
provide the current NSM state to lockd, even after lockd.ko had been
unloaded and reloaded.

Note that NLMPROC_LOCK arguments are constructed before the
nsm_monitor() call, so we have to rearrange argument construction very
slightly to make this all work out.

And, the kernel appears to treat NSM state as a u32 (see struct
nlm_args and nsm_res).  Make nsm_local_state a u32 as well, to ensure
we don't get bogus comparison results.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         |  2 +-
 fs/lockd/mon.c              | 18 ++++++++++++------
 include/linux/lockd/lockd.h |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 273e229353f3..f2fdcbce143e 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -126,7 +126,6 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	struct nlm_lock	*lock = &argp->lock;
 
 	nlmclnt_next_cookie(&argp->cookie);
-	argp->state   = nsm_local_state;
 	memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh));
 	lock->caller  = utsname()->nodename;
 	lock->oh.data = req->a_owner;
@@ -521,6 +520,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 
 	if (nsm_monitor(host) < 0)
 		goto out;
+	req->a_args.state = nsm_local_state;
 
 	fl->fl_flags |= FL_ACCESS;
 	status = do_vfs_lock(fl);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 6d5d4a4169e5..38385336614c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -53,7 +53,7 @@ static				DEFINE_SPINLOCK(nsm_lock);
 /*
  * Local NSM state
  */
-int	__read_mostly		nsm_local_state;
+u32	__read_mostly		nsm_local_state;
 int	__read_mostly		nsm_use_hostnames;
 
 static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
@@ -184,13 +184,19 @@ int nsm_monitor(const struct nlm_host *host)
 	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
 	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
-	if (res.status != 0)
+	if (unlikely(res.status != 0))
 		status = -EIO;
-	if (status < 0)
+	if (unlikely(status < 0)) {
 		printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
-	else
-		nsm->sm_monitored = 1;
-	return status;
+		return status;
+	}
+
+	nsm->sm_monitored = 1;
+	if (unlikely(nsm_local_state != res.state)) {
+		nsm_local_state = res.state;
+		dprintk("lockd: NSM state changed to %d\n", nsm_local_state);
+	}
+	return 0;
 }
 
 /**
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 51855dfd8adb..c325b187966b 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -195,7 +195,7 @@ extern struct svc_procedure	nlmsvc_procedures4[];
 extern int			nlmsvc_grace_period;
 extern unsigned long		nlmsvc_timeout;
 extern int			nsm_use_hostnames;
-extern int			nsm_local_state;
+extern u32			nsm_local_state;
 
 /*
  * Lockd client functions
-- 
cgit v1.2.3-71-gd317


From 2ad780978b7c0c3e7877949f098cbd06e7c73839 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 17 Jun 2009 18:02:11 -0700
Subject: NFS: Clean up MNT program definitions

Clean up:  Relocate MNT program procedure number definitions to the
only file that uses them.  Relocate the version number definitions,
which are shared, to nfs.h.  Remove duplicate program number
definitions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/mount_clnt.c  | 32 ++++++++++++++++++++++++++++----
 fs/nfs/nfsroot.c     |  3 +++
 include/linux/nfs.h  |  5 +++--
 include/linux/nfs2.h |  7 -------
 include/linux/nfs3.h |  5 -----
 5 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index ca905a5bb1ba..af45a374d56f 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -20,6 +20,30 @@
 # define NFSDBG_FACILITY	NFSDBG_MOUNT
 #endif
 
+/*
+ * Defined by RFC 1094, section A.5
+ */
+enum {
+	MOUNTPROC_NULL		= 0,
+	MOUNTPROC_MNT		= 1,
+	MOUNTPROC_DUMP		= 2,
+	MOUNTPROC_UMNT		= 3,
+	MOUNTPROC_UMNTALL	= 4,
+	MOUNTPROC_EXPORT	= 5,
+};
+
+/*
+ * Defined by RFC 1813, section 5.2
+ */
+enum {
+	MOUNTPROC3_NULL		= 0,
+	MOUNTPROC3_MNT		= 1,
+	MOUNTPROC3_DUMP		= 2,
+	MOUNTPROC3_UMNT		= 3,
+	MOUNTPROC3_UMNTALL	= 4,
+	MOUNTPROC3_EXPORT	= 5,
+};
+
 static struct rpc_program	mnt_program;
 
 struct mnt_fhstatus {
@@ -68,7 +92,7 @@ int nfs_mount(struct nfs_mount_request *info)
 	if (info->version == NFS_MNT3_VERSION)
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
 	else
-		msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
+		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
 
 	status = rpc_call_sync(mnt_clnt, &msg, 0);
 	rpc_shutdown_client(mnt_clnt);
@@ -145,13 +169,13 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
 #define MNT_fhstatus3_sz	(1 + 16)
 
 static struct rpc_procinfo mnt_procedures[] = {
-	[MNTPROC_MNT] = {
-		.p_proc		= MNTPROC_MNT,
+	[MOUNTPROC_MNT] = {
+		.p_proc		= MOUNTPROC_MNT,
 		.p_encode	= (kxdrproc_t) xdr_encode_dirpath,
 		.p_decode	= (kxdrproc_t) xdr_decode_fhstatus,
 		.p_arglen	= MNT_dirpath_sz,
 		.p_replen	= MNT_fhstatus_sz,
-		.p_statidx	= MNTPROC_MNT,
+		.p_statidx	= MOUNTPROC_MNT,
 		.p_name		= "MOUNT",
 	},
 };
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index e3ed5908820b..24c1b93874c4 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -92,6 +92,9 @@
 #undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
 
+/* Default port to use if server is not running a portmapper */
+#define NFS_MNT_PORT	627
+
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT		"/tftpboot/%s"
 
diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index 214d499718f7..f387919bbc59 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -25,8 +25,9 @@
 #define NFSMODE_SOCK	0140000
 #define NFSMODE_FIFO	0010000
 
-#define NFS_MNT_PROGRAM	100005
-#define NFS_MNT_PORT	627
+#define NFS_MNT_PROGRAM		100005
+#define NFS_MNT_VERSION		1
+#define NFS_MNT3_VERSION	3
 
 /*
  * NFS stats. The good thing with these values is that NFSv3 errors are
diff --git a/include/linux/nfs2.h b/include/linux/nfs2.h
index 0ed9517138fc..fde24b30cc9e 100644
--- a/include/linux/nfs2.h
+++ b/include/linux/nfs2.h
@@ -64,11 +64,4 @@ struct nfs2_fh {
 #define NFSPROC_READDIR		16
 #define NFSPROC_STATFS		17
 
-#define NFS_MNT_PROGRAM		100005
-#define NFS_MNT_VERSION		1
-#define MNTPROC_NULL		0
-#define MNTPROC_MNT		1
-#define MNTPROC_UMNT		3
-#define MNTPROC_UMNTALL		4
-
 #endif /* _LINUX_NFS2_H */
diff --git a/include/linux/nfs3.h b/include/linux/nfs3.h
index 539f3b550eab..ac33806ec7f9 100644
--- a/include/linux/nfs3.h
+++ b/include/linux/nfs3.h
@@ -88,12 +88,7 @@ struct nfs3_fh {
 #define NFS3PROC_PATHCONF	20
 #define NFS3PROC_COMMIT		21
 
-#define NFS_MNT3_PROGRAM	100005
 #define NFS_MNT3_VERSION	3
-#define MOUNTPROC3_NULL		0
-#define MOUNTPROC3_MNT		1
-#define MOUNTPROC3_UMNT		3
-#define MOUNTPROC3_UMNTALL	4
  
 
 #if defined(__KERNEL__)
-- 
cgit v1.2.3-71-gd317


From 275582031f9b3597a1b973f3ff617adfe639faa2 Mon Sep 17 00:00:00 2001
From: Alexander Chiang <achiang@hp.com>
Date: Wed, 10 Jun 2009 19:55:14 +0000
Subject: ACPI: Introduce acpi_is_root_bridge()

Returns whether an ACPI CA node is a PCI root bridge or not.

This API is generically useful, and shouldn't just be a hotplug function.

The implementation becomes much simpler as well.

Signed-off-by: Alex Chiang <achiang@hp.com>
Acked-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/pci_root.c            | 24 +++++++++++++++++++++++
 drivers/pci/hotplug/acpi_pcihp.c   | 40 ++------------------------------------
 drivers/pci/hotplug/acpiphp_glue.c |  2 +-
 include/acpi/acpi_bus.h            |  1 +
 include/linux/pci_hotplug.h        |  1 -
 5 files changed, 28 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index ca8dba3b40b9..888cb9f5c5fb 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -142,6 +142,30 @@ acpi_handle acpi_get_pci_rootbridge_handle(unsigned int seg, unsigned int bus)
 
 EXPORT_SYMBOL_GPL(acpi_get_pci_rootbridge_handle);
 
+/**
+ * acpi_is_root_bridge - determine whether an ACPI CA node is a PCI root bridge
+ * @handle: the ACPI CA node in question.
+ *
+ * Note: we could make this API take a struct acpi_device * instead, but
+ * for now, it's more convenient to operate on an acpi_handle.
+ */
+int acpi_is_root_bridge(acpi_handle handle)
+{
+	int ret;
+	struct acpi_device *device;
+
+	ret = acpi_bus_get_device(handle, &device);
+	if (ret)
+		return 0;
+
+	ret = acpi_match_device_ids(device, root_device_ids);
+	if (ret)
+		return 0;
+	else
+		return 1;
+}
+EXPORT_SYMBOL_GPL(acpi_is_root_bridge);
+
 static acpi_status
 get_root_bridge_busnr_callback(struct acpi_resource *resource, void *data)
 {
diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index fbc63d5e459f..eb159587d0bf 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -354,7 +354,7 @@ acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
 		status = acpi_run_hpp(handle, hpp);
 		if (ACPI_SUCCESS(status))
 			break;
-		if (acpi_root_bridge(handle))
+		if (acpi_is_root_bridge(handle))
 			break;
 		status = acpi_get_parent(handle, &phandle);
 		if (ACPI_FAILURE(status))
@@ -428,7 +428,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev, u32 flags)
 		status = acpi_run_oshp(handle);
 		if (ACPI_SUCCESS(status))
 			goto got_one;
-		if (acpi_root_bridge(handle))
+		if (acpi_is_root_bridge(handle))
 			break;
 		chandle = handle;
 		status = acpi_get_parent(chandle, &handle);
@@ -449,42 +449,6 @@ got_one:
 }
 EXPORT_SYMBOL(acpi_get_hp_hw_control_from_firmware);
 
-/* acpi_root_bridge - check to see if this acpi object is a root bridge
- *
- * @handle - the acpi object in question.
- */
-int acpi_root_bridge(acpi_handle handle)
-{
-	acpi_status status;
-	struct acpi_device_info *info;
-	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
-	int i;
-
-	status = acpi_get_object_info(handle, &buffer);
-	if (ACPI_SUCCESS(status)) {
-		info = buffer.pointer;
-		if ((info->valid & ACPI_VALID_HID) &&
-			!strcmp(PCI_ROOT_HID_STRING,
-					info->hardware_id.value)) {
-			kfree(buffer.pointer);
-			return 1;
-		}
-		if (info->valid & ACPI_VALID_CID) {
-			for (i=0; i < info->compatibility_id.count; i++) {
-				if (!strcmp(PCI_ROOT_HID_STRING,
-					info->compatibility_id.id[i].value)) {
-					kfree(buffer.pointer);
-					return 1;
-				}
-			}
-		}
-		kfree(buffer.pointer);
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(acpi_root_bridge);
-
-
 static int is_ejectable(acpi_handle handle)
 {
 	acpi_status status;
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 3a6064bce561..fc6636e3300b 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -1631,7 +1631,7 @@ find_root_bridges(acpi_handle handle, u32 lvl, void *context, void **rv)
 {
 	int *count = (int *)context;
 
-	if (acpi_root_bridge(handle)) {
+	if (acpi_is_root_bridge(handle)) {
 		acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
 				handle_hotplug_event_bridge, NULL);
 			(*count)++;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index c34b11022908..96d593ee4859 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -369,6 +369,7 @@ struct device *acpi_get_physical_pci_device(acpi_handle);
 
 /* helper */
 acpi_handle acpi_get_child(acpi_handle, acpi_integer);
+int acpi_is_root_bridge(acpi_handle);
 acpi_handle acpi_get_pci_rootbridge_handle(unsigned int, unsigned int);
 #define DEVICE_ACPI_HANDLE(dev) ((acpi_handle)((dev)->archdata.acpi_handle))
 
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 20998746518e..a3576ef9fc74 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -226,7 +226,6 @@ struct hotplug_params {
 extern acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus,
 				struct hotplug_params *hpp);
 int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags);
-int acpi_root_bridge(acpi_handle handle);
 int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle);
 int acpi_pci_detect_ejectable(struct pci_bus *pbus);
 #endif
-- 
cgit v1.2.3-71-gd317


From ab52ae6db035fa425f90146327ab7d2c5d3e5654 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 16 Jun 2009 04:20:53 +0300
Subject: nfsd41: Backchannel: minorversion support for the back channel

Prepare to share backchannel code with NFSv4.1.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
[nfsd41: use nfsd4_cb_sequence for callback minorversion]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c     | 3 ++-
 fs/nfsd/nfs4state.c        | 1 +
 include/linux/nfsd/state.h | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 353eb4a0b847..3fd23f7aceca 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -141,6 +141,7 @@ struct nfs4_cb_compound_hdr {
 	u32		ident;
 	u32		nops;
 	__be32		*nops_p;
+	u32		minorversion;
 	u32		taglen;
 	char		*tag;
 };
@@ -209,7 +210,7 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 
 	RESERVE_SPACE(16);
 	WRITE32(0);            /* tag length is always 0 */
-	WRITE32(NFS4_MINOR_VERSION);
+	WRITE32(hdr->minorversion);
 	WRITE32(hdr->ident);
 	hdr->nops_p = p;
 	WRITE32(hdr->nops);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ef6944b19f06..980a216a48c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -984,6 +984,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 	if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
 	                 &cb->cb_addr, &cb->cb_port)))
 		goto out_err;
+	cb->cb_minorversion = 0;
 	cb->cb_prog = se->se_callback_prog;
 	cb->cb_ident = se->se_callback_ident;
 	return;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 105cc100de05..f5a95fd34312 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -85,7 +85,8 @@ struct nfs4_cb_conn {
 	u32                     cb_addr;
 	unsigned short          cb_port;
 	u32                     cb_prog;
-	u32                     cb_ident;
+	u32			cb_minorversion;
+	u32                     cb_ident;	/* minorversion 0 only */
 	/* RPC client info */
 	atomic_t		cb_set;     /* successful CB_NULL call */
 	struct rpc_clnt *       cb_client;
-- 
cgit v1.2.3-71-gd317


From 06d5caf47ef4fbd9efdceae33293c42778cb7b0c Mon Sep 17 00:00:00 2001
From: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Date: Tue, 16 Jun 2009 15:39:51 +0100
Subject: rfkill: don't restore software blocked state on persistent devices

The setting of the "persistent" flag is also made more explicit using
a new rfkill_init_sw_state() function, instead of special-casing
rfkill_set_sw_state() when it is called before registration.

Suspend is a bit of a corner case so we try to get away without adding
another hack to rfkill-input - it's going to be removed soon.
If the state does change over suspend, users will simply have to prod
rfkill-input twice in order to toggle the state.

Userspace policy agents will be able to implement a more consistent user
experience.  For example, they can avoid the above problem if they
toggle devices individually.  Then there would be no "global state"
to get out of sync.

Currently there are only two rfkill drivers with persistent soft-blocked
state.  thinkpad-acpi already checks the software state on resume.
eeepc-laptop will require modification.

Signed-off-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
CC: Marcel Holtmann <marcel@holtmann.org>
Acked-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/platform/x86/eeepc-laptop.c  |  8 ++++----
 drivers/platform/x86/thinkpad_acpi.c | 14 ++++++-------
 include/linux/rfkill.h               | 32 ++++++++++++++++++++++++-----
 net/rfkill/core.c                    | 40 ++++++++++++++++++++++--------------
 4 files changed, 63 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c
index 03bf522bd7ab..01682eca4360 100644
--- a/drivers/platform/x86/eeepc-laptop.c
+++ b/drivers/platform/x86/eeepc-laptop.c
@@ -675,8 +675,8 @@ static int eeepc_hotk_add(struct acpi_device *device)
 		if (!ehotk->eeepc_wlan_rfkill)
 			goto wlan_fail;
 
-		rfkill_set_sw_state(ehotk->eeepc_wlan_rfkill,
-				    get_acpi(CM_ASL_WLAN) != 1);
+		rfkill_init_sw_state(ehotk->eeepc_wlan_rfkill,
+				     get_acpi(CM_ASL_WLAN) != 1);
 		result = rfkill_register(ehotk->eeepc_wlan_rfkill);
 		if (result)
 			goto wlan_fail;
@@ -693,8 +693,8 @@ static int eeepc_hotk_add(struct acpi_device *device)
 		if (!ehotk->eeepc_bluetooth_rfkill)
 			goto bluetooth_fail;
 
-		rfkill_set_sw_state(ehotk->eeepc_bluetooth_rfkill,
-				    get_acpi(CM_ASL_BLUETOOTH) != 1);
+		rfkill_init_sw_state(ehotk->eeepc_bluetooth_rfkill,
+				     get_acpi(CM_ASL_BLUETOOTH) != 1);
 		result = rfkill_register(ehotk->eeepc_bluetooth_rfkill);
 		if (result)
 			goto bluetooth_fail;
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index 86e958539f46..40d64c03278c 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -1163,8 +1163,8 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id,
 {
 	struct tpacpi_rfk *atp_rfk;
 	int res;
-	bool initial_sw_state = false;
-	int initial_sw_status;
+	bool sw_state = false;
+	int sw_status;
 
 	BUG_ON(id >= TPACPI_RFK_SW_MAX || tpacpi_rfkill_switches[id]);
 
@@ -1185,17 +1185,17 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id,
 	atp_rfk->id = id;
 	atp_rfk->ops = tp_rfkops;
 
-	initial_sw_status = (tp_rfkops->get_status)();
-	if (initial_sw_status < 0) {
+	sw_status = (tp_rfkops->get_status)();
+	if (sw_status < 0) {
 		printk(TPACPI_ERR
 			"failed to read initial state for %s, error %d\n",
-			name, initial_sw_status);
+			name, sw_status);
 	} else {
-		initial_sw_state = (initial_sw_status == TPACPI_RFK_RADIO_OFF);
+		sw_state = (sw_status == TPACPI_RFK_RADIO_OFF);
 		if (set_default) {
 			/* try to keep the initial state, since we ask the
 			 * firmware to preserve it across S5 in NVRAM */
-			rfkill_set_sw_state(atp_rfk->rfkill, initial_sw_state);
+			rfkill_init_sw_state(atp_rfk->rfkill, sw_state);
 		}
 	}
 	rfkill_set_hw_state(atp_rfk->rfkill, tpacpi_rfk_check_hwblock_state());
diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index 16e39c7a67fc..dcac724340d8 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -160,8 +160,9 @@ struct rfkill * __must_check rfkill_alloc(const char *name,
  * the rfkill structure. Before calling this function the driver needs
  * to be ready to service method calls from rfkill.
  *
- * If the software blocked state is not set before registration,
- * set_block will be called to initialize it to a default value.
+ * If rfkill_init_sw_state() is not called before registration,
+ * set_block() will be called to initialize the software blocked state
+ * to a default value.
  *
  * If the hardware blocked state is not set before registration,
  * it is assumed to be unblocked.
@@ -234,9 +235,11 @@ bool __must_check rfkill_set_hw_state(struct rfkill *rfkill, bool blocked);
  * rfkill drivers that get events when the soft-blocked state changes
  * (yes, some platforms directly act on input but allow changing again)
  * use this function to notify the rfkill core (and through that also
- * userspace) of the current state.  It is not necessary to notify on
- * resume; since hibernation can always change the soft-blocked state,
- * the rfkill core will unconditionally restore the previous state.
+ * userspace) of the current state.
+ *
+ * Drivers should also call this function after resume if the state has
+ * been changed by the user.  This only makes sense for "persistent"
+ * devices (see rfkill_init_sw_state()).
  *
  * This function can be called in any context, even from within rfkill
  * callbacks.
@@ -246,6 +249,21 @@ bool __must_check rfkill_set_hw_state(struct rfkill *rfkill, bool blocked);
  */
 bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked);
 
+/**
+ * rfkill_init_sw_state - Initialize persistent software block state
+ * @rfkill: pointer to the rfkill class to modify.
+ * @blocked: the current software block state to set
+ *
+ * rfkill drivers that preserve their software block state over power off
+ * use this function to notify the rfkill core (and through that also
+ * userspace) of their initial state.  It should only be used before
+ * registration.
+ *
+ * In addition, it marks the device as "persistent".  Persistent devices
+ * are expected to preserve preserve their own state when suspended.
+ */
+void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked);
+
 /**
  * rfkill_set_states - Set the internal rfkill block states
  * @rfkill: pointer to the rfkill class to modify.
@@ -307,6 +325,10 @@ static inline bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 	return blocked;
 }
 
+static inline void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked)
+{
+}
+
 static inline void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 {
 }
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 868d79f8ac1d..dcf8df7c573c 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -56,7 +56,6 @@ struct rfkill {
 	u32			idx;
 
 	bool			registered;
-	bool			suspended;
 	bool			persistent;
 
 	const struct rfkill_ops	*ops;
@@ -224,7 +223,7 @@ static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op)
 
 static void rfkill_event(struct rfkill *rfkill)
 {
-	if (!rfkill->registered || rfkill->suspended)
+	if (!rfkill->registered)
 		return;
 
 	kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE);
@@ -508,19 +507,32 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 	blocked = blocked || hwblock;
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
-	if (!rfkill->registered) {
-		rfkill->persistent = true;
-	} else {
-		if (prev != blocked && !hwblock)
-			schedule_work(&rfkill->uevent_work);
+	if (!rfkill->registered)
+		return blocked;
 
-		rfkill_led_trigger_event(rfkill);
-	}
+	if (prev != blocked && !hwblock)
+		schedule_work(&rfkill->uevent_work);
+
+	rfkill_led_trigger_event(rfkill);
 
 	return blocked;
 }
 EXPORT_SYMBOL(rfkill_set_sw_state);
 
+void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked)
+{
+	unsigned long flags;
+
+	BUG_ON(!rfkill);
+	BUG_ON(rfkill->registered);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	__rfkill_set_sw_state(rfkill, blocked);
+	rfkill->persistent = true;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+}
+EXPORT_SYMBOL(rfkill_init_sw_state);
+
 void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 {
 	unsigned long flags;
@@ -718,8 +730,6 @@ static int rfkill_suspend(struct device *dev, pm_message_t state)
 
 	rfkill_pause_polling(rfkill);
 
-	rfkill->suspended = true;
-
 	return 0;
 }
 
@@ -728,10 +738,10 @@ static int rfkill_resume(struct device *dev)
 	struct rfkill *rfkill = to_rfkill(dev);
 	bool cur;
 
-	cur = !!(rfkill->state & RFKILL_BLOCK_SW);
-	rfkill_set_block(rfkill, cur);
-
-	rfkill->suspended = false;
+	if (!rfkill->persistent) {
+		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
+		rfkill_set_block(rfkill, cur);
+	}
 
 	rfkill_resume_polling(rfkill);
 
-- 
cgit v1.2.3-71-gd317


From 464902e812025792c9e33e19e1555c343672d5cf Mon Sep 17 00:00:00 2001
From: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Date: Tue, 16 Jun 2009 14:54:04 +0100
Subject: rfkill: export persistent attribute in sysfs

This information allows userspace to implement a hybrid policy where
it can store the rfkill soft-blocked state in platform non-volatile
storage if available, and if not then file-based storage can be used.

Some users prefer platform non-volatile storage because of the behaviour
when dual-booting multiple versions of Linux, or if the rfkill setting
is changed in the BIOS setting screens, or if the BIOS responds to
wireless-toggle hotkeys itself before the relevant platform driver has
been loaded.

Signed-off-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Acked-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 Documentation/rfkill.txt |  2 ++
 include/linux/rfkill.h   |  5 +++--
 net/rfkill/core.c        | 10 ++++++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index c8acd8659e91..b4860509c319 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -111,6 +111,8 @@ following attributes:
 
 	name: Name assigned by driver to this key (interface or driver name).
 	type: Driver type string ("wlan", "bluetooth", etc).
+	persistent: Whether the soft blocked state is initialised from
+	            non-volatile storage at startup.
 	state: Current state of the transmitter
 		0: RFKILL_STATE_SOFT_BLOCKED
 			transmitter is turned off by software
diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index dcac724340d8..e73e2429a1b1 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -259,8 +259,9 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked);
  * userspace) of their initial state.  It should only be used before
  * registration.
  *
- * In addition, it marks the device as "persistent".  Persistent devices
- * are expected to preserve preserve their own state when suspended.
+ * In addition, it marks the device as "persistent", an attribute which
+ * can be read by userspace.  Persistent devices are expected to preserve
+ * their own state when suspended.
  */
 void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked);
 
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index dcf8df7c573c..79693fe2001e 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -610,6 +610,15 @@ static ssize_t rfkill_idx_show(struct device *dev,
 	return sprintf(buf, "%d\n", rfkill->idx);
 }
 
+static ssize_t rfkill_persistent_show(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%d\n", rfkill->persistent);
+}
+
 static u8 user_state_from_blocked(unsigned long state)
 {
 	if (state & RFKILL_BLOCK_HW)
@@ -668,6 +677,7 @@ static struct device_attribute rfkill_dev_attrs[] = {
 	__ATTR(name, S_IRUGO, rfkill_name_show, NULL),
 	__ATTR(type, S_IRUGO, rfkill_type_show, NULL),
 	__ATTR(index, S_IRUGO, rfkill_idx_show, NULL),
+	__ATTR(persistent, S_IRUGO, rfkill_persistent_show, NULL),
 	__ATTR(state, S_IRUGO|S_IWUSR, rfkill_state_show, rfkill_state_store),
 	__ATTR(claim, S_IRUGO|S_IWUSR, rfkill_claim_show, rfkill_claim_store),
 	__ATTR_NULL
-- 
cgit v1.2.3-71-gd317


From f9ab94cee313746573b2d693bc2afb807ebb0998 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:20 +0100
Subject: dm: introduce num_flush_requests

Introduce num_flush_requests for a target to set to say how many flush
instructions (empty barriers) it wants to receive.  These are sent by
__clone_and_map_empty_barrier with map_info->flush_request going from 0
to (num_flush_requests - 1).

Old targets without flush support won't receive any flush requests.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c               | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h | 11 +++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7d9ca7094337..badb7519cccb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -750,6 +750,40 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 	return clone;
 }
 
+static void __flush_target(struct clone_info *ci, struct dm_target *ti,
+			  unsigned flush_nr)
+{
+	struct dm_target_io *tio = alloc_tio(ci->md);
+	struct bio *clone;
+
+	tio->io = ci->io;
+	tio->ti = ti;
+
+	memset(&tio->info, 0, sizeof(tio->info));
+	tio->info.flush_request = flush_nr;
+
+	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
+	__bio_clone(clone, ci->bio);
+	clone->bi_destructor = dm_bio_destructor;
+
+	__map_bio(ti, clone, tio);
+}
+
+static int __clone_and_map_empty_barrier(struct clone_info *ci)
+{
+	unsigned target_nr = 0, flush_nr;
+	struct dm_target *ti;
+
+	while ((ti = dm_table_get_target(ci->map, target_nr++)))
+		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
+		     flush_nr++)
+			__flush_target(ci, ti, flush_nr);
+
+	ci->sector_count = 0;
+
+	return 0;
+}
+
 static int __clone_and_map(struct clone_info *ci)
 {
 	struct bio *clone, *bio = ci->bio;
@@ -757,6 +791,9 @@ static int __clone_and_map(struct clone_info *ci)
 	sector_t len = 0, max;
 	struct dm_target_io *tio;
 
+	if (unlikely(bio_empty_barrier(bio)))
+		return __clone_and_map_empty_barrier(ci);
+
 	ti = dm_table_find_target(ci->map, ci->sector);
 	if (!dm_target_is_valid(ti))
 		return -EIO;
@@ -877,6 +914,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->md = md;
 	ci.sector = bio->bi_sector;
 	ci.sector_count = bio_sectors(bio);
+	if (unlikely(bio_empty_barrier(bio)))
+		ci.sector_count = 1;
 	ci.idx = bio->bi_idx;
 
 	start_io_acct(ci.io);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 49c2362977fd..fc36a4d07723 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -21,6 +21,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 union map_info {
 	void *ptr;
 	unsigned long long ll;
+	unsigned flush_request;
 };
 
 /*
@@ -167,6 +168,16 @@ struct dm_target {
 	/* Always a power of 2 */
 	sector_t split_io;
 
+	/*
+	 * A number of zero-length barrier requests that will be submitted
+	 * to the target for the purpose of flushing cache.
+	 *
+	 * The request number will be placed in union map_info->flush_request.
+	 * It is a responsibility of the target driver to remap these requests
+	 * to the real underlying devices.
+	 */
+	unsigned num_flush_requests;
+
 	/*
 	 * These are automatically filled in by
 	 * dm_table_get_device.
-- 
cgit v1.2.3-71-gd317


From 60935eb21d3c5bac79618000f38f92c249d153c4 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Mon, 22 Jun 2009 10:12:30 +0100
Subject: dm ioctl: support cookies for udev

Add support for passing a 32 bit "cookie" into the kernel with the
DM_SUSPEND, DM_DEV_RENAME and DM_DEV_REMOVE ioctls.  The (unsigned)
value of this cookie is returned to userspace alongside the uevents
issued by these ioctls in the variable DM_COOKIE.

This means the userspace process issuing these ioctls can be notified
by udev after udev has completed any actions triggered.

To minimise the interface extension, we pass the cookie into the
kernel in the event_nr field which is otherwise unused when calling
these ioctls.  Incrementing the version number allows userspace to
determine in advance whether or not the kernel supports the cookie.
If the kernel does support this but userspace does not, there should
be no impact as the new variable will just get ignored.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c    | 14 ++++++++++----
 drivers/md/dm.c          | 25 +++++++++++++++++++------
 drivers/md/dm.h          |  3 ++-
 include/linux/dm-ioctl.h | 14 ++++++++++++--
 4 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1128d3fba797..1c871736f48c 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -276,7 +276,7 @@ retry:
 	up_write(&_hash_lock);
 }
 
-static int dm_hash_rename(const char *old, const char *new)
+static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
 {
 	char *new_name, *old_name;
 	struct hash_cell *hc;
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new)
 		dm_table_put(table);
 	}
 
-	dm_kobject_uevent(hc->md);
+	dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie);
 
 	dm_put(hc->md);
 	up_write(&_hash_lock);
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 
 	__hash_remove(hc);
 	up_write(&_hash_lock);
+
+	dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr);
+
 	dm_put(md);
 	param->data_size = 0;
 	return 0;
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 		return r;
 
 	param->data_size = 0;
-	return dm_hash_rename(param->name, new_name);
+	return dm_hash_rename(param->event_nr, param->name, new_name);
 }
 
 static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param)
 	if (dm_suspended(md))
 		r = dm_resume(md);
 
-	if (!r)
+
+	if (!r) {
+		dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
 		r = __dev_status(md, param);
+	}
 
 	dm_put(md);
 	return r;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 36142e947ffc..a9210bb594e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,13 @@
 
 #define DM_MSG_PREFIX "core"
 
+/*
+ * Cookies are numeric values sent with CHANGE and REMOVE
+ * uevents while resuming, removing or renaming the device.
+ */
+#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
+#define DM_COOKIE_LENGTH 24
+
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -1731,11 +1738,7 @@ int dm_resume(struct mapped_device *md)
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
 	dm_table_unplug_all(map);
-
-	dm_kobject_uevent(md);
-
 	r = 0;
-
 out:
 	dm_table_put(map);
 	mutex_unlock(&md->suspend_lock);
@@ -1746,9 +1749,19 @@ out:
 /*-----------------------------------------------------------------
  * Event notification.
  *---------------------------------------------------------------*/
-void dm_kobject_uevent(struct mapped_device *md)
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+		       unsigned cookie)
 {
-	kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
+	char udev_cookie[DM_COOKIE_LENGTH];
+	char *envp[] = { udev_cookie, NULL };
+
+	if (!cookie)
+		kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
+	else {
+		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
+			 DM_COOKIE_ENV_VAR_NAME, cookie);
+		kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
+	}
 }
 
 uint32_t dm_next_uevent_seq(struct mapped_device *md)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a31506d93e91..b5935c610c44 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -92,7 +92,8 @@ void dm_stripe_exit(void);
 int dm_open_count(struct mapped_device *md);
 int dm_lock_for_deletion(struct mapped_device *md);
 
-void dm_kobject_uevent(struct mapped_device *md);
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+		       unsigned cookie);
 
 int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 48e44ee2b466..2ab84c83c31a 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -123,6 +123,16 @@ struct dm_ioctl {
 	__u32 target_count;	/* in/out */
 	__s32 open_count;	/* out */
 	__u32 flags;		/* in/out */
+
+	/*
+	 * event_nr holds either the event number (input and output) or the
+	 * udev cookie value (input only).
+	 * The DM_DEV_WAIT ioctl takes an event number as input.
+	 * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls
+	 * use the field as a cookie to return in the DM_COOKIE
+	 * variable with the uevents they issue.
+	 * For output, the ioctls return the event number, not the cookie.
+	 */
 	__u32 event_nr;      	/* in/out */
 	__u32 padding;
 
@@ -256,9 +266,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	14
+#define DM_VERSION_MINOR	15
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2008-04-23)"
+#define DM_VERSION_EXTRA	"-ioctl (2009-04-01)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3-71-gd317


From 5ab97588fb266187b88d1ad893251c94388f18ba Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:32 +0100
Subject: dm table: replace struct io_restrictions with struct queue_limits

Use blk_stack_limits() to stack block limits (including topology) rather
than duplicating the equivalent logic within Device Mapper.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c         | 138 +++++++++++++-----------------------------
 include/linux/device-mapper.h |  16 +----
 2 files changed, 45 insertions(+), 109 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e3bcfb8b15a1..41ec2bf9fbe9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -66,7 +66,7 @@ struct dm_table {
 	 * These are optimistic limits taken from all the
 	 * targets, some targets will need smaller limits.
 	 */
-	struct io_restrictions limits;
+	struct queue_limits limits;
 
 	/* events get handed up using this callback */
 	void (*event_fn)(void *);
@@ -88,43 +88,6 @@ static unsigned int int_log(unsigned int n, unsigned int base)
 	return result;
 }
 
-/*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-
-/*
- * Combine two io_restrictions, always taking the lower value.
- */
-static void combine_restrictions_low(struct io_restrictions *lhs,
-				     struct io_restrictions *rhs)
-{
-	lhs->max_sectors =
-		min_not_zero(lhs->max_sectors, rhs->max_sectors);
-
-	lhs->max_phys_segments =
-		min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments);
-
-	lhs->max_hw_segments =
-		min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
-
-	lhs->logical_block_size = max(lhs->logical_block_size,
-				      rhs->logical_block_size);
-
-	lhs->max_segment_size =
-		min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
-
-	lhs->max_hw_sectors =
-		min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors);
-
-	lhs->seg_boundary_mask =
-		min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
-
-	lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn);
-
-	lhs->no_cluster |= rhs->no_cluster;
-}
-
 /*
  * Calculate the index of the child node of the n'th node k'th key.
  */
@@ -511,10 +474,14 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 	return 0;
 }
 
+/*
+ * Returns the minimum that is _not_ zero, unless both are zero.
+ */
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+
 void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
-	struct io_restrictions *rs = &ti->limits;
 	char b[BDEVNAME_SIZE];
 
 	if (unlikely(!q)) {
@@ -523,15 +490,9 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 		return;
 	}
 
-	/*
-	 * Combine the device limits low.
-	 *
-	 * FIXME: if we move an io_restriction struct
-	 *        into q this would just be a call to
-	 *        combine_restrictions_low()
-	 */
-	rs->max_sectors =
-		min_not_zero(rs->max_sectors, queue_max_sectors(q));
+	if (blk_stack_limits(&ti->limits, &q->limits, 0) < 0)
+		DMWARN("%s: target device %s is misaligned",
+		       dm_device_name(ti->table->md), bdevname(bdev, b));
 
 	/*
 	 * Check if merge fn is supported.
@@ -540,33 +501,9 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	 */
 
 	if (q->merge_bvec_fn && !ti->type->merge)
-		rs->max_sectors =
-			min_not_zero(rs->max_sectors,
+		ti->limits.max_sectors =
+			min_not_zero(ti->limits.max_sectors,
 				     (unsigned int) (PAGE_SIZE >> 9));
-
-	rs->max_phys_segments =
-		min_not_zero(rs->max_phys_segments,
-			     queue_max_phys_segments(q));
-
-	rs->max_hw_segments =
-		min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
-
-	rs->logical_block_size = max(rs->logical_block_size,
-				     queue_logical_block_size(q));
-
-	rs->max_segment_size =
-		min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
-
-	rs->max_hw_sectors =
-		min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
-
-	rs->seg_boundary_mask =
-		min_not_zero(rs->seg_boundary_mask,
-			     queue_segment_boundary(q));
-
-	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
-
-	rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
@@ -704,24 +641,32 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 	return 0;
 }
 
-static void check_for_valid_limits(struct io_restrictions *rs)
+static void init_valid_queue_limits(struct queue_limits *limits)
 {
-	if (!rs->max_sectors)
-		rs->max_sectors = SAFE_MAX_SECTORS;
-	if (!rs->max_hw_sectors)
-		rs->max_hw_sectors = SAFE_MAX_SECTORS;
-	if (!rs->max_phys_segments)
-		rs->max_phys_segments = MAX_PHYS_SEGMENTS;
-	if (!rs->max_hw_segments)
-		rs->max_hw_segments = MAX_HW_SEGMENTS;
-	if (!rs->logical_block_size)
-		rs->logical_block_size = 1 << SECTOR_SHIFT;
-	if (!rs->max_segment_size)
-		rs->max_segment_size = MAX_SEGMENT_SIZE;
-	if (!rs->seg_boundary_mask)
-		rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
-	if (!rs->bounce_pfn)
-		rs->bounce_pfn = -1;
+	if (!limits->max_sectors)
+		limits->max_sectors = SAFE_MAX_SECTORS;
+	if (!limits->max_hw_sectors)
+		limits->max_hw_sectors = SAFE_MAX_SECTORS;
+	if (!limits->max_phys_segments)
+		limits->max_phys_segments = MAX_PHYS_SEGMENTS;
+	if (!limits->max_hw_segments)
+		limits->max_hw_segments = MAX_HW_SEGMENTS;
+	if (!limits->logical_block_size)
+		limits->logical_block_size = 1 << SECTOR_SHIFT;
+	if (!limits->physical_block_size)
+		limits->physical_block_size = 1 << SECTOR_SHIFT;
+	if (!limits->io_min)
+		limits->io_min = 1 << SECTOR_SHIFT;
+	if (!limits->max_segment_size)
+		limits->max_segment_size = MAX_SEGMENT_SIZE;
+	if (!limits->seg_boundary_mask)
+		limits->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+	if (!limits->bounce_pfn)
+		limits->bounce_pfn = -1;
+	/*
+	 * The other fields (alignment_offset, io_opt, misaligned)
+	 * hold 0 from the kzalloc().
+	 */
 }
 
 /*
@@ -841,9 +786,12 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-	/* FIXME: the plan is to combine high here and then have
-	 * the merge fn apply the target level restrictions. */
-	combine_restrictions_low(&t->limits, &tgt->limits);
+	if (blk_stack_limits(&t->limits, &tgt->limits, 0) < 0)
+		DMWARN("%s: target device (start sect %llu len %llu) "
+		       "is misaligned",
+		       dm_device_name(t->md),
+		       (unsigned long long) tgt->begin,
+		       (unsigned long long) tgt->len);
 	return 0;
 
  bad:
@@ -886,7 +834,7 @@ int dm_table_complete(struct dm_table *t)
 	int r = 0;
 	unsigned int leaf_nodes;
 
-	check_for_valid_limits(&t->limits);
+	init_valid_queue_limits(&t->limits);
 
 	r = validate_hardware_logical_block_alignment(t);
 	if (r)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index fc36a4d07723..236880c1dc3f 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -144,18 +144,6 @@ struct target_type {
 	struct list_head list;
 };
 
-struct io_restrictions {
-	unsigned long bounce_pfn;
-	unsigned long seg_boundary_mask;
-	unsigned max_hw_sectors;
-	unsigned max_sectors;
-	unsigned max_segment_size;
-	unsigned short logical_block_size;
-	unsigned short max_hw_segments;
-	unsigned short max_phys_segments;
-	unsigned char no_cluster; /* inverted so that 0 is default */
-};
-
 struct dm_target {
 	struct dm_table *table;
 	struct target_type *type;
@@ -164,7 +152,7 @@ struct dm_target {
 	sector_t begin;
 	sector_t len;
 
-	/* FIXME: turn this into a mask, and merge with io_restrictions */
+	/* FIXME: turn this into a mask, and merge with queue_limits */
 	/* Always a power of 2 */
 	sector_t split_io;
 
@@ -182,7 +170,7 @@ struct dm_target {
 	 * These are automatically filled in by
 	 * dm_table_get_device.
 	 */
-	struct io_restrictions limits;
+	struct queue_limits limits;
 
 	/* target specific data */
 	void *private;
-- 
cgit v1.2.3-71-gd317


From af4874e03ed82f050d5872d8c39ce64bf16b5c38 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:33 +0100
Subject: dm targets: introduce iterate devices fn

Add .iterate_devices to 'struct target_type' to allow a function to be
called for all devices in a DM target.  Implement it for all targets
except those in dm-snap.c (origin and snapshot).

(The raid1 version number jumps to 1.12 because we originally reserved
1.1 to 1.11 for 'block_on_error' but ended up using 'handle_errors'
instead.)

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: martin.petersen@oracle.com
---
 drivers/md/dm-crypt.c         | 11 ++++++++++-
 drivers/md/dm-delay.c         | 20 +++++++++++++++++++-
 drivers/md/dm-linear.c        | 11 ++++++++++-
 drivers/md/dm-mpath.c         | 23 ++++++++++++++++++++++-
 drivers/md/dm-raid1.c         | 17 ++++++++++++++++-
 drivers/md/dm-stripe.c        | 18 +++++++++++++++++-
 include/linux/device-mapper.h | 11 +++++++++++
 7 files changed, 105 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 04db6c4004a8..9933eb861c71 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1313,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
+static int crypt_iterate_devices(struct dm_target *ti,
+				 iterate_devices_callout_fn fn, void *data)
+{
+	struct crypt_config *cc = ti->private;
+
+	return fn(ti, cc->dev, cc->start, data);
+}
+
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version= {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
@@ -1326,6 +1334,7 @@ static struct target_type crypt_target = {
 	.resume = crypt_resume,
 	.message = crypt_message,
 	.merge  = crypt_merge,
+	.iterate_devices = crypt_iterate_devices,
 };
 
 static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 8ad8a9044bbf..4e5b843cd4d7 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -318,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int delay_iterate_devices(struct dm_target *ti,
+				 iterate_devices_callout_fn fn, void *data)
+{
+	struct delay_c *dc = ti->private;
+	int ret = 0;
+
+	ret = fn(ti, dc->dev_read, dc->start_read, data);
+	if (ret)
+		goto out;
+
+	if (dc->dev_write)
+		ret = fn(ti, dc->dev_write, dc->start_write, data);
+
+out:
+	return ret;
+}
+
 static struct target_type delay_target = {
 	.name	     = "delay",
-	.version     = {1, 0, 2},
+	.version     = {1, 1, 0},
 	.module      = THIS_MODULE,
 	.ctr	     = delay_ctr,
 	.dtr	     = delay_dtr,
@@ -328,6 +345,7 @@ static struct target_type delay_target = {
 	.presuspend  = delay_presuspend,
 	.resume	     = delay_resume,
 	.status	     = delay_status,
+	.iterate_devices = delay_iterate_devices,
 };
 
 static int __init dm_delay_init(void)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ecbb17421da4..9184b6deb868 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -134,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
+static int linear_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct linear_c *lc = ti->private;
+
+	return fn(ti, lc->dev, lc->start, data);
+}
+
 static struct target_type linear_target = {
 	.name   = "linear",
-	.version= {1, 0, 3},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr    = linear_ctr,
 	.dtr    = linear_dtr,
@@ -144,6 +152,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
 	.merge  = linear_merge,
+	.iterate_devices = linear_iterate_devices,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 890c0e8ed13e..f8aeaaa54afe 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1450,12 +1450,32 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }
 
+static int multipath_iterate_devices(struct dm_target *ti,
+				     iterate_devices_callout_fn fn, void *data)
+{
+	struct multipath *m = ti->private;
+	struct priority_group *pg;
+	struct pgpath *p;
+	int ret = 0;
+
+	list_for_each_entry(pg, &m->priority_groups, list) {
+		list_for_each_entry(p, &pg->pgpaths, list) {
+			ret = fn(ti, p->path.dev, ti->begin, data);
+			if (ret)
+				goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
 /*-----------------------------------------------------------------
  * Module setup
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 0, 5},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
@@ -1466,6 +1486,7 @@ static struct target_type multipath_target = {
 	.status = multipath_status,
 	.message = multipath_message,
 	.ioctl  = multipath_ioctl,
+	.iterate_devices = multipath_iterate_devices,
 };
 
 static int __init dm_multipath_init(void)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 076fbb4e967a..ce8868c768cc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int mirror_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct mirror_set *ms = ti->private;
+	int ret = 0;
+	unsigned i;
+
+	for (i = 0; !ret && i < ms->nr_mirrors; i++)
+		ret = fn(ti, ms->mirror[i].dev,
+			 ms->mirror[i].offset, data);
+
+	return ret;
+}
+
 static struct target_type mirror_target = {
 	.name	 = "mirror",
-	.version = {1, 0, 20},
+	.version = {1, 12, 0},
 	.module	 = THIS_MODULE,
 	.ctr	 = mirror_ctr,
 	.dtr	 = mirror_dtr,
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = {
 	.postsuspend = mirror_postsuspend,
 	.resume	 = mirror_resume,
 	.status	 = mirror_status,
+	.iterate_devices = mirror_iterate_devices,
 };
 
 static int __init dm_mirror_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c64fe827a5f1..b240e85ae39a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -313,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 	return error;
 }
 
+static int stripe_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct stripe_c *sc = ti->private;
+	int ret = 0;
+	unsigned i = 0;
+
+	do
+		ret = fn(ti, sc->stripe[i].dev,
+			 sc->stripe[i].physical_start, data);
+	while (!ret && ++i < sc->stripes);
+
+	return ret;
+}
+
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
 	.map    = stripe_map,
 	.end_io = stripe_end_io,
 	.status = stripe_status,
+	.iterate_devices = stripe_iterate_devices,
 };
 
 int __init dm_stripe_init(void)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 236880c1dc3f..deac3b4e5e18 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -11,6 +11,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 
+struct dm_dev;
 struct dm_target;
 struct dm_table;
 struct mapped_device;
@@ -81,6 +82,15 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
 			    struct bio_vec *biovec, int max_size);
 
+typedef int (*iterate_devices_callout_fn) (struct dm_target *ti,
+					   struct dm_dev *dev,
+					   sector_t physical_start,
+					   void *data);
+
+typedef int (*dm_iterate_devices_fn) (struct dm_target *ti,
+				      iterate_devices_callout_fn fn,
+				      void *data);
+
 /*
  * Returns:
  *    0: The target can handle the next I/O immediately.
@@ -139,6 +149,7 @@ struct target_type {
 	dm_ioctl_fn ioctl;
 	dm_merge_fn merge;
 	dm_busy_fn busy;
+	dm_iterate_devices_fn iterate_devices;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
-- 
cgit v1.2.3-71-gd317


From 754c5fc7ebb417b23601a6222a6005cc2e7f2913 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:34 +0100
Subject: dm: calculate queue limits during resume not load

Currently, device-mapper maintains a separate instance of 'struct
queue_limits' for each table of each device.  When the configuration of
a device is to be changed, first its table is loaded and this structure
is populated, then the device is 'resumed' and the calculated
queue_limits are applied.

This places restrictions on how userspace may process related devices,
where it is often advantageous to 'load' tables for several devices
at once before 'resuming' them together.  As the new queue_limits
only take effect after the 'resume', if they are changing and one
device uses another, the latter must be 'resumed' before the former
may be 'loaded'.

This patch moves the calculation of these queue_limits out of
the 'load' operation into 'resume'.  Since we are no longer
pre-calculating this struct, we no longer need to maintain copies
within our dm structs.

dm_set_device_limits() now passes the 'start' of the device's
data area (aka pe_start) as the 'offset' to blk_stack_limits().

init_valid_queue_limits() is replaced by blk_set_default_limits().

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: martin.petersen@oracle.com
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c         | 185 +++++++++++++++++++++++-------------------
 drivers/md/dm.c               |  12 ++-
 drivers/md/dm.h               |   5 +-
 include/linux/device-mapper.h |  10 +--
 4 files changed, 117 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 267817edc844..09a57113955e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -62,12 +62,6 @@ struct dm_table {
 	/* a list of devices used by this table */
 	struct list_head devices;
 
-	/*
-	 * These are optimistic limits taken from all the
-	 * targets, some targets will need smaller limits.
-	 */
-	struct queue_limits limits;
-
 	/* events get handed up using this callback */
 	void (*event_fn)(void *);
 	void *event_context;
@@ -346,18 +340,21 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 /*
  * If possible, this checks an area of a destination device is valid.
  */
-static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev,
-			     sector_t start, sector_t len)
+static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
+				sector_t start, void *data)
 {
-	sector_t dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+	struct queue_limits *limits = data;
+	struct block_device *bdev = dev->bdev;
+	sector_t dev_size =
+		i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 	unsigned short logical_block_size_sectors =
-		ti->limits.logical_block_size >> SECTOR_SHIFT;
+		limits->logical_block_size >> SECTOR_SHIFT;
 	char b[BDEVNAME_SIZE];
 
 	if (!dev_size)
 		return 1;
 
-	if ((start >= dev_size) || (start + len > dev_size)) {
+	if ((start >= dev_size) || (start + ti->len > dev_size)) {
 		DMWARN("%s: %s too small for target",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
 		return 0;
@@ -371,16 +368,16 @@ static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev,
 		       "logical block size %hu of %s",
 		       dm_device_name(ti->table->md),
 		       (unsigned long long)start,
-		       ti->limits.logical_block_size, bdevname(bdev, b));
+		       limits->logical_block_size, bdevname(bdev, b));
 		return 0;
 	}
 
-	if (len & (logical_block_size_sectors - 1)) {
+	if (ti->len & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: len=%llu not aligned to h/w "
 		       "logical block size %hu of %s",
 		       dm_device_name(ti->table->md),
-		       (unsigned long long)len,
-		       ti->limits.logical_block_size, bdevname(bdev, b));
+		       (unsigned long long)ti->len,
+		       limits->logical_block_size, bdevname(bdev, b));
 		return 0;
 	}
 
@@ -479,18 +476,21 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
  */
 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 
-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+			 sector_t start, void *data)
 {
+	struct queue_limits *limits = data;
+	struct block_device *bdev = dev->bdev;
 	struct request_queue *q = bdev_get_queue(bdev);
 	char b[BDEVNAME_SIZE];
 
 	if (unlikely(!q)) {
 		DMWARN("%s: Cannot set limits for nonexistent device %s",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
-		return;
+		return 0;
 	}
 
-	if (blk_stack_limits(&ti->limits, &q->limits, 0) < 0)
+	if (blk_stack_limits(limits, &q->limits, start) < 0)
 		DMWARN("%s: target device %s is misaligned",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
 
@@ -501,32 +501,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	 */
 
 	if (q->merge_bvec_fn && !ti->type->merge)
-		ti->limits.max_sectors =
-			min_not_zero(ti->limits.max_sectors,
+		limits->max_sectors =
+			min_not_zero(limits->max_sectors,
 				     (unsigned int) (PAGE_SIZE >> 9));
+	return 0;
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
 int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
 		  sector_t len, fmode_t mode, struct dm_dev **result)
 {
-	int r = __table_get_device(ti->table, ti, path,
-				   start, len, mode, result);
-
-	if (r)
-		return r;
-
-	dm_set_device_limits(ti, (*result)->bdev);
-
-	if (!device_area_is_valid(ti, (*result)->bdev, start, len)) {
-		dm_put_device(ti, *result);
-		*result = NULL;
-		return -EINVAL;
-	}
-
-	return r;
+	return __table_get_device(ti->table, ti, path,
+				  start, len, mode, result);
 }
 
+
 /*
  * Decrement a devices use count and remove it if necessary.
  */
@@ -641,34 +630,6 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 	return 0;
 }
 
-static void init_valid_queue_limits(struct queue_limits *limits)
-{
-	if (!limits->max_sectors)
-		limits->max_sectors = SAFE_MAX_SECTORS;
-	if (!limits->max_hw_sectors)
-		limits->max_hw_sectors = SAFE_MAX_SECTORS;
-	if (!limits->max_phys_segments)
-		limits->max_phys_segments = MAX_PHYS_SEGMENTS;
-	if (!limits->max_hw_segments)
-		limits->max_hw_segments = MAX_HW_SEGMENTS;
-	if (!limits->logical_block_size)
-		limits->logical_block_size = 1 << SECTOR_SHIFT;
-	if (!limits->physical_block_size)
-		limits->physical_block_size = 1 << SECTOR_SHIFT;
-	if (!limits->io_min)
-		limits->io_min = 1 << SECTOR_SHIFT;
-	if (!limits->max_segment_size)
-		limits->max_segment_size = MAX_SEGMENT_SIZE;
-	if (!limits->seg_boundary_mask)
-		limits->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
-	if (!limits->bounce_pfn)
-		limits->bounce_pfn = -1;
-	/*
-	 * The other fields (alignment_offset, io_opt, misaligned)
-	 * hold 0 from the kzalloc().
-	 */
-}
-
 /*
  * Impose necessary and sufficient conditions on a devices's table such
  * that any incoming bio which respects its logical_block_size can be
@@ -676,14 +637,15 @@ static void init_valid_queue_limits(struct queue_limits *limits)
  * two or more targets, the size of each piece it gets split into must
  * be compatible with the logical_block_size of the target processing it.
  */
-static int validate_hardware_logical_block_alignment(struct dm_table *table)
+static int validate_hardware_logical_block_alignment(struct dm_table *table,
+						 struct queue_limits *limits)
 {
 	/*
 	 * This function uses arithmetic modulo the logical_block_size
 	 * (in units of 512-byte sectors).
 	 */
 	unsigned short device_logical_block_size_sects =
-		table->limits.logical_block_size >> SECTOR_SHIFT;
+		limits->logical_block_size >> SECTOR_SHIFT;
 
 	/*
 	 * Offset of the start of the next table entry, mod logical_block_size.
@@ -697,6 +659,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table)
 	unsigned short remaining = 0;
 
 	struct dm_target *uninitialized_var(ti);
+	struct queue_limits ti_limits;
 	unsigned i = 0;
 
 	/*
@@ -705,12 +668,19 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table)
 	while (i < dm_table_get_num_targets(table)) {
 		ti = dm_table_get_target(table, i++);
 
+		blk_set_default_limits(&ti_limits);
+
+		/* combine all target devices' limits */
+		if (ti->type->iterate_devices)
+			ti->type->iterate_devices(ti, dm_set_device_limits,
+						  &ti_limits);
+
 		/*
 		 * If the remaining sectors fall entirely within this
 		 * table entry are they compatible with its logical_block_size?
 		 */
 		if (remaining < ti->len &&
-		    remaining & ((ti->limits.logical_block_size >>
+		    remaining & ((ti_limits.logical_block_size >>
 				  SECTOR_SHIFT) - 1))
 			break;	/* Error */
 
@@ -723,11 +693,11 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table)
 
 	if (remaining) {
 		DMWARN("%s: table line %u (start sect %llu len %llu) "
-		       "not aligned to hardware logical block size %hu",
+		       "not aligned to h/w logical block size %hu",
 		       dm_device_name(table->md), i,
 		       (unsigned long long) ti->begin,
 		       (unsigned long long) ti->len,
-		       table->limits.logical_block_size);
+		       limits->logical_block_size);
 		return -EINVAL;
 	}
 
@@ -786,12 +756,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-	if (blk_stack_limits(&t->limits, &tgt->limits, 0) < 0)
-		DMWARN("%s: target device (start sect %llu len %llu) "
-		       "is misaligned",
-		       dm_device_name(t->md),
-		       (unsigned long long) tgt->begin,
-		       (unsigned long long) tgt->len);
 	return 0;
 
  bad:
@@ -834,12 +798,6 @@ int dm_table_complete(struct dm_table *t)
 	int r = 0;
 	unsigned int leaf_nodes;
 
-	init_valid_queue_limits(&t->limits);
-
-	r = validate_hardware_logical_block_alignment(t);
-	if (r)
-		return r;
-
 	/* how many indexes will the btree have ? */
 	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
 	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -914,6 +872,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+/*
+ * Establish the new table's queue_limits and validate them.
+ */
+int dm_calculate_queue_limits(struct dm_table *table,
+			      struct queue_limits *limits)
+{
+	struct dm_target *uninitialized_var(ti);
+	struct queue_limits ti_limits;
+	unsigned i = 0;
+
+	blk_set_default_limits(limits);
+
+	while (i < dm_table_get_num_targets(table)) {
+		blk_set_default_limits(&ti_limits);
+
+		ti = dm_table_get_target(table, i++);
+
+		if (!ti->type->iterate_devices)
+			goto combine_limits;
+
+		/*
+		 * Combine queue limits of all the devices this target uses.
+		 */
+		ti->type->iterate_devices(ti, dm_set_device_limits,
+					  &ti_limits);
+
+		/*
+		 * Check each device area is consistent with the target's
+		 * overall queue limits.
+		 */
+		if (!ti->type->iterate_devices(ti, device_area_is_valid,
+					       &ti_limits))
+			return -EINVAL;
+
+combine_limits:
+		/*
+		 * Merge this target's queue limits into the overall limits
+		 * for the table.
+		 */
+		if (blk_stack_limits(limits, &ti_limits, 0) < 0)
+			DMWARN("%s: target device "
+			       "(start sect %llu len %llu) "
+			       "is misaligned",
+			       dm_device_name(table->md),
+			       (unsigned long long) ti->begin,
+			       (unsigned long long) ti->len);
+	}
+
+	return validate_hardware_logical_block_alignment(table, limits);
+}
+
 /*
  * Set the integrity profile for this device if all devices used have
  * matching profiles.
@@ -953,14 +962,24 @@ no_integrity:
 	return;
 }
 
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+			       struct queue_limits *limits)
 {
+	/*
+	 * Each target device in the table has a data area that should normally
+	 * be aligned such that the DM device's alignment_offset is 0.
+	 * FIXME: Propagate alignment_offsets up the stack and warn of
+	 *	  sub-optimal or inconsistent settings.
+	 */
+	limits->alignment_offset = 0;
+	limits->misaligned = 0;
+
 	/*
 	 * Copy table's limits to the DM device's request_queue
 	 */
-	q->limits = t->limits;
+	q->limits = *limits;
 
-	if (t->limits.no_cluster)
+	if (limits->no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a9210bb594e7..f609793a92d0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1313,7 +1313,8 @@ static void __set_size(struct mapped_device *md, sector_t size)
 	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
-static int __bind(struct mapped_device *md, struct dm_table *t)
+static int __bind(struct mapped_device *md, struct dm_table *t,
+		  struct queue_limits *limits)
 {
 	struct request_queue *q = md->queue;
 	sector_t size;
@@ -1337,7 +1338,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 
 	write_lock(&md->map_lock);
 	md->map = t;
-	dm_table_set_restrictions(t, q);
+	dm_table_set_restrictions(t, q, limits);
 	write_unlock(&md->map_lock);
 
 	return 0;
@@ -1562,6 +1563,7 @@ static void dm_queue_flush(struct mapped_device *md)
  */
 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
+	struct queue_limits limits;
 	int r = -EINVAL;
 
 	mutex_lock(&md->suspend_lock);
@@ -1570,8 +1572,12 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	if (!dm_suspended(md))
 		goto out;
 
+	r = dm_calculate_queue_limits(table, &limits);
+	if (r)
+		goto out;
+
 	__unbind(md);
-	r = __bind(md, table);
+	r = __bind(md, table, &limits);
 
 out:
 	mutex_unlock(&md->suspend_lock);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b5935c610c44..604e85caadf6 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -41,7 +41,10 @@ void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
+int dm_calculate_queue_limits(struct dm_table *table,
+			      struct queue_limits *limits);
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+			       struct queue_limits *limits);
 struct list_head *dm_table_get_devices(struct dm_table *t);
 void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index deac3b4e5e18..e6bf3b8c7bf2 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -103,7 +103,8 @@ void dm_error(const char *message);
 /*
  * Combine device limits.
  */
-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev);
+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+			 sector_t start, void *data);
 
 struct dm_dev {
 	struct block_device *bdev;
@@ -163,7 +164,6 @@ struct dm_target {
 	sector_t begin;
 	sector_t len;
 
-	/* FIXME: turn this into a mask, and merge with queue_limits */
 	/* Always a power of 2 */
 	sector_t split_io;
 
@@ -177,12 +177,6 @@ struct dm_target {
 	 */
 	unsigned num_flush_requests;
 
-	/*
-	 * These are automatically filled in by
-	 * dm_table_get_device.
-	 */
-	struct queue_limits limits;
-
 	/* target specific data */
 	void *private;
 
-- 
cgit v1.2.3-71-gd317


From f5db4af466e2dca0fe822019812d586ca910b00c Mon Sep 17 00:00:00 2001
From: Jonthan Brassow <jbrassow@redhat.com>
Date: Mon, 22 Jun 2009 10:12:35 +0100
Subject: dm raid1: add userspace log

This patch contains a device-mapper mirror log module that forwards
requests to userspace for processing.

The structures used for communication between kernel and userspace are
located in include/linux/dm-log-userspace.h.  Due to the frequency,
diversity, and 2-way communication nature of the exchanges between
kernel and userspace, 'connector' was chosen as the interface for
communication.

The first log implementations written in userspace - "clustered-disk"
and "clustered-core" - support clustered shared storage.   A userspace
daemon (in the LVM2 source code repository) uses openAIS/corosync to
process requests in an ordered fashion with the rest of the nodes in the
cluster so as to prevent log state corruption.  Other implementations
with no association to LVM or openAIS/corosync, are certainly possible.

(Imagine if two machines are writing to the same region of a mirror.
They would both mark the region dirty, but you need a cluster-aware
entity that can handle properly marking the region clean when they are
done.  Otherwise, you might clear the region when the first machine is
done, not the second.)

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-log.txt |  54 +++
 drivers/md/Kconfig                     |  11 +
 drivers/md/Makefile                    |   3 +
 drivers/md/dm-log-userspace-base.c     | 696 +++++++++++++++++++++++++++++++++
 drivers/md/dm-log-userspace-transfer.c | 276 +++++++++++++
 drivers/md/dm-log-userspace-transfer.h |  18 +
 include/linux/Kbuild                   |   1 +
 include/linux/connector.h              |   4 +-
 include/linux/dm-log-userspace.h       | 386 ++++++++++++++++++
 9 files changed, 1448 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/device-mapper/dm-log.txt
 create mode 100644 drivers/md/dm-log-userspace-base.c
 create mode 100644 drivers/md/dm-log-userspace-transfer.c
 create mode 100644 drivers/md/dm-log-userspace-transfer.h
 create mode 100644 include/linux/dm-log-userspace.h

(limited to 'include/linux')

diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt
new file mode 100644
index 000000000000..994dd75475a6
--- /dev/null
+++ b/Documentation/device-mapper/dm-log.txt
@@ -0,0 +1,54 @@
+Device-Mapper Logging
+=====================
+The device-mapper logging code is used by some of the device-mapper
+RAID targets to track regions of the disk that are not consistent.
+A region (or portion of the address space) of the disk may be
+inconsistent because a RAID stripe is currently being operated on or
+a machine died while the region was being altered.  In the case of
+mirrors, a region would be considered dirty/inconsistent while you
+are writing to it because the writes need to be replicated for all
+the legs of the mirror and may not reach the legs at the same time.
+Once all writes are complete, the region is considered clean again.
+
+There is a generic logging interface that the device-mapper RAID
+implementations use to perform logging operations (see
+dm_dirty_log_type in include/linux/dm-dirty-log.h).  Various different
+logging implementations are available and provide different
+capabilities.  The list includes:
+
+Type		Files
+====		=====
+disk		drivers/md/dm-log.c
+core		drivers/md/dm-log.c
+userspace	drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
+
+The "disk" log type
+-------------------
+This log implementation commits the log state to disk.  This way, the
+logging state survives reboots/crashes.
+
+The "core" log type
+-------------------
+This log implementation keeps the log state in memory.  The log state
+will not survive a reboot or crash, but there may be a small boost in
+performance.  This method can also be used if no storage device is
+available for storing log state.
+
+The "userspace" log type
+------------------------
+This log type simply provides a way to export the log API to userspace,
+so log implementations can be done there.  This is done by forwarding most
+logging requests to userspace, where a daemon receives and processes the
+request.
+
+The structures used for communication between kernel and userspace are
+located in include/linux/dm-log-userspace.h.  Due to the frequency,
+diversity, and 2-way communication nature of the exchanges between
+kernel and userspace, 'connector' is used as the interface for
+communication.
+
+There are currently two userspace log implementations that leverage this
+framework - "clustered_disk" and "clustered_core".  These implementations
+provide a cluster-coherent log for shared-storage.  Device-mapper mirroring
+can be used in a shared-storage environment when the cluster log implementations
+are employed.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 09f93fa68912..020f9573fd82 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -231,6 +231,17 @@ config DM_MIRROR
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
+config DM_LOG_USERSPACE
+	tristate "Mirror userspace logging (EXPERIMENTAL)"
+	depends on DM_MIRROR && EXPERIMENTAL && NET
+	select CONNECTOR
+	---help---
+	  The userspace logging module provides a mechanism for
+	  relaying the dm-dirty-log API to userspace.  Log designs
+	  which are more suited to userspace implementation (e.g.
+	  shared storage logs) or experimental logs can be implemented
+	  by leveraging this framework.
+
 config DM_ZERO
 	tristate "Zero target"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index dade52f60733..1dc4185bd781 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,6 +8,8 @@ dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
 dm-mirror-y	+= dm-raid1.o
+dm-log-userspace-y \
+		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
 raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
@@ -40,6 +42,7 @@ obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
+obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
 quiet_cmd_unroll = UNROLL  $@
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000000..e69b96560997
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/bio.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+struct flush_entry {
+	int type;
+	region_t region;
+	struct list_head list;
+};
+
+struct log_c {
+	struct dm_target *ti;
+	uint32_t region_size;
+	region_t region_count;
+	char uuid[DM_UUID_LEN];
+
+	char *usr_argv_str;
+	uint32_t usr_argc;
+
+	/*
+	 * in_sync_hint gets set when doing is_remote_recovering.  It
+	 * represents the first region that needs recovery.  IOW, the
+	 * first zero bit of sync_bits.  This can be useful to limit
+	 * traffic for calls like is_remote_recovering and get_resync_work,
+	 * but take care in its use for anything else.
+	 */
+	uint64_t in_sync_hint;
+
+	spinlock_t flush_lock;
+	struct list_head flush_list;  /* only for clear and mark requests */
+};
+
+static mempool_t *flush_entry_pool;
+
+static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct flush_entry), gfp_mask);
+}
+
+static void flush_entry_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+static int userspace_do_request(struct log_c *lc, const char *uuid,
+				int request_type, char *data, size_t data_size,
+				char *rdata, size_t *rdata_size)
+{
+	int r;
+
+	/*
+	 * If the server isn't there, -ESRCH is returned,
+	 * and we must keep trying until the server is
+	 * restored.
+	 */
+retry:
+	r = dm_consult_userspace(uuid, request_type, data,
+				 data_size, rdata, rdata_size);
+
+	if (r != -ESRCH)
+		return r;
+
+	DMERR(" Userspace log server not found.");
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(2*HZ);
+		DMWARN("Attempting to contact userspace log server...");
+		r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
+					 strlen(lc->usr_argv_str) + 1,
+					 NULL, NULL);
+		if (!r)
+			break;
+	}
+	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
+	r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
+				 0, NULL, NULL);
+	if (!r)
+		goto retry;
+
+	DMERR("Error trying to resume userspace log: %d", r);
+
+	return -ESRCH;
+}
+
+/*
+ * Join argv[] with spaces and append ti->len, storing the kzalloc'ed string
+ * in *ctr_str.  Returns the string length (excluding the NUL) or -ENOMEM.
+ */
+static int build_constructor_string(struct dm_target *ti, unsigned argc,
+				    char **argv, char **ctr_str)
+{
+	int i, str_size;
+	char *str = NULL;
+
+	*ctr_str = NULL;
+	for (i = 0, str_size = 0; i < argc; i++)
+		str_size += strlen(argv[i]) + 1; /* +1 for space between args */
+	str_size += 21; /* Max chars in a printed u64 number, plus the NUL */
+
+	str = kzalloc(str_size, GFP_KERNEL);
+	if (!str) {
+		DMWARN("Unable to allocate memory for constructor string");
+		return -ENOMEM;
+	}
+
+	for (i = 0, str_size = 0; i < argc; i++)
+		str_size += sprintf(str + str_size, "%s ", argv[i]);
+	str_size += sprintf(str + str_size, "%llu",
+			    (unsigned long long)ti->len);
+	*ctr_str = str;
+	return str_size;
+}
+
+/*
+ * userspace_ctr
+ *
+ * argv contains:
+ *	<UUID> <other args>
+ * Where 'other args' is the userspace implementation specific log
+ * arguments.  An example might be:
+ *	<UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
+ *
+ * So, this module will strip off the <UUID> for identification purposes
+ * when communicating with userspace about a log; but will pass on everything
+ * else.
+ */
+static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+			 unsigned argc, char **argv)
+{
+	int r = 0;
+	int str_size;
+	char *ctr_str = NULL;
+	struct log_c *lc = NULL;
+	uint64_t rdata;
+	size_t rdata_size = sizeof(rdata);
+
+	if (argc < 3) {
+		DMWARN("Too few arguments to userspace dirty log");
+		return -EINVAL;
+	}
+
+	lc = kzalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
+		DMWARN("Unable to allocate userspace log context.");
+		return -ENOMEM;
+	}
+
+	lc->ti = ti;
+
+	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
+		DMWARN("UUID argument too long.");
+		kfree(lc);
+		return -EINVAL;
+	}
+
+	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+	spin_lock_init(&lc->flush_lock);
+	INIT_LIST_HEAD(&lc->flush_list);
+
+	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
+	if (str_size < 0) {
+		kfree(lc);
+		return str_size;
+	}
+
+	/* Send table string; any failure here means there is no log to use */
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
+				 ctr_str, str_size, NULL, NULL);
+
+	if (r) {
+		DMERR("Userspace log server %s (%d)",
+		      (r == -ESRCH) ? "not found" : "failed to create log", r);
+		goto out;
+	}
+
+	/* Since the region size does not change, get it now */
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
+				 NULL, 0, (char *)&rdata, &rdata_size);
+
+	if (r) {
+		DMERR("Failed to get region size of dirty log");
+		goto out;
+	}
+
+	lc->region_size = (uint32_t)rdata;
+	lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
+
+out:
+	if (r) {
+		kfree(lc);
+		kfree(ctr_str);
+	} else {
+		lc->usr_argv_str = ctr_str;
+		lc->usr_argc = argc;
+		log->context = lc;
+	}
+
+	return r;
+}
+
+static void userspace_dtr(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
+				 NULL, 0,
+				 NULL, NULL);
+
+	kfree(lc->usr_argv_str);
+	kfree(lc);
+
+	return;
+}
+
+static int userspace_presuspend(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static int userspace_postsuspend(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static int userspace_resume(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	lc->in_sync_hint = 0;
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
+{
+	struct log_c *lc = log->context;
+
+	return lc->region_size;
+}
+
+/*
+ * userspace_is_clean
+ *
+ * Check whether a region is clean.  If there is any sort of
+ * failure when consulting the server, we return not clean.
+ *
+ * Returns: 1 if clean, 0 otherwise
+ */
+static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
+{
+	int r;
+	uint64_t region64 = (uint64_t)region;
+	int64_t is_clean;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+
+	rdata_size = sizeof(is_clean);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&is_clean, &rdata_size);
+
+	return (r) ? 0 : (int)is_clean;
+}
+
+/*
+ * userspace_in_sync
+ *
+ * Check if the region is in-sync.  If there is any sort
+ * of failure when consulting the server, we assume that
+ * the region is not in sync.
+ *
+ * If 'can_block' is not set, return -EWOULDBLOCK immediately
+ *
+ * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
+ */
+static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
+			     int can_block)
+{
+	int r;
+	uint64_t region64 = region;
+	int64_t in_sync;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+
+	/*
+	 * We can never respond directly - even if in_sync_hint is
+	 * set.  This is because another machine could see a device
+	 * failure and mark the region out-of-sync.  If we don't go
+	 * to userspace to ask, we might think the region is in-sync
+	 * and allow a read to pick up data that is stale.  (This is
+	 * very unlikely if a device actually fails; but it is very
+	 * likely if a connection to one device from one machine fails.)
+	 *
+	 * There still might be a problem if the mirror caches the region
+	 * state as in-sync... but then this call would not be made.  So,
+	 * that is a mirror problem.
+	 */
+	if (!can_block)
+		return -EWOULDBLOCK;
+
+	rdata_size = sizeof(in_sync);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&in_sync, &rdata_size);
+	return (r) ? 0 : (int)in_sync;
+}
+
+/*
+ * userspace_flush
+ *
+ * This function is ok to block.
+ * The flush happens in two stages.  First, it sends all
+ * clear/mark requests that are on the list.  Then it
+ * tells the server to commit them.  This gives the
+ * server a chance to optimise the commit, instead of
+ * doing it for every request.
+ *
+ * Additionally, we could implement another thread that
+ * sends the requests up to the server - reducing the
+ * load on flush.  Then the flush would have less in
+ * the list and be responsible for the finishing commit.
+ *
+ * Returns: 0 on success, < 0 on failure
+ */
+static int userspace_flush(struct dm_dirty_log *log)
+{
+	int r = 0;
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	LIST_HEAD(flush_list);
+	struct flush_entry *fe, *tmp_fe;
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	list_splice_init(&lc->flush_list, &flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	if (list_empty(&flush_list))
+		return 0;
+
+	/*
+	 * FIXME: Count up requests, group request types,
+	 * allocate memory to stick all requests in and
+	 * send to server in one go.  Failing the allocation,
+	 * do it one by one.
+	 */
+
+	list_for_each_entry(fe, &flush_list, list) {
+		r = userspace_do_request(lc, lc->uuid, fe->type,
+					 (char *)&fe->region,
+					 sizeof(fe->region),
+					 NULL, NULL);
+		if (r)
+			goto fail;
+	}
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+				 NULL, 0, NULL, NULL);
+
+fail:
+	/*
+	 * We can safely remove these entries, even if failure.
+	 * Calling code will receive an error and will know that
+	 * the log facility has failed.
+	 */
+	list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
+		list_del(&fe->list);
+		mempool_free(fe, flush_entry_pool);
+	}
+
+	if (r)
+		dm_table_event(lc->ti->table);
+
+	return r;
+}
+
+/*
+ * userspace_mark_region
+ *
+ * This function should avoid blocking unless absolutely required.
+ * (Memory allocation is valid for blocking.)
+ */
+static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
+{
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	struct flush_entry *fe;
+
+	/* Wait for an allocation, but _never_ fail */
+	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
+	BUG_ON(!fe);
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	fe->type = DM_ULOG_MARK_REGION;
+	fe->region = region;
+	list_add(&fe->list, &lc->flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	return;
+}
+
+/*
+ * userspace_clear_region
+ *
+ * This function must not block.
+ * So, the alloc can't block.  In the worst case, it is ok to
+ * fail.  It would simply mean we can't clear the region.
+ * Does nothing to current sync context, but does mean
+ * the region will be re-sync'ed on a reload of the mirror
+ * even though it is in-sync.
+ */
+static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
+{
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	struct flush_entry *fe;
+
+	/*
+	 * If we fail to allocate, we skip the clearing of
+	 * the region.  This doesn't hurt us in any way, except
+	 * to cause the region to be resync'ed when the
+	 * device is activated next time.
+	 */
+	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+	if (!fe) {
+		DMERR("Failed to allocate memory to clear region.");
+		return;
+	}
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	fe->type = DM_ULOG_CLEAR_REGION;
+	fe->region = region;
+	list_add(&fe->list, &lc->flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	return;
+}
+
+/*
+ * userspace_get_resync_work
+ *
+ * Get a region that needs recovery.  It is valid to return
+ * an error for this function.
+ *
+ * Returns: 1 if region filled, 0 if no work, <0 on error
+ */
+static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+	int r;
+	struct log_c *lc = log->context;
+	struct {
+		int64_t i; /* 64-bit for mix arch compatibility */
+		region_t r;
+	} pkg;
+	size_t rdata_size = sizeof(pkg);
+
+	if (lc->in_sync_hint >= lc->region_count)
+		return 0;
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
+				 NULL, 0, (char *)&pkg, &rdata_size);
+	if (r)
+		return r;
+	/* pkg is only valid once the request has succeeded */
+	*region = pkg.r;
+	return (int)pkg.i;
+}
+
+/*
+ * userspace_set_region_sync
+ *
+ * Set the sync status of a given region.  This function
+ * must not fail.
+ */
+static void userspace_set_region_sync(struct dm_dirty_log *log,
+				      region_t region, int in_sync)
+{
+	int r;
+	struct log_c *lc = log->context;
+	struct {
+		region_t r;
+		int64_t i;
+	} pkg;
+
+	pkg.r = region;
+	pkg.i = (int64_t)in_sync;
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+				 (char *)&pkg, sizeof(pkg),
+				 NULL, NULL);
+
+	/*
+	 * It would be nice to be able to report failures.
+	 * However, it is easy enough to detect and resolve.
+	 */
+	return;
+}
+
+/*
+ * userspace_get_sync_count
+ *
+ * If there is any sort of failure when consulting the server,
+ * we assume that the sync count is zero.
+ *
+ * Returns: sync count on success, 0 on failure
+ */
+static region_t userspace_get_sync_count(struct dm_dirty_log *log)
+{
+	int r;
+	size_t rdata_size;
+	uint64_t sync_count;
+	struct log_c *lc = log->context;
+
+	rdata_size = sizeof(sync_count);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
+				 NULL, 0,
+				 (char *)&sync_count, &rdata_size);
+
+	if (r)
+		return 0;
+
+	if (sync_count >= lc->region_count)
+		lc->in_sync_hint = lc->region_count;
+
+	return (region_t)sync_count;
+}
+
+/*
+ * userspace_status
+ *
+ * Returns: amount of space consumed
+ */
+static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
+			    char *result, unsigned maxlen)
+{
+	int r = 0;
+	size_t sz = (size_t)maxlen;
+	struct log_c *lc = log->context;
+
+	switch (status_type) {
+	case STATUSTYPE_INFO:
+		r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
+					 NULL, 0,
+					 result, &sz);
+
+		if (r) {
+			sz = 0;
+			DMEMIT("%s 1 COM_FAILURE", log->type->name);
+		}
+		break;
+	case STATUSTYPE_TABLE:
+		sz = 0;
+		DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
+		       lc->uuid, lc->usr_argv_str);
+		break;
+	}
+	return (r) ? 0 : (int)sz;
+}
+
+/*
+ * userspace_is_remote_recovering
+ *
+ * Returns: 1 if region recovering, 0 otherwise
+ */
+static int userspace_is_remote_recovering(struct dm_dirty_log *log,
+					  region_t region)
+{
+	int r;
+	uint64_t region64 = region;
+	struct log_c *lc = log->context;
+	static unsigned long last_query;
+	struct {
+		int64_t is_recovering;
+		uint64_t in_sync_hint;
+	} pkg;
+	size_t rdata_size = sizeof(pkg);
+
+	/*
+	 * Once the mirror has been reported to be in-sync,
+	 * it will never again ask for recovery work.  So,
+	 * we can safely say there is not a remote machine
+	 * recovering if the device is in-sync.  (in_sync_hint
+	 * must be reset at resume time.)
+	 */
+	if (region < lc->in_sync_hint)
+		return 0;
+	else if (jiffies - last_query < HZ / 4)
+		return 1;	/* throttled; wrap-safe unsigned delta */
+
+	last_query = jiffies;
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&pkg, &rdata_size);
+	if (r)
+		return 1;
+
+	lc->in_sync_hint = pkg.in_sync_hint;
+
+	return (int)pkg.is_recovering;
+}
+
+static struct dm_dirty_log_type _userspace_type = {
+	.name = "userspace",
+	.module = THIS_MODULE,
+	.ctr = userspace_ctr,
+	.dtr = userspace_dtr,
+	.presuspend = userspace_presuspend,
+	.postsuspend = userspace_postsuspend,
+	.resume = userspace_resume,
+	.get_region_size = userspace_get_region_size,
+	.is_clean = userspace_is_clean,
+	.in_sync = userspace_in_sync,
+	.flush = userspace_flush,
+	.mark_region = userspace_mark_region,
+	.clear_region = userspace_clear_region,
+	.get_resync_work = userspace_get_resync_work,
+	.set_region_sync = userspace_set_region_sync,
+	.get_sync_count = userspace_get_sync_count,
+	.status = userspace_status,
+	.is_remote_recovering = userspace_is_remote_recovering,
+};
+
+static int __init userspace_dirty_log_init(void)
+{
+	int r = 0;
+
+	flush_entry_pool = mempool_create(100, flush_entry_alloc,
+					  flush_entry_free, NULL);
+
+	if (!flush_entry_pool) {
+		DMWARN("Unable to create flush_entry_pool:  No memory.");
+		return -ENOMEM;
+	}
+
+	r = dm_ulog_tfr_init();
+	if (r) {
+		DMWARN("Unable to initialize userspace log communications");
+		mempool_destroy(flush_entry_pool);
+		return r;
+	}
+
+	r = dm_dirty_log_type_register(&_userspace_type);
+	if (r) {
+		DMWARN("Couldn't register userspace dirty log type");
+		dm_ulog_tfr_exit();
+		mempool_destroy(flush_entry_pool);
+		return r;
+	}
+
+	DMINFO("version 1.0.0 loaded");
+	return 0;
+}
+
+static void __exit userspace_dirty_log_exit(void)
+{
+	dm_dirty_log_type_unregister(&_userspace_type);
+	dm_ulog_tfr_exit();
+	mempool_destroy(flush_entry_pool);
+
+	DMINFO("version 1.0.0 unloaded");
+	return;
+}
+
+module_init(userspace_dirty_log_init);
+module_exit(userspace_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
+MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
new file mode 100644
index 000000000000..0ca1ee768a1f
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/workqueue.h>
+#include <linux/connector.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+static uint32_t dm_ulog_seq;
+
+/*
+ * Netlink/Connector is an unreliable protocol.  How long should
+ * we wait for a response before assuming it was lost and retrying?
+ * (If we do receive a response after this time, it will be discarded
+ * and the response to the resent request will be waited for.)
+ */
+#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
+
+/*
+ * Pre-allocated space for speed
+ */
+#define DM_ULOG_PREALLOCED_SIZE 512
+static struct cn_msg *prealloced_cn_msg;
+static struct dm_ulog_request *prealloced_ulog_tfr;
+
+static struct cb_id ulog_cn_id = {
+	.idx = CN_IDX_DM,
+	.val = CN_VAL_DM_USERSPACE_LOG
+};
+
+static DEFINE_MUTEX(dm_ulog_lock);
+
+struct receiving_pkg {
+	struct list_head list;
+	struct completion complete;
+
+	uint32_t seq;
+
+	int error;
+	size_t *data_size;
+	char *data;
+};
+
+static DEFINE_SPINLOCK(receiving_list_lock);
+static struct list_head receiving_list;
+
+static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
+{
+	int r;
+	struct cn_msg *msg = prealloced_cn_msg;
+
+	memset(msg, 0, sizeof(struct cn_msg));
+
+	msg->id.idx = ulog_cn_id.idx;
+	msg->id.val = ulog_cn_id.val;
+	msg->ack = 0;
+	msg->seq = tfr->seq;
+	msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
+
+	r = cn_netlink_send(msg, 0, gfp_any());
+
+	return r;
+}
+
+/*
+ * Parameters for this function can be either msg or tfr, but not
+ * both.  This function fills in the reply for a waiting request.
+ * If just msg is given, then the reply is simply an ACK from userspace
+ * that the request was received.
+ *
+ * Returns: 0 on success, -ENOENT on failure
+ */
+static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
+{
+	uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
+	struct receiving_pkg *pkg;
+
+	/*
+	 * The 'receiving_pkg' entries in this list are statically
+	 * allocated on the stack in 'dm_consult_userspace'.
+	 * Each process that is waiting for a reply from the user
+	 * space server will have an entry in this list.
+	 *
+	 * We are safe to do it this way because the stack space
+	 * is unique to each process, but still addressable by
+	 * other processes.
+	 */
+	list_for_each_entry(pkg, &receiving_list, list) {
+		if (rtn_seq != pkg->seq)
+			continue;
+
+		if (msg) {
+			pkg->error = -msg->ack;
+			/*
+			 * If we are trying again, we will need to know our
+			 * storage capacity.  Otherwise, along with the
+			 * error code, we make explicit that we have no data.
+			 */
+			if (pkg->error != -EAGAIN)
+				*(pkg->data_size) = 0;
+		} else if (tfr->data_size > *(pkg->data_size)) {
+			DMERR("Insufficient space to receive package [%u] "
+			      "(%u vs %zu)", tfr->request_type,
+			      tfr->data_size, *(pkg->data_size));
+
+			*(pkg->data_size) = 0;
+			pkg->error = -ENOSPC;
+		} else {
+			pkg->error = tfr->error;
+			memcpy(pkg->data, tfr->data, tfr->data_size);
+			*(pkg->data_size) = tfr->data_size;
+		}
+		complete(&pkg->complete);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * This is the connector callback that delivers data
+ * that was sent from userspace.
+ */
+static void cn_ulog_callback(void *data)
+{
+	struct cn_msg *msg = (struct cn_msg *)data;
+	struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
+
+	spin_lock(&receiving_list_lock);
+	if (msg->len == 0)
+		fill_pkg(msg, NULL);
+	else if (msg->len < sizeof(*tfr))
+		DMERR("Incomplete message received (expected %u, got %u): [%u]",
+		      (unsigned)sizeof(*tfr), msg->len, msg->seq);
+	else
+		fill_pkg(NULL, tfr);
+	spin_unlock(&receiving_list_lock);
+}
+
+/**
+ * dm_consult_userspace
+ * @uuid: log's uuid (must be DM_UUID_LEN in size)
+ * @request_type:  found in include/linux/dm-log-userspace.h
+ * @data: data to tx to the server
+ * @data_size: size of data in bytes
+ * @rdata: place to put return data from server
+ * @rdata_size: value-result (amount of space given/amount of space used)
+ *
+ * rdata_size is undefined on failure.
+ *
+ * Memory used to communicate with userspace is zero'ed
+ * before populating to ensure that no unwanted bits leak
+ * from kernel space to user-space.  All userspace log communications
+ * between kernel and user space go through this function.
+ *
+ * Returns: 0 on success, -EXXX on failure
+ **/
+int dm_consult_userspace(const char *uuid, int request_type,
+			 char *data, size_t data_size,
+			 char *rdata, size_t *rdata_size)
+{
+	int r = 0;
+	size_t dummy = 0;
+	int overhead_size =
+		sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
+	struct dm_ulog_request *tfr = prealloced_ulog_tfr;
+	struct receiving_pkg pkg;
+
+	if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
+		DMINFO("Size of tfr exceeds preallocated size");
+		return -EINVAL;
+	}
+
+	if (!rdata_size)
+		rdata_size = &dummy;
+resend:
+	/*
+	 * We serialize the sending of requests so we can
+	 * use the preallocated space.
+	 */
+	mutex_lock(&dm_ulog_lock);
+
+	memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
+	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+	tfr->seq = dm_ulog_seq++;
+
+	/*
+	 * Must be valid request type (all other bits set to
+	 * zero).  This reserves other bits for possible future
+	 * use.
+	 */
+	tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
+
+	tfr->data_size = data_size;
+	if (data && data_size)
+		memcpy(tfr->data, data, data_size);
+
+	memset(&pkg, 0, sizeof(pkg));
+	init_completion(&pkg.complete);
+	pkg.seq = tfr->seq;
+	pkg.data_size = rdata_size;
+	pkg.data = rdata;
+	spin_lock(&receiving_list_lock);
+	list_add(&(pkg.list), &receiving_list);
+	spin_unlock(&receiving_list_lock);
+
+	r = dm_ulog_sendto_server(tfr);
+
+	mutex_unlock(&dm_ulog_lock);
+
+	if (r) {
+		DMERR("Unable to send log request [%u] to userspace: %d",
+		      request_type, r);
+		spin_lock(&receiving_list_lock);
+		list_del_init(&(pkg.list));
+		spin_unlock(&receiving_list_lock);
+
+		goto out;
+	}
+
+	r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+	spin_lock(&receiving_list_lock);
+	list_del_init(&(pkg.list));
+	spin_unlock(&receiving_list_lock);
+	if (!r) {
+		DMWARN("[%s] Request timed out: [%u/%u] - retrying",
+		       (strlen(uuid) > 8) ?
+		       (uuid + (strlen(uuid) - 8)) : (uuid),
+		       request_type, pkg.seq);
+		goto resend;
+	}
+
+	r = pkg.error;
+	if (r == -EAGAIN)
+		goto resend;
+
+out:
+	return r;
+}
+
+int dm_ulog_tfr_init(void)
+{
+	int r;
+	void *prealloced;
+
+	INIT_LIST_HEAD(&receiving_list);
+
+	prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
+	if (!prealloced)
+		return -ENOMEM;
+
+	prealloced_cn_msg = prealloced;
+	prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
+
+	r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
+	if (r) {
+		kfree(prealloced);	/* callback was not registered */
+		return r;
+	}
+
+	return 0;
+}
+
+void dm_ulog_tfr_exit(void)
+{
+	cn_del_callback(&ulog_cn_id);
+	kfree(prealloced_cn_msg);
+}
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
new file mode 100644
index 000000000000..c26d8e4e2710
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
+#define __DM_LOG_USERSPACE_TRANSFER_H__
+
+#define DM_MSG_PREFIX "dm-log-userspace"
+
+int dm_ulog_tfr_init(void);
+void dm_ulog_tfr_exit(void);
+int dm_consult_userspace(const char *uuid, int request_type,
+			 char *data, size_t data_size,
+			 char *rdata, size_t *rdata_size);
+
+#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 03f22076381f..334a3593cdfd 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -57,6 +57,7 @@ header-y += dlmconstants.h
 header-y += dlm_device.h
 header-y += dlm_netlink.h
 header-y += dm-ioctl.h
+header-y += dm-log-userspace.h
 header-y += dn.h
 header-y += dqblk_xfs.h
 header-y += efs_fs_sb.h
diff --git a/include/linux/connector.h b/include/linux/connector.h
index b9966e64604e..b68d27850d51 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -41,8 +41,10 @@
 #define CN_IDX_BB			0x5	/* BlackBoard, from the TSP GPL sampling framework */
 #define CN_DST_IDX			0x6
 #define CN_DST_VAL			0x1
+#define CN_IDX_DM			0x7	/* Device Mapper */
+#define CN_VAL_DM_USERSPACE_LOG		0x1
 
-#define CN_NETLINK_USERS		7
+#define CN_NETLINK_USERS		8
 
 /*
  * Maximum connector's message size.
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
new file mode 100644
index 000000000000..642e3017b51f
--- /dev/null
+++ b/include/linux/dm-log-userspace.h
@@ -0,0 +1,386 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef __DM_LOG_USERSPACE_H__
+#define __DM_LOG_USERSPACE_H__
+
+#include <linux/dm-ioctl.h> /* For DM_UUID_LEN */
+
+/*
+ * The device-mapper userspace log module consists of a kernel component and
+ * a user-space component.  The kernel component implements the API defined
+ * in dm-dirty-log.h.  Its purpose is simply to pass the parameters and
+ * return values of those API functions between kernel and user-space.
+ *
+ * Below are defined the 'request_types' - DM_ULOG_CTR, DM_ULOG_DTR, etc.
+ * These request types represent the different functions in the device-mapper
+ * dirty log API.  Each of these is described in more detail below.
+ *
+ * The user-space program must listen for requests from the kernel (representing
+ * the various API functions) and process them.
+ *
+ * User-space begins by setting up the communication link (error checking
+ * removed for clarity):
+ *	fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+ *	addr.nl_family = AF_NETLINK;
+ *	addr.nl_groups = CN_IDX_DM;
+ *	addr.nl_pid = 0;
+ *	r = bind(fd, (struct sockaddr *) &addr, sizeof(addr));
+ *	opt = addr.nl_groups;
+ *	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &opt, sizeof(opt));
+ *
+ * User-space will then wait to receive requests from the kernel, which it
+ * will process as described below.  The requests are received in the form,
+ * ((struct dm_ulog_request) + (additional data)).  Depending on the request
+ * type, there may or may not be 'additional data'.  In the descriptions below,
+ * you will see 'Payload-to-userspace' and 'Payload-to-kernel'.  The
+ * 'Payload-to-userspace' is what the kernel sends in 'additional data' as
+ * necessary parameters to complete the request.  The 'Payload-to-kernel' is
+ * the 'additional data' returned to the kernel that contains the necessary
+ * results of the request.  The 'data_size' field in the dm_ulog_request
+ * structure denotes the availability and amount of payload data.
+ */
+
+/*
+ * DM_ULOG_CTR corresponds to (found in dm-dirty-log.h):
+ * int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti,
+ *	      unsigned argc, char **argv);
+ *
+ * Payload-to-userspace:
+ *	A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is the reference that
+ * will be used by all request types to a specific log.  The constructor must
+ * record this association with the instance created.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_CTR                    1
+
+/*
+ * DM_ULOG_DTR corresponds to (found in dm-dirty-log.h):
+ * void (*dtr)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being destroyed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_DTR                    2
+
+/*
+ * DM_ULOG_PRESUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*presuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being presuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_PRESUSPEND             3
+
+/*
+ * DM_ULOG_POSTSUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*postsuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being postsuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_POSTSUSPEND            4
+
+/*
+ * DM_ULOG_RESUME corresponds to (found in dm-dirty-log.h):
+ * int (*resume)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being resumed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_RESUME                 5
+
+/*
+ * DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h):
+ * uint32_t (*get_region_size)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	uint64_t - contains the region size
+ *
+ * The region size is something that was determined at constructor time.
+ * It is returned in the payload area and 'data_size' is set to
+ * reflect this.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
+ */
+#define DM_ULOG_GET_REGION_SIZE        6
+
+/*
+ * DM_ULOG_IS_CLEAN corresponds to (found in dm-dirty-log.h):
+ * int (*is_clean)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t - the region to get clean status on
+ * Payload-to-kernel:
+ *	int64_t  - 1 if clean, 0 otherwise
+ *
+ * Payload is sizeof(uint64_t) and contains the region for which the clean
+ * status is being requested.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - filling the payload with 0 (not clean) or
+ * 1 (clean), setting 'data_size' and 'error' appropriately.
+ */
+#define DM_ULOG_IS_CLEAN               7
+
+/*
+ * DM_ULOG_IN_SYNC corresponds to (found in dm-dirty-log.h):
+ * int (*in_sync)(struct dm_dirty_log *log, region_t region,
+ *		  int can_block);
+ *
+ * Payload-to-userspace:
+ *	uint64_t - the region to get sync status on
+ * Payload-to-kernel:
+ *	int64_t - 1 if in-sync, 0 otherwise
+ *
+ * Exactly the same as 'is_clean' above, except this time asking "has the
+ * region been recovered?" vs. "is the region not being modified?"
+ */
+#define DM_ULOG_IN_SYNC                8
+
+/*
+ * DM_ULOG_FLUSH corresponds to (found in dm-dirty-log.h):
+ * int (*flush)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * No incoming or outgoing payload.  Simply flush log state to disk.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_FLUSH                  9
+
+/*
+ * DM_ULOG_MARK_REGION corresponds to (found in dm-dirty-log.h):
+ * void (*mark_region)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t [] - region(s) to mark
+ * Payload-to-kernel:
+ *	None.
+ *
+ * Incoming payload contains the one or more regions to mark dirty.
+ * The number of regions contained in the payload can be determined from
+ * 'data_size/sizeof(uint64_t)'.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_MARK_REGION           10
+
+/*
+ * DM_ULOG_CLEAR_REGION corresponds to (found in dm-dirty-log.h):
+ * void (*clear_region)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t [] - region(s) to clear
+ * Payload-to-kernel:
+ *	None.
+ *
+ * Incoming payload contains the one or more regions to mark clean.
+ * The number of regions contained in the payload can be determined from
+ * 'data_size/sizeof(uint64_t)'.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_CLEAR_REGION          11
+
+/*
+ * DM_ULOG_GET_RESYNC_WORK corresponds to (found in dm-dirty-log.h):
+ * int (*get_resync_work)(struct dm_dirty_log *log, region_t *region);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	{
+ *		int64_t i; -- 1 if recovery necessary, 0 otherwise
+ *		uint64_t r; -- The region to recover if i=1
+ *	}
+ * 'data_size' should be set appropriately.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
+ */
+#define DM_ULOG_GET_RESYNC_WORK       12
+
+/*
+ * DM_ULOG_SET_REGION_SYNC corresponds to (found in dm-dirty-log.h):
+ * void (*set_region_sync)(struct dm_dirty_log *log,
+ *			   region_t region, int in_sync);
+ *
+ * Payload-to-userspace:
+ *	{
+ *		uint64_t - region to set sync state on
+ *		int64_t  - 0 if not-in-sync, 1 if in-sync
+ *	}
+ * Payload-to-kernel:
+ *	None.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_SET_REGION_SYNC       13
+
+/*
+ * DM_ULOG_GET_SYNC_COUNT corresponds to (found in dm-dirty-log.h):
+ * region_t (*get_sync_count)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	uint64_t - the number of in-sync regions
+ *
+ * No incoming payload.  Kernel-bound payload contains the number of
+ * regions that are in-sync (in a uint64_t).
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_GET_SYNC_COUNT        14
+
+/*
+ * DM_ULOG_STATUS_INFO corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_INFO,
+ *		 char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	Character string containing STATUSTYPE_INFO
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_INFO           15
+
+/*
+ * DM_ULOG_STATUS_TABLE corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_TABLE,
+ *		 char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	Character string containing STATUSTYPE_TABLE
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_TABLE          16
+
+/*
+ * DM_ULOG_IS_REMOTE_RECOVERING corresponds to (found in dm-dirty-log.h):
+ * int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t - region to determine recovery status on
+ * Payload-to-kernel:
+ *	{
+ *		int64_t is_recovering;  -- 0 if no, 1 if yes
+ *		uint64_t in_sync_hint;  -- lowest region still needing resync
+ *	}
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_IS_REMOTE_RECOVERING  17
+
+/*
+ * (DM_ULOG_REQUEST_MASK & request_type) to get the request type
+ *
+ * Payload-to-userspace:
+ *	A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * We are reserving 8 bits of the 32-bit 'request_type' field for the
+ * various request types above.  The remaining 24-bits are currently
+ * set to zero and are reserved for future use and compatibility concerns.
+ *
+ * User-space should always use DM_ULOG_REQUEST_TYPE to acquire the
+ * request type from the 'request_type' field to maintain forward compatibility.
+ */
+#define DM_ULOG_REQUEST_MASK 0xFF
+#define DM_ULOG_REQUEST_TYPE(request_type) \
+	(DM_ULOG_REQUEST_MASK & (request_type))
+
+struct dm_ulog_request {
+	char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */
+	char padding[7];        /* Padding because DM_UUID_LEN = 129 */
+
+	int32_t error;          /* Used to report back processing errors */
+
+	uint32_t seq;           /* Sequence number for request */
+	uint32_t request_type;  /* DM_ULOG_* defined above */
+	uint32_t data_size;     /* How much data (not including this struct) */
+
+	char data[0];
+};
+
+#endif /* __DM_LOG_USERSPACE_H__ */
-- 
cgit v1.2.3-71-gd317


From cec47e3d4a861e1d942b3a580d0bbef2700d2bb2 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:35 +0100
Subject: dm: prepare for request based option

This patch adds core functions for request-based dm.

When struct mapped device (md) is initialized, md->queue has
an I/O scheduler and the following functions are used for
request-based dm as the queue functions:
    make_request_fn: dm_make_request()
    prep_fn:         dm_prep_fn()
    request_fn:      dm_request_fn()
    softirq_done_fn: dm_softirq_done()
    lld_busy_fn:     dm_lld_busy()
Actual initializations are done in another patch (PATCH 2).

Below is a brief summary of how request-based dm behaves, including:
  - making request from bio
  - cloning, mapping and dispatching request
  - completing request and bio
  - suspending md
  - resuming md

  bio to request
  ==============
  md->queue->make_request_fn() (dm_make_request()) calls __make_request()
  for a bio submitted to the md.
  Then, the bio is kept in the queue as a new request or merged into
  another request in the queue if possible.

  Cloning and Mapping
  ===================
  Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()),
  when requests are dispatched after they are sorted by the I/O scheduler.

  dm_request_fn() checks busy state of underlying devices using
  target's busy() function and stops dispatching requests to keep them
  on the dm device's queue if busy.
  It helps better I/O merging, since no merge is done for a request
  once it is dispatched to underlying devices.

  Actual cloning and mapping are done in dm_prep_fn() and map_request()
  called from dm_request_fn().
  dm_prep_fn() clones not only request but also bios of the request
  so that dm can hold bio completion in error cases and prevent
  the bio submitter from noticing the error.
  (See the "Completion" section below for details.)

  After the cloning, the clone is mapped by target's map_rq() function
    and inserted to underlying device's queue using
    blk_insert_cloned_request().

  Completion
  ==========
  Request completion can be hooked by rq->end_io(), but then, all bios
  in the request will have been completed even in error cases, and the bio
  submitter will have noticed the error.
  To prevent the bio completion in error cases, request-based dm clones
  both bio and request and hooks both bio->bi_end_io() and rq->end_io():
      bio->bi_end_io(): end_clone_bio()
      rq->end_io():     end_clone_request()

  Summary of the request completion flow is below:
  blk_end_request() for a clone request
    => blk_update_request()
       => bio->bi_end_io() == end_clone_bio() for each clone bio
          => Free the clone bio
          => Success: Complete the original bio (blk_update_request())
             Error:   Don't complete the original bio
    => blk_finish_request()
       => rq->end_io() == end_clone_request()
          => blk_complete_request()
             => dm_softirq_done()
                => Free the clone request
                => Success: Complete the original request (blk_end_request())
                   Error:   Requeue the original request

  end_clone_bio() completes the original request on the size of
  the original bio in successful cases.
  Even if all bios in the original request are completed by that
  completion, the original request must not be completed yet to keep
  the ordering of request completion for the stacking.
  So end_clone_bio() uses blk_update_request() instead of
  blk_end_request().
  In error cases, end_clone_bio() doesn't complete the original bio.
  It just frees the cloned bio and gives over the error handling to
  end_clone_request().

  end_clone_request(), which is called with queue lock held, completes
  the clone request and the original request in a softirq context
  (dm_softirq_done()), which has no queue lock, to avoid a deadlock
  issue on submission of another request during the completion:
      - The submitted request may be mapped to the same device
      - Request submission requires queue lock, but the queue lock
        has been held by itself and it doesn't know that

  The clone request has no clone bio when dm_softirq_done() is called.
  So target drivers can't resubmit it, even in error cases.
  Instead, they can ask dm core to requeue and remap
  the original request in those cases.

  suspend
  =======
  Request-based dm suspends the md by stopping md->queue.
  For noflush suspend, it just stops md->queue.

  For flush suspend, it inserts a marker request at the tail of md->queue
  and dispatches all requests in md->queue until the marker comes to
  the front of md->queue.  Then it stops dispatching requests and waits
  for all the dispatched requests to complete.
  After that, it completes the marker request, stops md->queue and
  wakes up the waiter on the suspend queue, md->wait.

  resume
  ======
  Starts md->queue.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c         |  14 +
 drivers/md/dm.c               | 705 +++++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.h               |   1 +
 include/linux/device-mapper.h |   9 +
 4 files changed, 725 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 09a57113955e..c5f784419f23 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1080,6 +1080,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	return r;
 }
 
+int dm_table_any_busy_target(struct dm_table *t)
+{
+	unsigned i;
+	struct dm_target *ti;
+
+	for (i = 0; i < t->num_targets; i++) {
+		ti = t->targets + i;
+		if (ti->type->busy && ti->type->busy(ti))
+			return 1;
+	}
+
+	return 0;
+}
+
 void dm_table_unplug_all(struct dm_table *t)
 {
 	struct dm_dev_internal *dd;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f609793a92d0..be003e5fea3d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -78,7 +78,7 @@ struct dm_rq_target_io {
  */
 struct dm_rq_clone_bio_info {
 	struct bio *orig;
-	struct request *rq;
+	struct dm_rq_target_io *tio;
 };
 
 union map_info *dm_get_mapinfo(struct bio *bio)
@@ -88,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 	return NULL;
 }
 
+union map_info *dm_get_rq_mapinfo(struct request *rq)
+{
+	if (rq && rq->end_io_data)
+		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
+
 #define MINOR_ALLOCED ((void *)-1)
 
 /*
@@ -169,6 +177,12 @@ struct mapped_device {
 	/* forced geometry settings */
 	struct hd_geometry geometry;
 
+	/* marker of flush suspend for request-based dm */
+	struct request suspend_rq;
+
+	/* For saving the address of __make_request for request based dm */
+	make_request_fn *saved_make_request_fn;
+
 	/* sysfs handle */
 	struct kobject kobj;
 
@@ -406,6 +420,26 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 	mempool_free(tio, md->tio_pool);
 }
 
+static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
+{
+	return mempool_alloc(md->tio_pool, GFP_ATOMIC);
+}
+
+static void free_rq_tio(struct dm_rq_target_io *tio)
+{
+	mempool_free(tio, tio->md->tio_pool);
+}
+
+static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
+{
+	return mempool_alloc(md->io_pool, GFP_ATOMIC);
+}
+
+static void free_bio_info(struct dm_rq_clone_bio_info *info)
+{
+	mempool_free(info, info->tio->md->io_pool);
+}
+
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
@@ -615,6 +649,262 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
+/*
+ * Partial completion handling for request-based dm
+ */
+static void end_clone_bio(struct bio *clone, int error)
+{
+	struct dm_rq_clone_bio_info *info = clone->bi_private;
+	struct dm_rq_target_io *tio = info->tio;
+	struct bio *bio = info->orig;
+	unsigned int nr_bytes = info->orig->bi_size;
+
+	bio_put(clone);
+
+	if (tio->error)
+		/*
+		 * An error has already been detected on the request.
+		 * Once error occurred, just let clone->end_io() handle
+		 * the remainder.
+		 */
+		return;
+	else if (error) {
+		/*
+		 * Don't notify the upper layer of the error yet.
+		 * The error handling decision is made by the target driver,
+		 * when the request is completed.
+		 */
+		tio->error = error;
+		return;
+	}
+
+	/*
+	 * I/O for the bio successfully completed.
+	 * Notify the upper layer of the data completion.
+	 */
+
+	/*
+	 * bios are processed from the head of the list.
+	 * So the completing bio should always be rq->bio.
+	 * If it's not, something wrong is happening.
+	 */
+	if (tio->orig->bio != bio)
+		DMERR("bio completion is going in the middle of the request");
+
+	/*
+	 * Update the original request.
+	 * Do not use blk_end_request() here, because it may complete
+	 * the original request before the clone, and break the ordering.
+	 */
+	blk_update_request(tio->orig, 0, nr_bytes);
+}
+
+/*
+ * Don't touch any member of the md after calling this function because
+ * the md may be freed in dm_put() at the end of this function.
+ * Or do dm_get() before calling this function and dm_put() later.
+ */
+static void rq_completed(struct mapped_device *md, int run_queue)
+{
+	int wakeup_waiters = 0;
+	struct request_queue *q = md->queue;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!queue_in_flight(q))
+		wakeup_waiters = 1;
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/* nudge anyone waiting on suspend queue */
+	if (wakeup_waiters)
+		wake_up(&md->wait);
+
+	if (run_queue)
+		blk_run_queue(q);
+
+	/*
+	 * dm_put() must be at the end of this function. See the comment above
+	 */
+	dm_put(md);
+}
+
+static void dm_unprep_request(struct request *rq)
+{
+	struct request *clone = rq->special;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	rq->special = NULL;
+	rq->cmd_flags &= ~REQ_DONTPREP;
+
+	blk_rq_unprep_clone(clone);
+	free_rq_tio(tio);
+}
+
+/*
+ * Requeue the original request of a clone.
+ */
+void dm_requeue_unmapped_request(struct request *clone)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	dm_unprep_request(rq);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (elv_queue_empty(q))
+		blk_plug_device(q);
+	blk_requeue_request(q, rq);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	rq_completed(md, 0);
+}
+EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
+
+static void __stop_queue(struct request_queue *q)
+{
+	blk_stop_queue(q);
+}
+
+static void stop_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	__stop_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void __start_queue(struct request_queue *q)
+{
+	if (blk_queue_stopped(q))
+		blk_start_queue(q);
+}
+
+static void start_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	__start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+/*
+ * Complete the clone and the original request.
+ * Must be called without queue lock.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+
+	if (blk_pc_request(rq)) {
+		rq->errors = clone->errors;
+		rq->resid_len = clone->resid_len;
+
+		if (rq->sense)
+			/*
+			 * We are using the sense buffer of the original
+			 * request.
+			 * So setting the length of the sense data is enough.
+			 */
+			rq->sense_len = clone->sense_len;
+	}
+
+	BUG_ON(clone->bio);
+	free_rq_tio(tio);
+
+	blk_end_request_all(rq, error);
+
+	rq_completed(md, 1);
+}
+
+/*
+ * Request completion handler for request-based dm
+ */
+static void dm_softirq_done(struct request *rq)
+{
+	struct request *clone = rq->completion_data;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
+	int error = tio->error;
+
+	if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
+		error = rq_end_io(tio->ti, clone, error, &tio->info);
+
+	if (error <= 0)
+		/* The target wants to complete the I/O */
+		dm_end_request(clone, error);
+	else if (error == DM_ENDIO_INCOMPLETE)
+		/* The target will handle the I/O */
+		return;
+	else if (error == DM_ENDIO_REQUEUE)
+		/* The target wants to requeue the I/O */
+		dm_requeue_unmapped_request(clone);
+	else {
+		DMWARN("unimplemented target endio return value: %d", error);
+		BUG();
+	}
+}
+
+/*
+ * Complete the clone and the original request with the error status
+ * through softirq context.
+ */
+static void dm_complete_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct request *rq = tio->orig;
+
+	tio->error = error;
+	rq->completion_data = clone;
+	blk_complete_request(rq);
+}
+
+/*
+ * Complete the not-mapped clone and the original request with the error status
+ * through softirq context.
+ * Target's rq_end_io() function isn't called.
+ * This may be used when the target's map_rq() function fails.
+ */
+void dm_kill_unmapped_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct request *rq = tio->orig;
+
+	rq->cmd_flags |= REQ_FAILED;
+	dm_complete_request(clone, error);
+}
+EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
+
+/*
+ * Called with the queue lock held
+ */
+static void end_clone_request(struct request *clone, int error)
+{
+	/*
+	 * For just cleaning up the information of the queue in which
+	 * the clone was dispatched.
+	 * The clone is *NOT* actually freed here because it is allocated from
+	 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
+	 */
+	__blk_put_request(clone->q, clone);
+
+	/*
+	 * Actual request completion is done in a softirq context which doesn't
+	 * hold the queue lock.  Otherwise, deadlock could occur because:
+	 *     - another request may be submitted by the upper level driver
+	 *       of the stacking during the completion
+	 *     - the submission which requires queue lock may be done
+	 *       against this queue
+	 */
+	dm_complete_request(clone, error);
+}
+
 static sector_t max_io_len(struct mapped_device *md,
 			   sector_t sector, struct dm_target *ti)
 {
@@ -998,7 +1288,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static int dm_request(struct request_queue *q, struct bio *bio)
+static int _dm_request(struct request_queue *q, struct bio *bio)
 {
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
@@ -1035,12 +1325,274 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return 0;
 }
 
+static int dm_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct mapped_device *md = q->queuedata;
+
+	if (unlikely(bio_barrier(bio))) {	/* barrier bios are refused */
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
+
+	return md->saved_make_request_fn(q, bio); /* call __make_request() */
+}
+
+static int dm_request_based(struct mapped_device *md)
+{
+	return blk_queue_stackable(md->queue);	/* nonzero: request-based dm */
+}
+
+static int dm_request(struct request_queue *q, struct bio *bio)
+{
+	struct mapped_device *md = q->queuedata;
+
+	if (dm_request_based(md))	/* request-based device */
+		return dm_make_request(q, bio);
+
+	return _dm_request(q, bio);	/* otherwise remap the bio directly */
+}
+
+void dm_dispatch_request(struct request *rq)
+{
+	int r;
+
+	if (blk_queue_io_stat(rq->q))	/* follow the queue's io-stat setting */
+		rq->cmd_flags |= REQ_IO_STAT;
+
+	rq->start_time = jiffies;
+	r = blk_insert_cloned_request(rq->q, rq);
+	if (r)				/* insertion failed: complete with r */
+		dm_complete_request(rq, r);
+}
+EXPORT_SYMBOL_GPL(dm_dispatch_request);
+
+static void dm_rq_bio_destructor(struct bio *bio)
+{
+	struct dm_rq_clone_bio_info *info = bio->bi_private;
+	struct mapped_device *md = info->tio->md; /* read before info is freed */
+
+	free_bio_info(info);
+	bio_free(bio, md->bs);
+}
+
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+				 void *data)
+{
+	struct dm_rq_target_io *tio = data;	/* passed in by setup_clone() */
+	struct mapped_device *md = tio->md;
+	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
+
+	if (!info)
+		return -ENOMEM;
+
+	info->orig = bio_orig;		/* remember the original bio */
+	info->tio = tio;
+	bio->bi_end_io = end_clone_bio;
+	bio->bi_private = info;		/* freed by dm_rq_bio_destructor() */
+	bio->bi_destructor = dm_rq_bio_destructor;
+
+	return 0;
+}
+
+static int setup_clone(struct request *clone, struct request *rq,
+		       struct dm_rq_target_io *tio)
+{
+	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+				  dm_rq_bio_constructor, tio);
+
+	if (r)				/* nothing else is touched on failure */
+		return r;
+
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
+	clone->buffer = rq->buffer;
+	clone->end_io = end_clone_request;	/* completion hook of the clone */
+	clone->end_io_data = tio;	/* lets the hooks find the tio again */
+
+	return 0;
+}
+
+static int dm_rq_flush_suspending(struct mapped_device *md)
+{
+	return !md->suspend_rq.special;	/* non-NULL ->special: invalidated */
+}
+
+/*
+ * Prepare rq by attaching a clone to it.  Called with the queue lock held.
+ */
+static int dm_prep_fn(struct request_queue *q, struct request *rq)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_rq_target_io *tio;
+	struct request *clone;
+
+	if (unlikely(rq == &md->suspend_rq)) {	/* the flush suspend marker */
+		if (dm_rq_flush_suspending(md))
+			return BLKPREP_OK;
+		else
+			/* The flush suspend was interrupted */
+			return BLKPREP_KILL;
+	}
+
+	if (unlikely(rq->special)) {
+		DMWARN("Already has something in rq->special.");
+		return BLKPREP_KILL;
+	}
+
+	tio = alloc_rq_tio(md); /* Only one for each original request */
+	if (!tio)
+		/* -ENOMEM */
+		return BLKPREP_DEFER;
+
+	tio->md = md;
+	tio->ti = NULL;			/* filled in by map_request() */
+	tio->orig = rq;
+	tio->error = 0;
+	memset(&tio->info, 0, sizeof(tio->info));
+
+	clone = &tio->clone;		/* the clone is embedded in the tio */
+	if (setup_clone(clone, rq, tio)) {
+		/* -ENOMEM */
+		free_rq_tio(tio);
+		return BLKPREP_DEFER;
+	}
+
+	rq->special = clone;		/* map_request() reads it from here */
+	rq->cmd_flags |= REQ_DONTPREP;
+
+	return BLKPREP_OK;
+}
+
+static void map_request(struct dm_target *ti, struct request *rq,
+			struct mapped_device *md)
+{
+	int r;
+	struct request *clone = rq->special;	/* attached by dm_prep_fn() */
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	/*
+	 * Hold the md reference here for the in-flight I/O.
+	 * We can't rely on the reference count by device opener,
+	 * because the device may be closed during the request completion
+	 * when all bios are completed.
+	 * See the comment in rq_completed() too.
+	 */
+	dm_get(md);
+
+	tio->ti = ti;
+	r = ti->type->map_rq(ti, clone, &tio->info);
+	switch (r) {
+	case DM_MAPIO_SUBMITTED:
+		/* The target has taken the I/O to submit by itself later */
+		break;
+	case DM_MAPIO_REMAPPED:
+		/* The target has remapped the I/O so dispatch it */
+		dm_dispatch_request(clone);
+		break;
+	case DM_MAPIO_REQUEUE:
+		/* The target wants to requeue the I/O */
+		dm_requeue_unmapped_request(clone);
+		break;
+	default:
+		if (r > 0) {		/* positive but not a known code */
+			DMWARN("unimplemented target map return value: %d", r);
+			BUG();
+		}
+
+		/* The target wants to complete the I/O */
+		dm_kill_unmapped_request(clone, r);
+		break;
+	}
+}
+
+/*
+ * q->request_fn for request-based dm.
+ * Called with the queue lock held.
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+	struct dm_target *ti;
+	struct request *rq;
+
+	/*
+	 * For noflush suspend, check blk_queue_stopped() to immediately
+	 * quit I/O dispatching.
+	 */
+	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+		rq = blk_peek_request(q);
+		if (!rq)
+			goto plug_and_out;
+
+		if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
+			if (queue_in_flight(q))
+				/* Not quiet yet.  Wait more */
+				goto plug_and_out;
+
+			/* This device should be quiet now */
+			__stop_queue(q);
+			blk_start_request(rq);
+			__blk_end_request_all(rq, 0);
+			wake_up(&md->wait);
+			goto out;
+		}
+
+		/* NOTE(review): ti is used without a validity check - confirm */
+		ti = dm_table_find_target(map, blk_rq_pos(rq));
+		if (ti->type->busy && ti->type->busy(ti))
+			goto plug_and_out;
+
+		blk_start_request(rq);
+		spin_unlock(q->queue_lock);	/* map without the queue lock */
+		map_request(ti, rq, md);
+		spin_lock_irq(q->queue_lock);
+	}
+
+	goto out;
+
+plug_and_out:
+	if (!elv_queue_empty(q))
+		/* Some requests still remain, retry later */
+		blk_plug_device(q);
+
+out:
+	dm_table_put(map);
+
+	return;
+}
+
+int dm_underlying_device_busy(struct request_queue *q)
+{
+	return blk_lld_busy(q);		/* exported wrapper for blk_lld_busy() */
+}
+EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
+
+static int dm_lld_busy(struct request_queue *q)
+{
+	int r;
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+
+	/* No table, or I/O blocked for suspend: report busy. */
+	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
+		r = 1;
+	else
+		r = dm_table_any_busy_target(map);
+
+	dm_table_put(map);		/* also reached when map is NULL */
+
+	return r;
+}
+
 static void dm_unplug_all(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
 	struct dm_table *map = dm_get_table(md);
 
 	if (map) {
+		if (dm_request_based(md))
+			generic_unplug_device(q);
+
 		dm_table_unplug_all(map);
 		dm_table_put(map);
 	}
@@ -1055,7 +1607,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		map = dm_get_table(md);
 		if (map) {
-			r = dm_table_any_congested(map, bdi_bits);
+			/*
+			 * Request-based dm cares about only own queue for
+			 * the query about congestion status of request_queue
+			 */
+			if (dm_request_based(md))
+				r = md->queue->backing_dev_info.state &
+				    bdi_bits;
+			else
+				r = dm_table_any_congested(map, bdi_bits);
+
 			dm_table_put(map);
 		}
 	}
@@ -1458,6 +2019,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
+	struct request_queue *q = md->queue;
+	unsigned long flags;
 
 	dm_unplug_all(md->queue);
 
@@ -1467,7 +2030,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 		set_current_state(interruptible);
 
 		smp_mb();
-		if (!atomic_read(&md->pending))
+		if (dm_request_based(md)) {
+			spin_lock_irqsave(q->queue_lock, flags);
+			if (!queue_in_flight(q) && blk_queue_stopped(q)) {
+				spin_unlock_irqrestore(q->queue_lock, flags);
+				break;
+			}
+			spin_unlock_irqrestore(q->queue_lock, flags);
+		} else if (!atomic_read(&md->pending))
 			break;
 
 		if (interruptible == TASK_INTERRUPTIBLE &&
@@ -1584,6 +2154,67 @@ out:
 	return r;
 }
 
+static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
+{
+	md->suspend_rq.special = (void *)0x1;	/* non-NULL means invalidated */
+}
+
+static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
+{
+	struct request_queue *q = md->queue;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!noflush)			/* only flush suspend queues a marker */
+		dm_rq_invalidate_suspend_marker(md);
+	__start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
+{
+	struct request *rq = &md->suspend_rq;
+	struct request_queue *q = md->queue;
+
+	if (noflush)
+		stop_queue(q);		/* noflush: stop dispatching right away */
+	else {
+		blk_rq_init(q, rq);	/* flush: queue the marker request */
+		blk_insert_request(q, rq, 0, NULL);
+	}
+}
+
+static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
+{
+	int r = 1;			/* 1: a suspend may start, 0: it may not */
+	struct request *rq = &md->suspend_rq;
+	struct request_queue *q = md->queue;
+	unsigned long flags;
+
+	if (noflush)			/* noflush suspend uses no marker */
+		return r;
+
+	/* The marker must be protected by queue lock if it is in use */
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (unlikely(rq->ref_count)) {
+		/*
+		 * This can happen, when the previous flush suspend was
+		 * interrupted, the marker is still in the queue and
+		 * this flush suspend has been invoked, because we don't
+		 * remove the marker at the time of suspend interruption.
+		 * We have only one marker per mapped_device, so we can't
+		 * start another flush suspend while it is in use.
+		 */
+		BUG_ON(!rq->special); /* The marker should be invalidated */
+		DMWARN("Invalidating the previous flush suspend is still in"
+		       " progress.  Please retry later.");
+		r = 0;
+	}
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return r;
+}
+
 /*
  * Functions to lock and unlock any filesystem running on the
  * device.
@@ -1623,6 +2254,53 @@ static void unlock_fs(struct mapped_device *md)
  * dm_bind_table, dm_suspend must be called to flush any in
  * flight bios and ensure that any further io gets deferred.
  */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * After the suspend starts, further incoming requests are kept in
+ * the request_queue and deferred.
+ * Remaining requests in the request_queue at the start of suspend are flushed
+ * if it is flush suspend.
+ * The suspend completes when the following conditions have been satisfied,
+ * so wait for it:
+ *    1. q->in_flight is 0 (which means no in_flight request)
+ *    2. queue has been stopped (which means no request dispatching)
+ *
+ *
+ * Noflush suspend
+ * ---------------
+ * Noflush suspend doesn't need to dispatch remaining requests.
+ * So stop the queue immediately.  Then, wait for all in_flight requests
+ * to be completed or requeued.
+ *
+ * To abort noflush suspend, start the queue.
+ *
+ *
+ * Flush suspend
+ * -------------
+ * Flush suspend needs to dispatch remaining requests.  So stop the queue
+ * after the remaining requests are completed. (Requeued request must be also
+ * re-dispatched and completed.  Until then, we can't stop the queue.)
+ *
+ * During flushing the remaining requests, further incoming requests are also
+ * inserted to the same queue.  To distinguish which requests are to be
+ * flushed, we insert a marker request to the queue at the time of starting
+ * flush suspend, like a barrier.
+ * The dispatching is blocked when the marker is found on the top of the queue.
+ * And the queue is stopped when all in_flight requests are completed, since
+ * that means the remaining requests are completely flushed.
+ * Then, the marker is removed from the queue.
+ *
+ * To abort flush suspend, we also need to take care of the marker, not only
+ * starting the queue.
+ * We don't remove the marker forcibly from the queue since it's against
+ * the block-layer manner.  Instead, we put an invalidated mark on the marker.
+ * When the invalidated marker is found on the top of the queue, it is
+ * immediately removed from the queue, so it doesn't block dispatching.
+ * Because we have only one marker per mapped_device, we can't start another
+ * flush suspend until the invalidated marker is removed from the queue.
+ * So fail and return with -EBUSY in such a case.
+ */
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 	struct dm_table *map = NULL;
@@ -1637,6 +2315,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
+	if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
+		r = -EBUSY;
+		goto out_unlock;
+	}
+
 	map = dm_get_table(md);
 
 	/*
@@ -1682,6 +2365,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	flush_workqueue(md->wq);
 
+	if (dm_request_based(md))
+		dm_rq_start_suspend(md, noflush);
+
 	/*
 	 * At this point no more requests are entering target request routines.
 	 * We call dm_wait_for_completion to wait for all existing requests
@@ -1698,6 +2384,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	if (r < 0) {
 		dm_queue_flush(md);
 
+		if (dm_request_based(md))
+			dm_rq_abort_suspend(md, noflush);
+
 		unlock_fs(md);
 		goto out; /* pushback list is already flushed, so skip flush */
 	}
@@ -1739,6 +2428,14 @@ int dm_resume(struct mapped_device *md)
 
 	dm_queue_flush(md);
 
+	/*
+	 * Flushing deferred I/Os must be done after targets are resumed
+	 * so that mapping of targets can work correctly.
+	 * Request-based dm is queueing the deferred I/Os in its request_queue.
+	 */
+	if (dm_request_based(md))
+		start_queue(md->queue);
+
 	unlock_fs(md);
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 604e85caadf6..8dcabb1caff1 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -50,6 +50,7 @@ void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+int dm_table_any_busy_target(struct dm_table *t);
 
 /*
  * To check the return value from dm_table_find_target().
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e6bf3b8c7bf2..0d6310657f32 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -234,6 +234,7 @@ struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
 int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
+union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 /*
  * Geometry functions.
@@ -396,4 +397,12 @@ static inline unsigned long to_bytes(sector_t n)
 	return (n << SECTOR_SHIFT);
 }
 
+/*-----------------------------------------------------------------
+ * Helper for block layer and dm core operations
+ *---------------------------------------------------------------*/
+void dm_dispatch_request(struct request *rq);
+void dm_requeue_unmapped_request(struct request *rq);
+void dm_kill_unmapped_request(struct request *rq, int error);
+int dm_underlying_device_busy(struct request_queue *q);
+
 #endif	/* _LINUX_DEVICE_MAPPER_H */
-- 
cgit v1.2.3-71-gd317


From 099d5270897606473d63091afcc63f53ee1894bc Mon Sep 17 00:00:00 2001
From: Kevin Hilman <khilman@deeprootsystems.com>
Date: Mon, 22 Jun 2009 18:42:42 +0100
Subject: serial: add OMAP wakeup-enable register

Add the wakeup enable register to the list of OMAP-specific UART
registers.  This is to support forthcoming OMAP PM enhancements which
use the wakeup feature of the OMAP's 8250-based UART.

Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/serial_reg.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_reg.h b/include/linux/serial_reg.h
index 96c0d93fc2ca..850db2e80510 100644
--- a/include/linux/serial_reg.h
+++ b/include/linux/serial_reg.h
@@ -323,6 +323,7 @@
 #define UART_OMAP_MVER		0x14	/* Module version register */
 #define UART_OMAP_SYSC		0x15	/* System configuration register */
 #define UART_OMAP_SYSS		0x16	/* System status register */
+#define UART_OMAP_WER		0x17	/* Wake-up enable register */
 
 #endif /* _LINUX_SERIAL_REG_H */
 
-- 
cgit v1.2.3-71-gd317


From 04896a77a97b87e1611dedd61be88264ef4ac96c Mon Sep 17 00:00:00 2001
From: Robert Love <rlove@google.com>
Date: Mon, 22 Jun 2009 18:43:11 +0100
Subject: msm_serial: serial driver for MSM7K onboard serial peripheral.

Signed-off-by: Brian Swetland <swetland@google.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/Kconfig      |  10 +
 drivers/serial/Makefile     |   1 +
 drivers/serial/msm_serial.c | 767 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/serial/msm_serial.h | 117 +++++++
 include/linux/serial_core.h |   3 +
 5 files changed, 898 insertions(+)
 create mode 100644 drivers/serial/msm_serial.c
 create mode 100644 drivers/serial/msm_serial.h

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 1132c5cae7ab..037c1e0b7c4c 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -1320,6 +1320,16 @@ config SERIAL_SGI_IOC3
 	  If you have an SGI Altix with an IOC3 serial card,
 	  say Y or M.  Otherwise, say N.
 
+config SERIAL_MSM
+	bool "MSM on-chip serial port support"
+	depends on ARM && ARCH_MSM
+	select SERIAL_CORE
+
+config SERIAL_MSM_CONSOLE
+	bool "MSM serial console support"
+	depends on SERIAL_MSM=y
+	select SERIAL_CORE_CONSOLE
+
 config SERIAL_NETX
 	tristate "NetX serial port support"
 	depends on ARM && ARCH_NETX
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 45a8658f54d5..d5a29981c6c4 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_SERIAL_SGI_IOC4) += ioc4_serial.o
 obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_serial.o
 obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o
 obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o
+obj-$(CONFIG_SERIAL_MSM) += msm_serial.o
 obj-$(CONFIG_SERIAL_NETX) += netx-serial.o
 obj-$(CONFIG_SERIAL_OF_PLATFORM) += of_serial.o
 obj-$(CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL) += nwpserial.o
diff --git a/drivers/serial/msm_serial.c b/drivers/serial/msm_serial.c
new file mode 100644
index 000000000000..1a7c856f76f8
--- /dev/null
+++ b/drivers/serial/msm_serial.c
@@ -0,0 +1,767 @@
+/*
+ * drivers/serial/msm_serial.c - driver for msm7k serial device and console
+ *
+ * Copyright (C) 2007 Google, Inc.
+ * Author: Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#if defined(CONFIG_SERIAL_MSM_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+# define SUPPORT_SYSRQ
+#endif
+
+#include <linux/hrtimer.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial_core.h>
+#include <linux/serial.h>
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+
+#include "msm_serial.h"
+
+struct msm_port {
+	struct uart_port	uart;	/* keep first: UART_TO_MSM() casts to it */
+	char			name[16];	/* IRQ name, built in msm_startup() */
+	struct clk		*clk;
+	unsigned int		imr;	/* cached copy of what goes to UART_IMR */
+};
+
+#define UART_TO_MSM(uart_port)	((struct msm_port *) uart_port)
+
+static inline void msm_write(struct uart_port *port, unsigned int val,
+			     unsigned int off)
+{
+	__raw_writel(val, port->membase + off);	/* off is relative to membase */
+}
+
+static inline unsigned int msm_read(struct uart_port *port, unsigned int off)
+{
+	return __raw_readl(port->membase + off); /* off is relative to membase */
+}
+
+static void msm_stop_tx(struct uart_port *port)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	msm_port->imr &= ~UART_IMR_TXLEV;	/* mask the TX level interrupt */
+	msm_write(port, msm_port->imr, UART_IMR);
+}
+
+static void msm_start_tx(struct uart_port *port)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	msm_port->imr |= UART_IMR_TXLEV;	/* unmask; handle_tx() sends data */
+	msm_write(port, msm_port->imr, UART_IMR);
+}
+
+static void msm_stop_rx(struct uart_port *port)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	/* Mask both RX interrupt sources that msm_irq() routes to handle_rx(). */
+	msm_port->imr &= ~(UART_IMR_RXLEV | UART_IMR_RXSTALE);
+	msm_write(port, msm_port->imr, UART_IMR);
+}
+
+static void msm_enable_ms(struct uart_port *port)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	msm_port->imr |= UART_IMR_DELTA_CTS;	/* served by handle_delta_cts() */
+	msm_write(port, msm_port->imr, UART_IMR);
+}
+
+static void handle_rx(struct uart_port *port)
+{
+	struct tty_struct *tty = port->info->port.tty;
+	unsigned int sr;
+
+	/*
+	 * Handle overrun. My understanding of the hardware is that overrun
+	 * is not tied to the RX buffer, so we handle the case out of band.
+	 */
+	if ((msm_read(port, UART_SR) & UART_SR_OVERRUN)) {
+		port->icount.overrun++;
+		tty_insert_flip_char(tty, 0, TTY_OVERRUN);
+		msm_write(port, UART_CR_CMD_RESET_ERR, UART_CR);
+	}
+
+	/* and now the main RX loop */
+	while ((sr = msm_read(port, UART_SR)) & UART_SR_RX_READY) {
+		unsigned int c;
+		char flag = TTY_NORMAL;
+
+		c = msm_read(port, UART_RF);
+
+		if (sr & UART_SR_RX_BREAK) {
+			port->icount.brk++;
+			if (uart_handle_break(port))
+				continue;
+		} else if (sr & UART_SR_PAR_FRAME_ERR) {
+			port->icount.frame++;
+		} else {
+			port->icount.rx++;
+		}
+
+		/* Mask conditions we're ignoring. */
+		sr &= port->read_status_mask;	/* counters above saw the raw sr */
+
+		if (sr & UART_SR_RX_BREAK) {
+			flag = TTY_BREAK;
+		} else if (sr & UART_SR_PAR_FRAME_ERR) {
+			flag = TTY_FRAME;
+		}
+
+		if (!uart_handle_sysrq_char(port, c))
+			tty_insert_flip_char(tty, c, flag);
+	}
+
+	tty_flip_buffer_push(tty);
+}
+
+static void handle_tx(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->info->xmit;
+	struct msm_port *msm_port = UART_TO_MSM(port);
+	int sent_tx;	/* NOTE(review): assigned below but never read */
+
+	if (port->x_char) {		/* a pending x_char goes out first */
+		msm_write(port, port->x_char, UART_TF);
+		port->icount.tx++;
+		port->x_char = 0;
+	}
+
+	while (msm_read(port, UART_SR) & UART_SR_TX_READY) {
+		if (uart_circ_empty(xmit)) {
+			/* disable tx interrupts */
+			msm_port->imr &= ~UART_IMR_TXLEV;
+			msm_write(port, msm_port->imr, UART_IMR);
+			break;
+		}
+
+		msm_write(port, xmit->buf[xmit->tail], UART_TF);
+
+		/* Advance the tail, wrapping at UART_XMIT_SIZE. */
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+		sent_tx = 1;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+}
+
+static void handle_delta_cts(struct uart_port *port)
+{
+	msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR);
+	port->icount.cts++;		/* count the CTS change */
+	wake_up_interruptible(&port->info->delta_msr_wait); /* wake waiters */
+}
+
+static irqreturn_t msm_irq(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;	/* cookie given to request_irq() */
+	struct msm_port *msm_port = UART_TO_MSM(port);
+	unsigned int misr;
+
+	spin_lock(&port->lock);
+	misr = msm_read(port, UART_MISR);
+	msm_write(port, 0, UART_IMR); /* disable interrupt */
+
+	if (misr & (UART_IMR_RXLEV | UART_IMR_RXSTALE))
+		handle_rx(port);
+	if (misr & UART_IMR_TXLEV)
+		handle_tx(port);	/* may clear TXLEV in msm_port->imr */
+	if (misr & UART_IMR_DELTA_CTS)
+		handle_delta_cts(port);
+
+	msm_write(port, msm_port->imr, UART_IMR); /* restore interrupt */
+	spin_unlock(&port->lock);
+
+	return IRQ_HANDLED;		/* returned even if no source matched */
+}
+
+static unsigned int msm_tx_empty(struct uart_port *port)
+{
+	/* TIOCSER_TEMT when UART_SR has UART_SR_TX_EMPTY set, otherwise 0. */
+	return (msm_read(port, UART_SR) & UART_SR_TX_EMPTY) ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int msm_get_mctrl(struct uart_port *port)
+{
+	/* Fixed answer: no hardware register is read here. */
+	return TIOCM_CAR | TIOCM_CTS | TIOCM_DSR | TIOCM_RTS;
+}
+
+static void msm_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	unsigned int mr;
+
+	mr = msm_read(port, UART_MR1);
+
+	/* Only TIOCM_RTS is acted on; other mctrl bits are ignored here. */
+	if (!(mctrl & TIOCM_RTS)) {
+		mr &= ~UART_MR1_RX_RDY_CTL;
+		msm_write(port, mr, UART_MR1);
+		msm_write(port, UART_CR_CMD_RESET_RFR, UART_CR);
+	} else {
+		mr |= UART_MR1_RX_RDY_CTL;
+		msm_write(port, mr, UART_MR1);
+	}
+}
+
+static void msm_break_ctl(struct uart_port *port, int break_ctl)
+{
+	if (break_ctl)			/* nonzero starts a break, zero stops it */
+		msm_write(port, UART_CR_CMD_START_BREAK, UART_CR);
+	else
+		msm_write(port, UART_CR_CMD_STOP_BREAK, UART_CR);
+}
+
+static void msm_set_baud_rate(struct uart_port *port, unsigned int baud)
+{
+	unsigned int baud_code, rxstale, watermark;
+
+	/* Pick the UART_CSR code and the RX stale count for the given rate. */
+	switch (baud) {
+	case 300:
+		baud_code = UART_CSR_300;
+		rxstale = 1;
+		break;
+	case 600:
+		baud_code = UART_CSR_600;
+		rxstale = 1;
+		break;
+	case 1200:
+		baud_code = UART_CSR_1200;
+		rxstale = 1;
+		break;
+	case 2400:
+		baud_code = UART_CSR_2400;
+		rxstale = 1;
+		break;
+	case 4800:
+		baud_code = UART_CSR_4800;
+		rxstale = 1;
+		break;
+	case 9600:
+		baud_code = UART_CSR_9600;
+		rxstale = 2;
+		break;
+	case 14400:
+		baud_code = UART_CSR_14400;
+		rxstale = 3;
+		break;
+	case 19200:
+		baud_code = UART_CSR_19200;
+		rxstale = 4;
+		break;
+	case 28800:
+		baud_code = UART_CSR_28800;
+		rxstale = 6;
+		break;
+	case 38400:
+		baud_code = UART_CSR_38400;
+		rxstale = 8;
+		break;
+	case 57600:
+		baud_code = UART_CSR_57600;
+		rxstale = 16;
+		break;
+	case 115200:
+	default:			/* any unlisted rate is set up as 115200 */
+		baud_code = UART_CSR_115200;
+		rxstale = 31;
+		break;
+	}
+
+	msm_write(port, baud_code, UART_CSR);
+
+	/* RX stale watermark */
+	watermark = UART_IPR_STALE_LSB & rxstale;
+	watermark |= UART_IPR_RXSTALE_LAST;
+	watermark |= UART_IPR_STALE_TIMEOUT_MSB & (rxstale << 2);
+	msm_write(port, watermark, UART_IPR);
+
+	/* set RX watermark */
+	watermark = (port->fifosize * 3) / 4;	/* three quarters of fifosize */
+	msm_write(port, watermark, UART_RFWR);
+
+	/* set TX watermark */
+	msm_write(port, 10, UART_TFWR);
+}
+
+static void msm_reset(struct uart_port *port)
+{
+	/* reset everything: one UART_CR command per write, in this order */
+	msm_write(port, UART_CR_CMD_RESET_RX, UART_CR);
+	msm_write(port, UART_CR_CMD_RESET_TX, UART_CR);
+	msm_write(port, UART_CR_CMD_RESET_ERR, UART_CR);
+	msm_write(port, UART_CR_CMD_RESET_BREAK_INT, UART_CR);
+	msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR);
+	msm_write(port, UART_CR_CMD_SET_RFR, UART_CR);
+}
+
+static void msm_init_clock(struct uart_port *port)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	clk_enable(msm_port->clk);	/* NOTE(review): result is not checked */
+
+	msm_write(port, 0xC0, UART_MREG);	/* fixed values; their meaning */
+	msm_write(port, 0xB2, UART_NREG);	/* is not documented in this */
+	msm_write(port, 0x7D, UART_DREG);	/* file - TODO confirm against */
+	msm_write(port, 0x1C, UART_MNDREG);	/* the hardware manual */
+}
+
+static int msm_startup(struct uart_port *port)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+	unsigned int data, rfr_level;
+	int ret;
+
+	snprintf(msm_port->name, sizeof(msm_port->name),
+		 "msm_serial%d", port->line);
+
+	ret = request_irq(port->irq, msm_irq, IRQF_TRIGGER_HIGH,
+			  msm_port->name, port);
+	if (unlikely(ret))		/* nothing else was set up yet */
+		return ret;
+
+	msm_init_clock(port);
+
+	/* RFR level: fifosize - 12, or the full fifosize for small FIFOs. */
+	if (likely(port->fifosize > 12))
+		rfr_level = port->fifosize - 12;
+	else
+		rfr_level = port->fifosize;
+
+	/* set automatic RFR level */
+	data = msm_read(port, UART_MR1);
+	data &= ~UART_MR1_AUTO_RFR_LEVEL1;
+	data &= ~UART_MR1_AUTO_RFR_LEVEL0;
+	data |= UART_MR1_AUTO_RFR_LEVEL1 & (rfr_level << 2);
+	data |= UART_MR1_AUTO_RFR_LEVEL0 & rfr_level;
+	msm_write(port, data, UART_MR1);
+
+	/* make sure that RXSTALE count is non-zero */
+	data = msm_read(port, UART_IPR);
+	if (unlikely(!data)) {
+		data |= UART_IPR_RXSTALE_LAST;
+		data |= UART_IPR_STALE_LSB;
+		msm_write(port, data, UART_IPR);
+	}
+
+	msm_reset(port);
+
+	msm_write(port, 0x05, UART_CR);	/* enable TX & RX */
+
+	/* turn on RX and CTS interrupts */
+	msm_port->imr = UART_IMR_RXLEV | UART_IMR_RXSTALE |
+			UART_IMR_CURRENT_CTS;
+	msm_write(port, msm_port->imr, UART_IMR);
+
+	return 0;
+}
+
+static void msm_shutdown(struct uart_port *port)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	msm_port->imr = 0;		/* keep the cached mask in step */
+	msm_write(port, 0, UART_IMR); /* disable interrupts */
+
+	clk_disable(msm_port->clk);	/* pairs with msm_init_clock() */
+
+	free_irq(port->irq, port);	/* pairs with request_irq() in startup */
+}
+
+static void msm_set_termios(struct uart_port *port, struct ktermios *termios,
+			    struct ktermios *old)
+{
+	unsigned long flags;
+	unsigned int baud, mr;
+
+	spin_lock_irqsave(&port->lock, flags);	/* held for all updates below */
+
+	/* calculate and set baud rate */
+	baud = uart_get_baud_rate(port, termios, old, 300, 115200);
+	msm_set_baud_rate(port, baud);
+
+	/* calculate parity */
+	mr = msm_read(port, UART_MR2);
+	mr &= ~UART_MR2_PARITY_MODE;
+	if (termios->c_cflag & PARENB) {
+		if (termios->c_cflag & PARODD)
+			mr |= UART_MR2_PARITY_MODE_ODD;
+		else if (termios->c_cflag & CMSPAR)	/* only if PARODD clear */
+			mr |= UART_MR2_PARITY_MODE_SPACE;
+		else
+			mr |= UART_MR2_PARITY_MODE_EVEN;
+	}
+
+	/* calculate bits per char */
+	mr &= ~UART_MR2_BITS_PER_CHAR;
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+		mr |= UART_MR2_BITS_PER_CHAR_5;
+		break;
+	case CS6:
+		mr |= UART_MR2_BITS_PER_CHAR_6;
+		break;
+	case CS7:
+		mr |= UART_MR2_BITS_PER_CHAR_7;
+		break;
+	case CS8:
+	default:
+		mr |= UART_MR2_BITS_PER_CHAR_8;
+		break;
+	}
+
+	/* calculate stop bits */
+	mr &= ~(UART_MR2_STOP_BIT_LEN_ONE | UART_MR2_STOP_BIT_LEN_TWO);
+	if (termios->c_cflag & CSTOPB)
+		mr |= UART_MR2_STOP_BIT_LEN_TWO;
+	else
+		mr |= UART_MR2_STOP_BIT_LEN_ONE;
+
+	/* set parity, bits per char, and stop bit */
+	msm_write(port, mr, UART_MR2);
+
+	/* calculate and set hardware flow control */
+	mr = msm_read(port, UART_MR1);
+	mr &= ~(UART_MR1_CTS_CTL | UART_MR1_RX_RDY_CTL);
+	if (termios->c_cflag & CRTSCTS) {
+		mr |= UART_MR1_CTS_CTL;
+		mr |= UART_MR1_RX_RDY_CTL;
+	}
+	msm_write(port, mr, UART_MR1);
+
+	/* Configure status bits to ignore based on termio flags. */
+	port->read_status_mask = 0;	/* handle_rx() ANDs this into sr */
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= UART_SR_PAR_FRAME_ERR;
+	if (termios->c_iflag & (BRKINT | PARMRK))
+		port->read_status_mask |= UART_SR_RX_BREAK;
+
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static const char *msm_type(struct uart_port *port)
+{
+	return "MSM";			/* same string for every port */
+}
+
+static void msm_release_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	struct resource *resource;
+	resource_size_t size;
+
+	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (unlikely(!resource))	/* nothing is released in this case */
+		return;
+	size = resource->end - resource->start + 1;
+
+	release_mem_region(port->mapbase, size);
+	iounmap(port->membase);
+	port->membase = NULL;		/* msm_console_setup() tests this */
+}
+
+static int msm_request_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	struct resource *resource;
+	resource_size_t size;
+
+	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (unlikely(!resource))
+		return -ENXIO;
+	size = resource->end - resource->start + 1;
+
+	if (unlikely(!request_mem_region(port->mapbase, size, "msm_serial")))
+		return -EBUSY;
+
+	port->membase = ioremap(port->mapbase, size);
+	if (!port->membase) {		/* undo the region claim on failure */
+		release_mem_region(port->mapbase, size);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void msm_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_MSM;
+		msm_request_port(port);	/* NOTE(review): result is ignored */
+	}
+}
+
+static int msm_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+	/* Accept only PORT_UNKNOWN or PORT_MSM, and only an unchanged irq. */
+	if (unlikely(ser->type != PORT_UNKNOWN && ser->type != PORT_MSM))
+		return -EINVAL;
+	if (unlikely(port->irq != ser->irq))
+		return -EINVAL;
+	return 0;
+}
+
+static void msm_power(struct uart_port *port, unsigned int state,
+		      unsigned int oldstate)
+{
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	switch (state) {		/* oldstate is not consulted */
+	case 0:
+		clk_enable(msm_port->clk);
+		break;
+	case 3:
+		clk_disable(msm_port->clk);
+		break;
+	default:			/* any other state is only logged */
+		printk(KERN_ERR "msm_serial: Unknown PM state %d\n", state);
+	}
+}
+
+static struct uart_ops msm_uart_pops = {	/* shared by all msm_uart_ports */
+	.tx_empty = msm_tx_empty,
+	.set_mctrl = msm_set_mctrl,
+	.get_mctrl = msm_get_mctrl,
+	.stop_tx = msm_stop_tx,
+	.start_tx = msm_start_tx,
+	.stop_rx = msm_stop_rx,
+	.enable_ms = msm_enable_ms,
+	.break_ctl = msm_break_ctl,
+	.startup = msm_startup,
+	.shutdown = msm_shutdown,
+	.set_termios = msm_set_termios,
+	.type = msm_type,
+	.release_port = msm_release_port,
+	.request_port = msm_request_port,
+	.config_port = msm_config_port,
+	.verify_port = msm_verify_port,
+	.pm = msm_power,
+};
+
+static struct msm_port msm_uart_ports[] = {	/* indexed by line number */
+	{
+		.uart = {
+			.iotype = UPIO_MEM,
+			.ops = &msm_uart_pops,
+			.flags = UPF_BOOT_AUTOCONF,
+			.fifosize = 512,
+			.line = 0,
+		},
+	},
+	{
+		.uart = {
+			.iotype = UPIO_MEM,
+			.ops = &msm_uart_pops,
+			.flags = UPF_BOOT_AUTOCONF,
+			.fifosize = 512,
+			.line = 1,
+		},
+	},
+	{
+		.uart = {
+			.iotype = UPIO_MEM,
+			.ops = &msm_uart_pops,
+			.flags = UPF_BOOT_AUTOCONF,
+			.fifosize = 64,	/* smaller than the first two ports */
+			.line = 2,
+		},
+	},
+};
+
+#define UART_NR	ARRAY_SIZE(msm_uart_ports)
+
+static inline struct uart_port *get_port_from_line(unsigned int line)
+{
+	return &msm_uart_ports[line].uart;	/* no bounds check here: @line must already be below UART_NR */
+}
+
+#ifdef CONFIG_SERIAL_MSM_CONSOLE
+
+/*
+ * Emit one console character: poll UART_SR until UART_SR_TX_READY is set,
+ * then write @c to UART_TF.
+ */
+static void msm_console_putchar(struct uart_port *port, int c)
+{
+	for (;;) {
+		if (msm_read(port, UART_SR) & UART_SR_TX_READY)
+			break;
+	}
+
+	msm_write(port, c, UART_TF);
+}
+
+/*
+ * Console write callback: send @count bytes of @s through the port selected
+ * by co->index, one character at a time via msm_console_putchar(), while
+ * holding the port lock.  An out-of-range index trips BUG_ON().
+ */
+static void msm_console_write(struct console *co, const char *s,
+			      unsigned int count)
+{
+	struct uart_port *port;
+
+	BUG_ON(co->index < 0 || co->index >= UART_NR);
+
+	port = get_port_from_line(co->index);
+
+	spin_lock(&port->lock);
+	uart_console_write(port, s, count, msm_console_putchar);
+	spin_unlock(&port->lock);
+}
+
+/*
+ * Console setup callback: bind @co to the port selected by co->index and
+ * program the hardware for console use.
+ *
+ * The line format is always forced to 8N1 without flow control; only the
+ * baud rate is taken from @options.  The rate falls back to 115200 when
+ * @options is NULL or the parsed value lies outside 300..115200.
+ *
+ * Returns -ENXIO for an out-of-range index or a port whose registers are not
+ * mapped yet, otherwise the result of uart_set_options().
+ */
+static int __init msm_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	/*
+	 * baud must start with a defined value: uart_parse_options() only
+	 * runs when an options string was supplied, yet baud is range-checked
+	 * unconditionally further down.
+	 */
+	int baud = 115200;
+	int flow, bits, parity;
+
+	if (unlikely(co->index >= UART_NR || co->index < 0))
+		return -ENXIO;
+
+	port = get_port_from_line(co->index);
+
+	if (unlikely(!port->membase))
+		return -ENXIO;
+
+	port->cons = co;
+
+	msm_init_clock(port);
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	/* Whatever was parsed, the console is only ever run as 8N1. */
+	bits = 8;
+	parity = 'n';
+	flow = 'n';
+	msm_write(port, UART_MR2_BITS_PER_CHAR_8 | UART_MR2_STOP_BIT_LEN_ONE,
+		  UART_MR2);	/* 8N1 */
+
+	if (baud < 300 || baud > 115200)
+		baud = 115200;
+	msm_set_baud_rate(port, baud);
+
+	msm_reset(port);
+
+	printk(KERN_INFO "msm_serial: console setup on port #%d\n", port->line);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver msm_uart_driver;
+
+static struct console msm_console = {
+	.name = "ttyMSM",	/* same string as msm_uart_driver.dev_name */
+	.write = msm_console_write,
+	.device = uart_console_device,
+	.setup = msm_console_setup,
+	.flags = CON_PRINTBUFFER,
+	.index = -1,
+	.data = &msm_uart_driver,	/* only forward-declared at this point; defined after the #endif */
+};
+
+#define MSM_CONSOLE	(&msm_console)
+
+#else
+#define MSM_CONSOLE	NULL
+#endif
+
+static struct uart_driver msm_uart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "msm_serial",
+	.dev_name = "ttyMSM",
+	.nr = UART_NR,	/* ARRAY_SIZE(msm_uart_ports) */
+	.cons = MSM_CONSOLE,	/* &msm_console, or NULL without CONFIG_SERIAL_MSM_CONSOLE */
+};
+
+/*
+ * Probe one MSM UART platform device.
+ *
+ * pdev->id selects the slot in msm_uart_ports[].  The function takes a
+ * reference on the "uart_clk" clock, records the clock rate, the start of
+ * the first memory resource and the first IRQ in the uart_port, stores the
+ * uart_port as driver data and registers the port with the serial core.
+ *
+ * Returns 0 on success.  On failure returns -ENXIO (bad id, missing memory
+ * resource or IRQ), the clk_get() error, or the uart_add_one_port() error;
+ * the clock reference is dropped again on every failure after clk_get().
+ */
+static int __init msm_serial_probe(struct platform_device *pdev)
+{
+	struct msm_port *msm_port;
+	struct resource *resource;
+	struct uart_port *port;
+	int irq;
+	int ret;
+
+	if (unlikely(pdev->id < 0 || pdev->id >= UART_NR))
+		return -ENXIO;
+
+	printk(KERN_INFO "msm_serial: detected port #%d\n", pdev->id);
+
+	port = get_port_from_line(pdev->id);
+	port->dev = &pdev->dev;
+	msm_port = UART_TO_MSM(port);
+
+	msm_port->clk = clk_get(&pdev->dev, "uart_clk");
+	if (unlikely(IS_ERR(msm_port->clk)))
+		return PTR_ERR(msm_port->clk);
+	port->uartclk = clk_get_rate(msm_port->clk);
+
+	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (unlikely(!resource)) {
+		ret = -ENXIO;
+		goto err_put_clk;
+	}
+	port->mapbase = resource->start;
+
+	/*
+	 * Test the IRQ in a signed local first: uart_port.irq is an unsigned
+	 * field, so comparing it against 0 could never detect a negative
+	 * error code from platform_get_irq().
+	 */
+	irq = platform_get_irq(pdev, 0);
+	if (unlikely(irq < 0)) {
+		ret = -ENXIO;
+		goto err_put_clk;
+	}
+	port->irq = irq;
+
+	platform_set_drvdata(pdev, port);
+
+	ret = uart_add_one_port(&msm_uart_driver, port);
+	if (unlikely(ret)) {
+		platform_set_drvdata(pdev, NULL);
+		goto err_put_clk;
+	}
+
+	return 0;
+
+err_put_clk:
+	/* remove() is not called for a failed probe, so release the clock here */
+	clk_put(msm_port->clk);
+	return ret;
+}
+
+/*
+ * Undo msm_serial_probe(): unregister the port from the serial core and
+ * drop the clock reference taken at probe time.  Always returns 0.
+ */
+static int __devexit msm_serial_remove(struct platform_device *pdev)
+{
+	/*
+	 * Probe stored the struct uart_port as driver data, not the msm_port
+	 * that contains it, so convert explicitly instead of relying on the
+	 * uart member sitting at offset 0.
+	 */
+	struct uart_port *port = platform_get_drvdata(pdev);
+	struct msm_port *msm_port = UART_TO_MSM(port);
+
+	/* the port was added in probe; take it out before its clock goes away */
+	uart_remove_one_port(&msm_uart_driver, port);
+	clk_put(msm_port->clk);
+
+	return 0;
+}
+
+static struct platform_driver msm_platform_driver = {
+	.probe = msm_serial_probe,	/* also handed explicitly to platform_driver_probe() in msm_serial_init() */
+	.remove = msm_serial_remove,
+	.driver = {
+		.name = "msm_serial",	/* same string as msm_uart_driver.driver_name */
+		.owner = THIS_MODULE,
+	},
+};
+
+/*
+ * Module init: register the UART driver, then probe the platform driver.
+ * If probing fails the UART driver is unregistered again and the error is
+ * returned; the "driver initialized" message is only printed on success.
+ */
+static int __init msm_serial_init(void)
+{
+	int ret;
+
+	ret = uart_register_driver(&msm_uart_driver);
+	if (unlikely(ret))
+		return ret;
+
+	ret = platform_driver_probe(&msm_platform_driver, msm_serial_probe);
+	if (unlikely(ret)) {
+		uart_unregister_driver(&msm_uart_driver);
+		return ret;
+	}
+
+	printk(KERN_INFO "msm_serial: driver initialized\n");
+
+	return 0;
+}
+
+static void __exit msm_serial_exit(void)
+{	/* teardown order: console (if built), platform driver, then the UART driver */
+#ifdef CONFIG_SERIAL_MSM_CONSOLE
+	unregister_console(&msm_console);
+#endif
+	platform_driver_unregister(&msm_platform_driver);
+	uart_unregister_driver(&msm_uart_driver);	/* registered first in msm_serial_init(), released last */
+}
+
+module_init(msm_serial_init);
+module_exit(msm_serial_exit);
+
+MODULE_AUTHOR("Robert Love <rlove@google.com>");
+MODULE_DESCRIPTION("Driver for msm7x serial device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/msm_serial.h b/drivers/serial/msm_serial.h
new file mode 100644
index 000000000000..689f1fa0e84e
--- /dev/null
+++ b/drivers/serial/msm_serial.h
@@ -0,0 +1,117 @@
+/*
+ * drivers/serial/msm_serial.h
+ *
+ * Copyright (C) 2007 Google, Inc.
+ * Author: Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __DRIVERS_SERIAL_MSM_SERIAL_H
+#define __DRIVERS_SERIAL_MSM_SERIAL_H
+
+#define UART_MR1			0x0000
+
+#define UART_MR1_AUTO_RFR_LEVEL0	0x3F
+#define UART_MR1_AUTO_RFR_LEVEL1	0x3FF00
+#define UART_MR1_RX_RDY_CTL    		(1 << 7)
+#define UART_MR1_CTS_CTL       		(1 << 6)
+
+#define UART_MR2			0x0004
+#define UART_MR2_ERROR_MODE		(1 << 6)
+#define UART_MR2_BITS_PER_CHAR		0x30
+#define UART_MR2_BITS_PER_CHAR_5	(0x0 << 4)
+#define UART_MR2_BITS_PER_CHAR_6	(0x1 << 4)
+#define UART_MR2_BITS_PER_CHAR_7	(0x2 << 4)
+#define UART_MR2_BITS_PER_CHAR_8	(0x3 << 4)
+#define UART_MR2_STOP_BIT_LEN_ONE	(0x1 << 2)
+#define UART_MR2_STOP_BIT_LEN_TWO	(0x3 << 2)
+#define UART_MR2_PARITY_MODE_NONE	0x0
+#define UART_MR2_PARITY_MODE_ODD	0x1
+#define UART_MR2_PARITY_MODE_EVEN	0x2
+#define UART_MR2_PARITY_MODE_SPACE	0x3
+#define UART_MR2_PARITY_MODE		0x3
+
+#define UART_CSR	0x0008
+#define UART_CSR_115200	0xFF
+#define UART_CSR_57600	0xEE
+#define UART_CSR_38400	0xDD
+#define UART_CSR_28800	0xCC
+#define UART_CSR_19200	0xBB
+#define UART_CSR_14400	0xAA
+#define UART_CSR_9600	0x99
+#define UART_CSR_4800	0x77
+#define UART_CSR_2400	0x55
+#define UART_CSR_1200	0x44
+#define UART_CSR_600	0x33
+#define UART_CSR_300	0x22
+
+#define UART_TF		0x000C
+
+#define UART_CR				0x0010	/* command register */
+#define UART_CR_CMD_NULL		(0 << 4)	/* commands live in bits 4 and up */
+#define UART_CR_CMD_RESET_RX		(1 << 4)
+#define UART_CR_CMD_RESET_TX		(2 << 4)
+#define UART_CR_CMD_RESET_ERR		(3 << 4)
+#define UART_CR_CMD_RESET_BREAK_INT	(4 << 4)
+#define UART_CR_CMD_START_BREAK		(5 << 4)
+#define UART_CR_CMD_STOP_BREAK		(6 << 4)
+#define UART_CR_CMD_RESET_CTS		(7 << 4)
+#define UART_CR_CMD_PACKET_MODE		(9 << 4)
+#define UART_CR_CMD_MODE_RESET		(12 << 4)
+#define UART_CR_CMD_SET_RFR		(13 << 4)
+#define UART_CR_CMD_RESET_RFR		(14 << 4)
+#define UART_CR_TX_DISABLE		(1 << 3)	/* enable/disable controls need one distinct bit each */
+#define UART_CR_TX_ENABLE		(1 << 2)
+#define UART_CR_RX_DISABLE		(1 << 1)
+#define UART_CR_RX_ENABLE		(1 << 0)
+
+#define UART_IMR		0x0014
+#define UART_IMR_TXLEV		(1 << 0)
+#define UART_IMR_RXSTALE	(1 << 3)
+#define UART_IMR_RXLEV		(1 << 4)
+#define UART_IMR_DELTA_CTS	(1 << 5)
+#define UART_IMR_CURRENT_CTS	(1 << 6)
+
+#define UART_IPR_RXSTALE_LAST		0x20
+#define UART_IPR_STALE_LSB		0x1F
+#define UART_IPR_STALE_TIMEOUT_MSB	0x3FF80
+
+#define UART_IPR	0x0018
+#define UART_TFWR	0x001C
+#define UART_RFWR	0x0020
+#define UART_HCR	0x0024
+
+#define UART_MREG		0x0028
+#define UART_NREG		0x002C
+#define UART_DREG		0x0030
+#define UART_MNDREG		0x0034
+#define UART_IRDA		0x0038
+#define UART_MISR_MODE		0x0040
+#define UART_MISR_RESET		0x0044
+#define UART_MISR_EXPORT	0x0048
+#define UART_MISR_VAL		0x004C
+#define UART_TEST_CTRL		0x0050
+
+#define UART_SR			0x0008
+#define UART_SR_HUNT_CHAR	(1 << 7)
+#define UART_SR_RX_BREAK	(1 << 6)
+#define UART_SR_PAR_FRAME_ERR	(1 << 5)
+#define UART_SR_OVERRUN		(1 << 4)
+#define UART_SR_TX_EMPTY	(1 << 3)
+#define UART_SR_TX_READY	(1 << 2)
+#define UART_SR_RX_FULL		(1 << 1)
+#define UART_SR_RX_READY	(1 << 0)
+
+#define UART_RF		0x000C
+#define UART_MISR	0x0010
+#define UART_ISR	0x0014
+
+#endif	/* __DRIVERS_SERIAL_MSM_SERIAL_H */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 6fd80c4243f1..23d2fb051f97 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -171,6 +171,9 @@
 /* Timberdale UART */
 #define PORT_TIMBUART	87
 
+/* Qualcomm MSM SoCs */
+#define PORT_MSM	88
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
cgit v1.2.3-71-gd317


From 9a7aa12f3911853a3574d47d567b81a2a5df7208 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Jun 2009 15:26:49 +0200
Subject: vfs: Set special lockdep map for dirs only if not set by fs

Some filesystems need to set lockdep map for i_mutex differently for
different directories. For example OCFS2 has system directories (for
orphan inode tracking and for gathering all system files like journal
or quota files into a single place) which have different locking
rules than standard directories. For a filesystem, setting the
lockdep map is naturally done when the inode is read, but we have to
modify unlock_new_inode() not to overwrite the lockdep map the filesystem
has set.

Acked-by: peterz@infradead.org
CC: mingo@redhat.com
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/inode.c              | 17 +++++++++++------
 include/linux/lockdep.h | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index f643be565df8..04c785bb63c3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -665,12 +665,17 @@ void unlock_new_inode(struct inode *inode)
 	if (inode->i_mode & S_IFDIR) {
 		struct file_system_type *type = inode->i_sb->s_type;
 
-		/*
-		 * ensure nobody is actually holding i_mutex
-		 */
-		mutex_destroy(&inode->i_mutex);
-		mutex_init(&inode->i_mutex);
-		lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key);
+		/* Set new key only if filesystem hasn't already changed it */
+		if (!lockdep_match_class(&inode->i_mutex,
+		    &type->i_mutex_key)) {
+			/*
+			 * ensure nobody is actually holding i_mutex
+			 */
+			mutex_destroy(&inode->i_mutex);
+			mutex_init(&inode->i_mutex);
+			lockdep_set_class(&inode->i_mutex,
+					  &type->i_mutex_dir_key);
+		}
 	}
 #endif
 	/*
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index da5a5a1f4cd2..b25d1b53df0d 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -258,6 +258,16 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name,
 #define lockdep_set_subclass(lock, sub)	\
 		lockdep_init_map(&(lock)->dep_map, #lock, \
 				 (lock)->dep_map.key, sub)
+/*
+ * Compare locking classes
+ */
+#define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key)
+
+static inline int lockdep_match_key(struct lockdep_map *lock,
+				    struct lock_class_key *key)
+{
+	return lock->key == key;	/* non-zero only when @lock's key pointer is exactly @key */
+}
 
 /*
  * Acquire a lock.
@@ -326,6 +336,11 @@ static inline void lockdep_on(void)
 #define lockdep_set_class_and_subclass(lock, key, sub) \
 		do { (void)(key); } while (0)
 #define lockdep_set_subclass(lock, sub)		do { } while (0)
+/*
+ * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP
+ * case since the result is not well defined and the caller should rather
+ * #ifdef the call himself.
+ */
 
 # define INIT_LOCKDEP
 # define lockdep_reset()		do { debug_locks = 1; } while (0)
-- 
cgit v1.2.3-71-gd317


From 31950eb66ff47c946fd9c65c2f8c94b6b7ba13fc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 22 Jun 2009 21:18:12 -0700
Subject: mm/init: cpu_hotplug_init() must be initialized before SLAB

SLAB uses get/put_online_cpus() which use a mutex which is itself only
initialized when cpu_hotplug_init() is called.  Currently we hang during
boot in SLAB due to doing that too late.

Reported by James Bottomley and Sachin Sant (and possibly others).
Debugged by Benjamin Herrenschmidt.

This just removes the dynamic initialization of the data structures, and
replaces it with a static one, avoiding this dependency entirely, and
removing one unnecessary special initcall.

Tested-by: Sachin Sant <sachinp@in.ibm.com>
Tested-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpu.h |  5 -----
 init/main.c         |  1 -
 kernel/cpu.c        | 13 +++++--------
 3 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2643d848df90..4d668e05d458 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -69,7 +69,6 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 
 int cpu_up(unsigned int cpu);
 void notify_cpu_starting(unsigned int cpu);
-extern void cpu_hotplug_init(void);
 extern void cpu_maps_update_begin(void);
 extern void cpu_maps_update_done(void);
 
@@ -84,10 +83,6 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 }
 
-static inline void cpu_hotplug_init(void)
-{
-}
-
 static inline void cpu_maps_update_begin(void)
 {
 }
diff --git a/init/main.c b/init/main.c
index 09131ec090c1..4870dfeb9ee5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -678,7 +678,6 @@ asmlinkage void __init start_kernel(void)
 #endif
 	page_cgroup_init();
 	enable_debug_pagealloc();
-	cpu_hotplug_init();
 	kmemtrace_init();
 	kmemleak_init();
 	debug_objects_mem_init();
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
 	 * an ongoing cpu hotplug operation.
 	 */
 	int refcount;
-} cpu_hotplug;
-
-void __init cpu_hotplug_init(void)
-{
-	cpu_hotplug.active_writer = NULL;
-	mutex_init(&cpu_hotplug.lock);
-	cpu_hotplug.refcount = 0;
-}
+} cpu_hotplug = {
+	.active_writer = NULL,
+	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+	.refcount = 0,
+};
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-- 
cgit v1.2.3-71-gd317


From 616511d039af402670de8500d0e24495113a9cab Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 22 Jun 2009 15:09:13 -0400
Subject: VFS: Uninline the function put_mnt_ns()

In order to allow modules to use it without having to export vfsmount_lock.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c                | 8 ++++++--
 include/linux/mnt_namespace.h | 9 +--------
 2 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 2dd333b0fe7f..6645846f2056 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2246,10 +2246,14 @@ void __init mnt_init(void)
 	init_mount_tree();
 }
 
-void __put_mnt_ns(struct mnt_namespace *ns)
+void put_mnt_ns(struct mnt_namespace *ns)
 {
-	struct vfsmount *root = ns->root;
+	struct vfsmount *root;
 	LIST_HEAD(umount_list);
+
+	if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+		return;
+	root = ns->root;
 	ns->root = NULL;
 	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 3a059298cc19..299d11af5f79 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -26,14 +26,7 @@ struct fs_struct;
 
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct fs_struct *);
-extern void __put_mnt_ns(struct mnt_namespace *ns);
-
-static inline void put_mnt_ns(struct mnt_namespace *ns)
-{
-	if (atomic_dec_and_lock(&ns->count, &vfsmount_lock))
-		/* releases vfsmount_lock */
-		__put_mnt_ns(ns);
-}
+extern void put_mnt_ns(struct mnt_namespace *ns);
 
 static inline void exit_mnt_ns(struct task_struct *p)
 {
-- 
cgit v1.2.3-71-gd317


From cf8d2c11cb77f129675478792122f50827e5b0ae Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 22 Jun 2009 15:09:13 -0400
Subject: VFS: Add VFS helper functions for setting up private namespaces

The purpose of this patch is to improve the remote mount path lookup
support for distributed filesystems such as the NFSv4 client.

When given a mount command of the form "mount server:/foo/bar /mnt", the
NFSv4 client is required to look up the filehandle for "server:/", and
then look up each component of the remote mount path "foo/bar" in order
to find the directory that is actually going to be mounted on /mnt.
Following that remote mount path may involve following symlinks,
crossing server-side mount points and even following referrals to
filesystem volumes on other servers.

Since the standard VFS path lookup code already supports walking paths
that contain all these features (using in-kernel automounts for
following referrals) we would like to be able to reuse that rather than
duplicate the full path traversal functionality in the NFSv4 client code.

This patch therefore defines a VFS helper function create_mnt_ns(), that
sets up a temporary filesystem namespace and attaches a root filesystem to
it. It exports the create_mnt_ns() and put_mnt_ns() function for use by
filesystem modules.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c                | 45 +++++++++++++++++++++++++++++++++++--------
 include/linux/mnt_namespace.h |  1 +
 2 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 6645846f2056..a7bea8c8bd46 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1937,6 +1937,21 @@ dput_out:
 	return retval;
 }
 
+/*
+ * Allocate a mnt_namespace and initialise it: reference count 1, no root
+ * mount, empty list head, initialised poll waitqueue and event counter 0.
+ * Returns ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+static struct mnt_namespace *alloc_mnt_ns(void)
+{
+	struct mnt_namespace *ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+
+	if (ns == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ns->root = NULL;
+	ns->event = 0;
+	atomic_set(&ns->count, 1);
+	INIT_LIST_HEAD(&ns->list);
+	init_waitqueue_head(&ns->poll);
+
+	return ns;
+}
+
 /*
  * Allocate a new namespace structure and populate it with contents
  * copied from the namespace of the passed in task structure.
@@ -1948,14 +1963,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
 	struct vfsmount *p, *q;
 
-	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-	if (!new_ns)
-		return ERR_PTR(-ENOMEM);
-
-	atomic_set(&new_ns->count, 1);
-	INIT_LIST_HEAD(&new_ns->list);
-	init_waitqueue_head(&new_ns->poll);
-	new_ns->event = 0;
+	new_ns = alloc_mnt_ns();
+	if (IS_ERR(new_ns))
+		return new_ns;
 
 	down_write(&namespace_sem);
 	/* First pass: copy the tree topology */
@@ -2019,6 +2029,24 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	return new_ns;
 }
 
+/**
+ * create_mnt_ns - creates a private namespace and adds a root filesystem
+ * @mnt: pointer to the new root filesystem mountpoint
+ *
+ * Returns the new namespace with @mnt installed as its root, or the
+ * ERR_PTR() value from alloc_mnt_ns() when allocation fails, in which case
+ * @mnt is left untouched.
+ */
+struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
+{
+	struct mnt_namespace *ns = alloc_mnt_ns();
+
+	if (IS_ERR(ns))
+		return ns;
+
+	mnt->mnt_ns = ns;
+	ns->root = mnt;
+	list_add(&ns->list, &mnt->mnt_list);
+
+	return ns;
+}
+EXPORT_SYMBOL(create_mnt_ns);
+
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 		char __user *, type, unsigned long, flags, void __user *, data)
 {
@@ -2264,3 +2292,4 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	release_mounts(&umount_list);
 	kfree(ns);
 }
+EXPORT_SYMBOL(put_mnt_ns);
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 299d11af5f79..3beb2592b03f 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -24,6 +24,7 @@ struct proc_mounts {
 
 struct fs_struct;
 
+extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt);
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct fs_struct *);
 extern void put_mnt_ns(struct mnt_namespace *ns);
-- 
cgit v1.2.3-71-gd317


From 02ab18b0f497bed623814677577b76cc97234085 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 14 Jun 2009 04:32:04 -0300
Subject: V4L/DVB (12072): gspca-ov519: add extra controls

This patch adds autobrightness (so that it can
be turned off to make the already present brightness
control work) and light frequency filtering controls.

The lightfreq control needed 2 different entries
in the ctrls array, as the number of options differs
depending on the sensor. One of the 2 entries is always
disabled, of course.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/gspca/ov519.c | 214 ++++++++++++++++++++++++++++++++++++--
 include/linux/videodev2.h         |   3 +-
 2 files changed, 207 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/gspca/ov519.c b/drivers/media/video/gspca/ov519.c
index 188866ac6cef..baa488dd33d8 100644
--- a/drivers/media/video/gspca/ov519.c
+++ b/drivers/media/video/gspca/ov519.c
@@ -65,6 +65,8 @@ struct sd {
 	__u8 colors;
 	__u8 hflip;
 	__u8 vflip;
+	__u8 autobrightness;
+	__u8 freq;
 
 	__u8 stopped;		/* Streaming is temporarily paused */
 
@@ -94,11 +96,17 @@ static int sd_sethflip(struct gspca_dev *gspca_dev, __s32 val);
 static int sd_gethflip(struct gspca_dev *gspca_dev, __s32 *val);
 static int sd_setvflip(struct gspca_dev *gspca_dev, __s32 val);
 static int sd_getvflip(struct gspca_dev *gspca_dev, __s32 *val);
+static int sd_setautobrightness(struct gspca_dev *gspca_dev, __s32 val);
+static int sd_getautobrightness(struct gspca_dev *gspca_dev, __s32 *val);
+static int sd_setfreq(struct gspca_dev *gspca_dev, __s32 val);
+static int sd_getfreq(struct gspca_dev *gspca_dev, __s32 *val);
 static void setbrightness(struct gspca_dev *gspca_dev);
 static void setcontrast(struct gspca_dev *gspca_dev);
 static void setcolors(struct gspca_dev *gspca_dev);
+static void setautobrightness(struct sd *sd);
+static void setfreq(struct sd *sd);
 
-static struct ctrl sd_ctrls[] = {
+static const struct ctrl sd_ctrls[] = {
 	{
 	    {
 		.id      = V4L2_CID_BRIGHTNESS,
@@ -141,7 +149,7 @@ static struct ctrl sd_ctrls[] = {
 	    .set = sd_setcolors,
 	    .get = sd_getcolors,
 	},
-/* next controls work with ov7670 only */
+/* The flip controls work with ov7670 only */
 #define HFLIP_IDX 3
 	{
 	    {
@@ -172,6 +180,51 @@ static struct ctrl sd_ctrls[] = {
 	    .set = sd_setvflip,
 	    .get = sd_getvflip,
 	},
+#define AUTOBRIGHT_IDX 5
+	{
+	    {
+		.id      = V4L2_CID_AUTOBRIGHTNESS,
+		.type    = V4L2_CTRL_TYPE_BOOLEAN,
+		.name    = "Auto Brightness",
+		.minimum = 0,
+		.maximum = 1,
+		.step    = 1,
+#define AUTOBRIGHT_DEF 1
+		.default_value = AUTOBRIGHT_DEF,
+	    },
+	    .set = sd_setautobrightness,
+	    .get = sd_getautobrightness,
+	},
+#define FREQ_IDX 6
+	{
+	    {
+		.id	 = V4L2_CID_POWER_LINE_FREQUENCY,
+		.type    = V4L2_CTRL_TYPE_MENU,
+		.name    = "Light frequency filter",
+		.minimum = 0,
+		.maximum = 2,	/* 0: 0, 1: 50Hz, 2:60Hz */
+		.step    = 1,
+#define FREQ_DEF 0
+		.default_value = FREQ_DEF,
+	    },
+	    .set = sd_setfreq,
+	    .get = sd_getfreq,
+	},
+#define OV7670_FREQ_IDX 7
+	{
+	    {
+		.id	 = V4L2_CID_POWER_LINE_FREQUENCY,
+		.type    = V4L2_CTRL_TYPE_MENU,
+		.name    = "Light frequency filter",
+		.minimum = 0,
+		.maximum = 3,	/* 0: 0, 1: 50Hz, 2:60Hz 3: Auto Hz */
+		.step    = 1,
+#define OV7670_FREQ_DEF 3
+		.default_value = OV7670_FREQ_DEF,
+	    },
+	    .set = sd_setfreq,
+	    .get = sd_getfreq,
+	},
 };
 
 static const struct v4l2_pix_format ov519_vga_mode[] = {
@@ -416,7 +469,7 @@ static const struct ov_i2c_regvals norm_6x30[] = {
 	{ 0x07, 0x2d }, /* Sharpness */
 	{ 0x0c, 0x20 },
 	{ 0x0d, 0x20 },
-	{ 0x0e, 0x20 },
+	{ 0x0e, 0xa0 }, /* Was 0x20, bit7 enables a 2x gain which we need */
 	{ 0x0f, 0x05 },
 	{ 0x10, 0x9a },
 	{ 0x11, 0x00 }, /* Pixel clock = fastest */
@@ -1659,9 +1712,21 @@ static int sd_config(struct gspca_dev *gspca_dev,
 	sd->colors = COLOR_DEF;
 	sd->hflip = HFLIP_DEF;
 	sd->vflip = VFLIP_DEF;
-	if (sd->sensor != SEN_OV7670)
-		gspca_dev->ctrl_dis = (1 << HFLIP_IDX)
-					| (1 << VFLIP_IDX);
+	sd->autobrightness = AUTOBRIGHT_DEF;
+	if (sd->sensor == SEN_OV7670) {
+		sd->freq = OV7670_FREQ_DEF;
+		gspca_dev->ctrl_dis = 1 << FREQ_IDX;
+	} else {
+		sd->freq = FREQ_DEF;
+		gspca_dev->ctrl_dis = (1 << HFLIP_IDX) | (1 << VFLIP_IDX) |
+				      (1 << OV7670_FREQ_IDX);
+	}
+	if (sd->sensor == SEN_OV7640 || sd->sensor == SEN_OV7670)
+		gspca_dev->ctrl_dis |= 1 << AUTOBRIGHT_IDX;
+	/* OV8610 Frequency filter control should work but needs testing */
+	if (sd->sensor == SEN_OV8610)
+		gspca_dev->ctrl_dis |= 1 << FREQ_IDX;
+
 	return 0;
 error:
 	PDEBUG(D_ERR, "OV519 Config failed");
@@ -2233,7 +2298,6 @@ static int set_ov_sensor_window(struct sd *sd)
 		msleep(10);	/* need to sleep between read and write to
 				 * same reg! */
 		i2c_w(sd, OV7670_REG_VREF, v);
-		sethvflip(sd);
 	} else {
 		i2c_w(sd, 0x17, hwsbase);
 		i2c_w(sd, 0x18, hwebase + (sd->gspca_dev.width >> hwscale));
@@ -2268,6 +2332,9 @@ static int sd_start(struct gspca_dev *gspca_dev)
 	setcontrast(gspca_dev);
 	setbrightness(gspca_dev);
 	setcolors(gspca_dev);
+	sethvflip(sd);
+	setautobrightness(sd);
+	setfreq(sd);
 
 	ret = ov51x_restart(sd);
 	if (ret < 0)
@@ -2394,8 +2461,7 @@ static void setbrightness(struct gspca_dev *gspca_dev)
 		break;
 	case SEN_OV7620:
 		/* 7620 doesn't like manual changes when in auto mode */
-/*fixme
- *		if (!sd->auto_brt) */
+		if (!sd->autobrightness)
 			i2c_w(sd, OV7610_REG_BRT, val);
 		break;
 	case SEN_OV7670:
@@ -2482,6 +2548,70 @@ static void setcolors(struct gspca_dev *gspca_dev)
 	}
 }
 
+/*
+ * Push sd->autobrightness to the sensor: bit 0x10 of register 0x2d is set
+ * when the field is non-zero and cleared otherwise.  Nothing is written
+ * for the OV7640 and OV7670 sensors.
+ */
+static void setautobrightness(struct sd *sd)
+{
+	int skip = (sd->sensor == SEN_OV7640 || sd->sensor == SEN_OV7670);
+
+	if (skip)
+		return;
+
+	if (sd->autobrightness)
+		i2c_w_mask(sd, 0x2d, 0x10, 0x10);
+	else
+		i2c_w_mask(sd, 0x2d, 0x00, 0x10);
+}
+
+static void setfreq(struct sd *sd)	/* program the banding filter from sd->freq */
+{
+	if (sd->sensor == SEN_OV7670) {
+		switch (sd->freq) {	/* values other than 0..3 write nothing */
+		case 0: /* Banding filter disabled */
+			i2c_w_mask(sd, OV7670_REG_COM8, 0, OV7670_COM8_BFILT);
+			break;
+		case 1: /* 50 hz */
+			i2c_w_mask(sd, OV7670_REG_COM8, OV7670_COM8_BFILT,
+				   OV7670_COM8_BFILT);
+			i2c_w_mask(sd, OV7670_REG_COM11, 0x08, 0x18);
+			break;
+		case 2: /* 60 hz */
+			i2c_w_mask(sd, OV7670_REG_COM8, OV7670_COM8_BFILT,
+				   OV7670_COM8_BFILT);
+			i2c_w_mask(sd, OV7670_REG_COM11, 0x00, 0x18);
+			break;
+		case 3: /* Auto hz */
+			i2c_w_mask(sd, OV7670_REG_COM8, OV7670_COM8_BFILT,
+				   OV7670_COM8_BFILT);
+			i2c_w_mask(sd, OV7670_REG_COM11, OV7670_COM11_HZAUTO,
+				   0x18);
+			break;
+		}
+	} else {
+		switch (sd->freq) {	/* values other than 0..2 write nothing */
+		case 0: /* Banding filter disabled */
+			i2c_w_mask(sd, 0x2d, 0x00, 0x04);
+			i2c_w_mask(sd, 0x2a, 0x00, 0x80);
+			break;
+		case 1: /* 50 hz (filter on and framerate adj) */
+			i2c_w_mask(sd, 0x2d, 0x04, 0x04);
+			i2c_w_mask(sd, 0x2a, 0x80, 0x80);
+			/* 20 fps -> 16.667 fps */
+			if (sd->sensor == SEN_OV6620 ||
+			    sd->sensor == SEN_OV6630)
+				i2c_w(sd, 0x2b, 0x5e);
+			else
+				i2c_w(sd, 0x2b, 0xac);
+			break;
+		case 2: /* 60 hz (filter on, ...) */
+			i2c_w_mask(sd, 0x2d, 0x04, 0x04);
+			if (sd->sensor == SEN_OV6620 ||
+			    sd->sensor == SEN_OV6630) {
+				/* 20 fps -> 15 fps */
+				i2c_w_mask(sd, 0x2a, 0x80, 0x80);
+				i2c_w(sd, 0x2b, 0xa8);
+			} else {
+				/* no framerate adj. */
+				i2c_w_mask(sd, 0x2a, 0x00, 0x80);
+			}
+			break;
+		}
+	}
+}
+
 static int sd_setbrightness(struct gspca_dev *gspca_dev, __s32 val)
 {
 	struct sd *sd = (struct sd *) gspca_dev;
@@ -2572,6 +2702,71 @@ static int sd_getvflip(struct gspca_dev *gspca_dev, __s32 *val)
 	return 0;
 }
 
+/*
+ * Control setter: remember the new auto-brightness value and, only while
+ * the device is streaming, write it to the sensor straight away.
+ * Always returns 0.
+ */
+static int sd_setautobrightness(struct gspca_dev *gspca_dev, __s32 val)
+{
+	struct sd *sd = (struct sd *) gspca_dev;
+
+	sd->autobrightness = val;
+	if (!gspca_dev->streaming)
+		return 0;
+
+	setautobrightness(sd);
+	return 0;
+}
+
+/* Control getter: report the stored auto-brightness value; always returns 0. */
+static int sd_getautobrightness(struct gspca_dev *gspca_dev, __s32 *val)
+{
+	*val = ((struct sd *) gspca_dev)->autobrightness;
+
+	return 0;
+}
+
+/*
+ * Control setter: remember the new light-frequency filter selection and,
+ * only while the device is streaming, write it to the sensor straight away.
+ * Always returns 0.
+ */
+static int sd_setfreq(struct gspca_dev *gspca_dev, __s32 val)
+{
+	struct sd *sd = (struct sd *) gspca_dev;
+
+	sd->freq = val;
+	if (!gspca_dev->streaming)
+		return 0;
+
+	setfreq(sd);
+	return 0;
+}
+
+/* Control getter: report the stored light-frequency selection; always returns 0. */
+static int sd_getfreq(struct gspca_dev *gspca_dev, __s32 *val)
+{
+	*val = ((struct sd *) gspca_dev)->freq;
+
+	return 0;
+}
+
+/*
+ * Menu query callback: fill in menu->name for an entry of the
+ * V4L2_CID_POWER_LINE_FREQUENCY menu.  Indexes 0..2 are always valid;
+ * index 3 is only valid when the sensor is an OV7670.  Returns 0 on
+ * success and -EINVAL for any other control id or index.
+ */
+static int sd_querymenu(struct gspca_dev *gspca_dev,
+			struct v4l2_querymenu *menu)
+{
+	static const char * const freq_names[] = {
+		"NoFliker",	/* 0: V4L2_CID_POWER_LINE_FREQUENCY_DISABLED */
+		"50 Hz",	/* 1: V4L2_CID_POWER_LINE_FREQUENCY_50HZ */
+		"60 Hz",	/* 2: V4L2_CID_POWER_LINE_FREQUENCY_60HZ */
+		"Automatic",	/* 3: OV7670 only */
+	};
+	struct sd *sd = (struct sd *) gspca_dev;
+
+	if (menu->id != V4L2_CID_POWER_LINE_FREQUENCY)
+		return -EINVAL;
+	if (menu->index > 3)
+		return -EINVAL;
+	if (menu->index == 3 && sd->sensor != SEN_OV7670)
+		return -EINVAL;
+
+	strcpy((char *) menu->name, freq_names[menu->index]);
+	return 0;
+}
+
 /* sub-driver description */
 static const struct sd_desc sd_desc = {
 	.name = MODULE_NAME,
@@ -2582,6 +2777,7 @@ static const struct sd_desc sd_desc = {
 	.start = sd_start,
 	.stopN = sd_stopN,
 	.pkt_scan = sd_pkt_scan,
+	.querymenu = sd_querymenu,
 };
 
 /* -- module initialisation -- */
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index f24eceecc5a6..772d226cb5ca 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -894,9 +894,10 @@ enum v4l2_colorfx {
 	V4L2_COLORFX_BW		= 1,
 	V4L2_COLORFX_SEPIA	= 2,
 };
+#define V4L2_CID_AUTOBRIGHTNESS			(V4L2_CID_BASE+32)
 
 /* last CID + 1 */
-#define V4L2_CID_LASTP1                         (V4L2_CID_BASE+32)
+#define V4L2_CID_LASTP1                         (V4L2_CID_BASE+33)
 
 /*  MPEG-class control IDs defined by V4L2 */
 #define V4L2_CID_MPEG_BASE 			(V4L2_CTRL_CLASS_MPEG | 0x900)
-- 
cgit v1.2.3-71-gd317


From 1876bb923c98c605eca69f0bfe295f7b5f5eba28 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 14 Jun 2009 06:45:50 -0300
Subject: V4L/DVB (12079): gspca_ov519: add support for the ov511 bridge

gspca_ov519: add support for the ov511 bridge

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/gspca/ov519.c | 533 +++++++++++++++++++++++++++++++++++++-
 include/linux/videodev2.h         |   1 +
 2 files changed, 521 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/gspca/ov519.c b/drivers/media/video/gspca/ov519.c
index 9d4b69dbf966..1f8e2613ecc5 100644
--- a/drivers/media/video/gspca/ov519.c
+++ b/drivers/media/video/gspca/ov519.c
@@ -76,8 +76,8 @@ struct sd {
 
 	__u8 stopped;		/* Streaming is temporarily paused */
 
-	__u8 frame_rate;	/* current Framerate (OV519 only) */
-	__u8 clockdiv;		/* clockdiv override for OV519 only */
+	__u8 frame_rate;	/* current Framerate */
+	__u8 clockdiv;		/* clockdiv override */
 
 	char sensor;		/* Type of image sensor chip (SEN_*) */
 #define SEN_UNKNOWN 0
@@ -304,17 +304,77 @@ static const struct v4l2_pix_format ov518_sif_mode[] = {
 		.priv = 0},
 };
 
+static const struct v4l2_pix_format ov511_vga_mode[] = {
+	{320, 240, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE,
+		.bytesperline = 320,
+		.sizeimage = 320 * 240 * 3,
+		.colorspace = V4L2_COLORSPACE_JPEG,
+		.priv = 1},
+	{640, 480, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE,
+		.bytesperline = 640,
+		.sizeimage = 640 * 480 * 2,
+		.colorspace = V4L2_COLORSPACE_JPEG,
+		.priv = 0},
+};
+static const struct v4l2_pix_format ov511_sif_mode[] = {
+	{160, 120, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE,
+		.bytesperline = 160,
+		.sizeimage = 40000,
+		.colorspace = V4L2_COLORSPACE_JPEG,
+		.priv = 3},
+	{176, 144, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE,
+		.bytesperline = 176,
+		.sizeimage = 40000,
+		.colorspace = V4L2_COLORSPACE_JPEG,
+		.priv = 1},
+	{320, 240, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE,
+		.bytesperline = 320,
+		.sizeimage = 320 * 240 * 3,
+		.colorspace = V4L2_COLORSPACE_JPEG,
+		.priv = 2},
+	{352, 288, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE,
+		.bytesperline = 352,
+		.sizeimage = 352 * 288 * 3,
+		.colorspace = V4L2_COLORSPACE_JPEG,
+		.priv = 0},
+};
 
 /* Registers common to OV511 / OV518 */
+#define R51x_FIFO_PSIZE			0x30	/* 2 bytes wide w/ OV518(+) */
 #define R51x_SYS_RESET          	0x50
+	/* Reset type flags */
+	#define	OV511_RESET_OMNICE	0x08
 #define R51x_SYS_INIT         		0x53
 #define R51x_SYS_SNAP			0x52
 #define R51x_SYS_CUST_ID		0x5F
 #define R51x_COMP_LUT_BEGIN		0x80
 
 /* OV511 Camera interface register numbers */
+#define R511_CAM_DELAY			0x10
+#define R511_CAM_EDGE			0x11
+#define R511_CAM_PXCNT			0x12
+#define R511_CAM_LNCNT			0x13
+#define R511_CAM_PXDIV			0x14
+#define R511_CAM_LNDIV			0x15
+#define R511_CAM_UV_EN			0x16
+#define R511_CAM_LINE_MODE		0x17
+#define R511_CAM_OPTS			0x18
+
+#define R511_SNAP_FRAME			0x19
+#define R511_SNAP_PXCNT			0x1A
+#define R511_SNAP_LNCNT			0x1B
+#define R511_SNAP_PXDIV			0x1C
+#define R511_SNAP_LNDIV			0x1D
+#define R511_SNAP_UV_EN			0x1E
+#define R511_SNAP_UV_EN			0x1E
+#define R511_SNAP_OPTS			0x1F
+
+#define R511_DRAM_FLOW_CTL		0x20
+#define R511_FIFO_OPTS			0x31
+#define R511_I2C_CTL			0x40
 #define R511_SYS_LED_CTL		0x55	/* OV511+ only */
-#define	OV511_RESET_NOREGS		0x3F	/* All but OV511 & regs */
+#define R511_COMP_EN			0x78
+#define R511_COMP_LUT_EN		0x79
 
 /* OV518 Camera interface register numbers */
 #define R518_GPIO_OUT			0x56	/* OV518(+) only */
@@ -1079,13 +1139,128 @@ static int ov518_reg_w32(struct sd *sd, __u16 index, u32 value, int n)
 	return ret;
 }
 
+static int ov511_i2c_w(struct sd *sd, __u8 reg, __u8 value)
+{
+	int rc, retries;
+
+	PDEBUG(D_USBO, "i2c 0x%02x -> [0x%02x]", value, reg);
+
+	/* Three byte write cycle */
+	for (retries = 6; ; ) {
+		/* Select camera register */
+		rc = reg_w(sd, R51x_I2C_SADDR_3, reg);
+		if (rc < 0)
+			return rc;
+
+		/* Write "value" to I2C data port of OV511 */
+		rc = reg_w(sd, R51x_I2C_DATA, value);
+		if (rc < 0)
+			return rc;
+
+		/* Initiate 3-byte write cycle */
+		rc = reg_w(sd, R511_I2C_CTL, 0x01);
+		if (rc < 0)
+			return rc;
+
+		do
+			rc = reg_r(sd, R511_I2C_CTL);
+		while (rc > 0 && ((rc & 1) == 0)); /* Retry until idle */
+
+		if (rc < 0)
+			return rc;
+
+		if ((rc & 2) == 0) /* Ack? */
+			break;
+		if (--retries < 0) {
+			PDEBUG(D_USBO, "i2c write retries exhausted");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int ov511_i2c_r(struct sd *sd, __u8 reg)
+{
+	int rc, value, retries;
+
+	/* Two byte write cycle */
+	for (retries = 6; ; ) {
+		/* Select camera register */
+		rc = reg_w(sd, R51x_I2C_SADDR_2, reg);
+		if (rc < 0)
+			return rc;
+
+		/* Initiate 2-byte write cycle */
+		rc = reg_w(sd, R511_I2C_CTL, 0x03);
+		if (rc < 0)
+			return rc;
+
+		do
+			rc = reg_r(sd, R511_I2C_CTL);
+		while (rc > 0 && ((rc & 1) == 0)); /* Retry until idle */
+
+		if (rc < 0)
+			return rc;
+
+		if ((rc & 2) == 0) /* Ack? */
+			break;
+
+		/* I2C abort */
+		reg_w(sd, R511_I2C_CTL, 0x10);
+
+		if (--retries < 0) {
+			PDEBUG(D_USBI, "i2c write retries exhausted");
+			return -1;
+		}
+	}
+
+	/* Two byte read cycle */
+	for (retries = 6; ; ) {
+		/* Initiate 2-byte read cycle */
+		rc = reg_w(sd, R511_I2C_CTL, 0x05);
+		if (rc < 0)
+			return rc;
+
+		do
+			rc = reg_r(sd, R511_I2C_CTL);
+		while (rc > 0 && ((rc & 1) == 0)); /* Retry until idle */
+
+		if (rc < 0)
+			return rc;
+
+		if ((rc & 2) == 0) /* Ack? */
+			break;
+
+		/* I2C abort */
+		rc = reg_w(sd, R511_I2C_CTL, 0x10);
+		if (rc < 0)
+			return rc;
+
+		if (--retries < 0) {
+			PDEBUG(D_USBI, "i2c read retries exhausted");
+			return -1;
+		}
+	}
+
+	value = reg_r(sd, R51x_I2C_DATA);
+
+	PDEBUG(D_USBI, "i2c [0x%02X] -> 0x%02X", reg, value);
+
+	/* This is needed to make i2c_w() work */
+	rc = reg_w(sd, R511_I2C_CTL, 0x05);
+	if (rc < 0)
+		return rc;
+
+	return value;
+}
 
 /*
  * The OV518 I2C I/O procedure is different, hence, this function.
  * This is normally only called from i2c_w(). Note that this function
  * always succeeds regardless of whether the sensor is present and working.
  */
-static int i2c_w(struct sd *sd,
+static int ov518_i2c_w(struct sd *sd,
 		__u8 reg,
 		__u8 value)
 {
@@ -1120,7 +1295,7 @@ static int i2c_w(struct sd *sd,
  * This is normally only called from i2c_r(). Note that this function
  * always succeeds regardless of whether the sensor is present and working.
  */
-static int i2c_r(struct sd *sd, __u8 reg)
+static int ov518_i2c_r(struct sd *sd, __u8 reg)
 {
 	int rc, value;
 
@@ -1143,6 +1318,34 @@ static int i2c_r(struct sd *sd, __u8 reg)
 	return value;
 }
 
+static int i2c_w(struct sd *sd, __u8 reg, __u8 value)
+{
+	switch (sd->bridge) {
+	case BRIDGE_OV511:
+	case BRIDGE_OV511PLUS:
+		return ov511_i2c_w(sd, reg, value);
+	case BRIDGE_OV518:
+	case BRIDGE_OV518PLUS:
+	case BRIDGE_OV519:
+		return ov518_i2c_w(sd, reg, value);
+	}
+	return -1; /* Should never happen */
+}
+
+static int i2c_r(struct sd *sd, __u8 reg)
+{
+	switch (sd->bridge) {
+	case BRIDGE_OV511:
+	case BRIDGE_OV511PLUS:
+		return ov511_i2c_r(sd, reg);
+	case BRIDGE_OV518:
+	case BRIDGE_OV518PLUS:
+	case BRIDGE_OV519:
+		return ov518_i2c_r(sd, reg);
+	}
+	return -1; /* Should never happen */
+}
+
 /* Writes bits at positions specified by mask to an I2C reg. Bits that are in
  * the same position as 1's in "mask" are cleared and set to "value". Bits
  * that are in the same position as 0's in "mask" are preserved, regardless
@@ -1490,9 +1693,31 @@ static void ov51x_led_control(struct sd *sd, int on)
 	}
 }
 
-/* OV518 quantization tables are 8x4 (instead of 8x8) */
-static int ov518_upload_quan_tables(struct sd *sd)
+static int ov51x_upload_quan_tables(struct sd *sd)
 {
+	const unsigned char yQuanTable511[] = {
+		0, 1, 1, 2, 2, 3, 3, 4,
+		1, 1, 1, 2, 2, 3, 4, 4,
+		1, 1, 2, 2, 3, 4, 4, 4,
+		2, 2, 2, 3, 4, 4, 4, 4,
+		2, 2, 3, 4, 4, 5, 5, 5,
+		3, 3, 4, 4, 5, 5, 5, 5,
+		3, 4, 4, 4, 5, 5, 5, 5,
+		4, 4, 4, 4, 5, 5, 5, 5
+	};
+
+	const unsigned char uvQuanTable511[] = {
+		0, 2, 2, 3, 4, 4, 4, 4,
+		2, 2, 2, 4, 4, 4, 4, 4,
+		2, 2, 3, 4, 4, 4, 4, 4,
+		3, 4, 4, 4, 4, 4, 4, 4,
+		4, 4, 4, 4, 4, 4, 4, 4,
+		4, 4, 4, 4, 4, 4, 4, 4,
+		4, 4, 4, 4, 4, 4, 4, 4,
+		4, 4, 4, 4, 4, 4, 4, 4
+	};
+
+	/* OV518 quantization tables are 8x4 (instead of 8x8) */
 	const unsigned char yQuanTable518[] = {
 		5, 4, 5, 6, 6, 7, 7, 7,
 		5, 5, 5, 5, 6, 7, 7, 7,
@@ -1507,14 +1732,23 @@ static int ov518_upload_quan_tables(struct sd *sd)
 		7, 7, 7, 7, 7, 7, 8, 8
 	};
 
-	const unsigned char *pYTable = yQuanTable518;
-	const unsigned char *pUVTable = uvQuanTable518;
+	const unsigned char *pYTable, *pUVTable;
 	unsigned char val0, val1;
-	int i, rc, reg = R51x_COMP_LUT_BEGIN;
+	int i, size, rc, reg = R51x_COMP_LUT_BEGIN;
 
 	PDEBUG(D_PROBE, "Uploading quantization tables");
 
-	for (i = 0; i < 16; i++) {
+	if (sd->bridge == BRIDGE_OV511 || sd->bridge == BRIDGE_OV511PLUS) {
+		pYTable = yQuanTable511;
+		pUVTable = uvQuanTable511;
+		size  = 32;
+	} else {
+		pYTable = yQuanTable518;
+		pUVTable = uvQuanTable518;
+		size  = 16;
+	}
+
+	for (i = 0; i < size; i++) {
 		val0 = *pYTable++;
 		val1 = *pYTable++;
 		val0 &= 0x0f;
@@ -1529,7 +1763,7 @@ static int ov518_upload_quan_tables(struct sd *sd)
 		val0 &= 0x0f;
 		val1 &= 0x0f;
 		val0 |= val1 << 4;
-		rc = reg_w(sd, reg + 16, val0);
+		rc = reg_w(sd, reg + size, val0);
 		if (rc < 0)
 			return rc;
 
@@ -1539,6 +1773,87 @@ static int ov518_upload_quan_tables(struct sd *sd)
 	return 0;
 }
 
+/* This initializes the OV511/OV511+ and the sensor */
+static int ov511_configure(struct gspca_dev *gspca_dev)
+{
+	struct sd *sd = (struct sd *) gspca_dev;
+	int rc;
+
+	/* For 511 and 511+ */
+	const struct ov_regvals init_511[] = {
+		{ R51x_SYS_RESET,	0x7f },
+		{ R51x_SYS_INIT,	0x01 },
+		{ R51x_SYS_RESET,	0x7f },
+		{ R51x_SYS_INIT,	0x01 },
+		{ R51x_SYS_RESET,	0x3f },
+		{ R51x_SYS_INIT,	0x01 },
+		{ R51x_SYS_RESET,	0x3d },
+	};
+
+	const struct ov_regvals norm_511[] = {
+		{ R511_DRAM_FLOW_CTL, 	0x01 },
+		{ R51x_SYS_SNAP,	0x00 },
+		{ R51x_SYS_SNAP,	0x02 },
+		{ R51x_SYS_SNAP,	0x00 },
+		{ R511_FIFO_OPTS,	0x1f },
+		{ R511_COMP_EN,		0x00 },
+		{ R511_COMP_LUT_EN,	0x03 },
+	};
+
+	const struct ov_regvals norm_511_p[] = {
+		{ R511_DRAM_FLOW_CTL,	0xff },
+		{ R51x_SYS_SNAP,	0x00 },
+		{ R51x_SYS_SNAP,	0x02 },
+		{ R51x_SYS_SNAP,	0x00 },
+		{ R511_FIFO_OPTS,	0xff },
+		{ R511_COMP_EN,		0x00 },
+		{ R511_COMP_LUT_EN,	0x03 },
+	};
+
+	const struct ov_regvals compress_511[] = {
+		{ 0x70, 0x1f },
+		{ 0x71, 0x05 },
+		{ 0x72, 0x06 },
+		{ 0x73, 0x06 },
+		{ 0x74, 0x14 },
+		{ 0x75, 0x03 },
+		{ 0x76, 0x04 },
+		{ 0x77, 0x04 },
+	};
+
+	PDEBUG(D_PROBE, "Device custom id %x", reg_r(sd, R51x_SYS_CUST_ID));
+
+	rc = write_regvals(sd, init_511, ARRAY_SIZE(init_511));
+	if (rc < 0)
+		return rc;
+
+	switch (sd->bridge) {
+	case BRIDGE_OV511:
+		rc = write_regvals(sd, norm_511, ARRAY_SIZE(norm_511));
+		if (rc < 0)
+			return rc;
+		break;
+	case BRIDGE_OV511PLUS:
+		rc = write_regvals(sd, norm_511_p, ARRAY_SIZE(norm_511_p));
+		if (rc < 0)
+			return rc;
+		break;
+	}
+
+	/* Init compression */
+	rc = write_regvals(sd, compress_511, ARRAY_SIZE(compress_511));
+	if (rc < 0)
+		return rc;
+
+	rc = ov51x_upload_quan_tables(sd);
+	if (rc < 0) {
+		PDEBUG(D_ERR, "Error uploading quantization tables");
+		return rc;
+	}
+
+	return 0;
+}
+
 /* This initializes the OV518/OV518+ and the sensor */
 static int ov518_configure(struct gspca_dev *gspca_dev)
 {
@@ -1615,7 +1930,7 @@ static int ov518_configure(struct gspca_dev *gspca_dev)
 		break;
 	}
 
-	rc = ov518_upload_quan_tables(sd);
+	rc = ov51x_upload_quan_tables(sd);
 	if (rc < 0) {
 		PDEBUG(D_ERR, "Error uploading quantization tables");
 		return rc;
@@ -1661,6 +1976,10 @@ static int sd_config(struct gspca_dev *gspca_dev,
 	sd->invert_led = id->driver_info & BRIDGE_INVERT_LED;
 
 	switch (sd->bridge) {
+	case BRIDGE_OV511:
+	case BRIDGE_OV511PLUS:
+		ret = ov511_configure(gspca_dev);
+		break;
 	case BRIDGE_OV518:
 	case BRIDGE_OV518PLUS:
 		ret = ov518_configure(gspca_dev);
@@ -1719,6 +2038,16 @@ static int sd_config(struct gspca_dev *gspca_dev,
 
 	cam = &gspca_dev->cam;
 	switch (sd->bridge) {
+	case BRIDGE_OV511:
+	case BRIDGE_OV511PLUS:
+		if (!sd->sif) {
+			cam->cam_mode = ov511_vga_mode;
+			cam->nmodes = ARRAY_SIZE(ov511_vga_mode);
+		} else {
+			cam->cam_mode = ov511_sif_mode;
+			cam->nmodes = ARRAY_SIZE(ov511_sif_mode);
+		}
+		break;
 	case BRIDGE_OV518:
 	case BRIDGE_OV518PLUS:
 		if (!sd->sif) {
@@ -1810,6 +2139,126 @@ static int sd_init(struct gspca_dev *gspca_dev)
 	return 0;
 }
 
+/* Set up the OV511/OV511+ with the given image parameters.
+ *
+ * Do not put any sensor-specific code in here (including I2C I/O functions)
+ */
+static int ov511_mode_init_regs(struct sd *sd)
+{
+	int hsegs, vsegs, packet_size, fps, needed;
+	int interlaced = 0;
+	struct usb_host_interface *alt;
+	struct usb_interface *intf;
+
+	intf = usb_ifnum_to_if(sd->gspca_dev.dev, sd->gspca_dev.iface);
+	alt = usb_altnum_to_altsetting(intf, sd->gspca_dev.alt);
+	if (!alt) {
+		PDEBUG(D_ERR, "Couldn't get altsetting");
+		return -EIO;
+	}
+
+	packet_size = le16_to_cpu(alt->endpoint[0].desc.wMaxPacketSize);
+	reg_w(sd, R51x_FIFO_PSIZE, packet_size >> 5);
+
+	reg_w(sd, R511_CAM_UV_EN, 0x01);
+	reg_w(sd, R511_SNAP_UV_EN, 0x01);
+	reg_w(sd, R511_SNAP_OPTS, 0x03);
+
+	/* Here I'm assuming that snapshot size == image size.
+	 * I hope that's always true. --claudio
+	 */
+	hsegs = (sd->gspca_dev.width >> 3) - 1;
+	vsegs = (sd->gspca_dev.height >> 3) - 1;
+
+	reg_w(sd, R511_CAM_PXCNT, hsegs);
+	reg_w(sd, R511_CAM_LNCNT, vsegs);
+	reg_w(sd, R511_CAM_PXDIV, 0x00);
+	reg_w(sd, R511_CAM_LNDIV, 0x00);
+
+	/* YUV420, low pass filter on */
+	reg_w(sd, R511_CAM_OPTS, 0x03);
+
+	/* Snapshot additions */
+	reg_w(sd, R511_SNAP_PXCNT, hsegs);
+	reg_w(sd, R511_SNAP_LNCNT, vsegs);
+	reg_w(sd, R511_SNAP_PXDIV, 0x00);
+	reg_w(sd, R511_SNAP_LNDIV, 0x00);
+
+	/******** Set the framerate ********/
+	if (frame_rate > 0)
+		sd->frame_rate = frame_rate;
+
+	switch (sd->sensor) {
+	case SEN_OV6620:
+		/* No framerate control, doesn't like higher rates yet */
+		sd->clockdiv = 3;
+		break;
+
+	/* Note once the FIXME's in mode_init_ov_sensor_regs() are fixed
+	   for more sensors we need to do this for them too */
+	case SEN_OV7620:
+	case SEN_OV7640:
+		if (sd->gspca_dev.width == 320)
+			interlaced = 1;
+		/* Fall through */
+	case SEN_OV6630:
+	case SEN_OV76BE:
+	case SEN_OV7610:
+	case SEN_OV7670:
+		switch (sd->frame_rate) {
+		case 30:
+		case 25:
+			/* Not enough bandwidth to do 640x480 @ 30 fps */
+			if (sd->gspca_dev.width != 640) {
+				sd->clockdiv = 0;
+				break;
+			}
+			/* Fall through for 640x480 case */
+		default:
+/*		case 20: */
+/*		case 15: */
+			sd->clockdiv = 1;
+			break;
+		case 10:
+			sd->clockdiv = 2;
+			break;
+		case 5:
+			sd->clockdiv = 5;
+			break;
+		}
+		if (interlaced) {
+			sd->clockdiv = (sd->clockdiv + 1) * 2 - 1;
+			/* Higher than 10 does not work */
+			if (sd->clockdiv > 10)
+				sd->clockdiv = 10;
+		}
+		break;
+
+	case SEN_OV8610:
+		/* No framerate control ?? */
+		sd->clockdiv = 0;
+		break;
+	}
+
+	/* Check if we have enough bandwidth to disable compression */
+	fps = (interlaced ? 60 : 30) / (sd->clockdiv + 1) + 1;
+	needed = fps * sd->gspca_dev.width * sd->gspca_dev.height * 3 / 2;
+	/* 1400 is a conservative estimate of the max nr of isoc packets/sec */
+	if (needed > 1400 * packet_size) {
+		/* Enable Y and UV quantization and compression */
+		reg_w(sd, R511_COMP_EN, 0x07);
+		reg_w(sd, R511_COMP_LUT_EN, 0x03);
+	} else {
+		reg_w(sd, R511_COMP_EN, 0x06);
+		reg_w(sd, R511_COMP_LUT_EN, 0x00);
+	}
+
+	reg_w(sd, R51x_SYS_RESET, OV511_RESET_OMNICE);
+	reg_w(sd, R51x_SYS_RESET, 0);
+
+	return 0;
+}
+
 /* Sets up the OV518/OV518+ with the given image parameters
  *
  * OV518 needs a completely different approach, until we can figure out what
@@ -2363,6 +2812,10 @@ static int sd_start(struct gspca_dev *gspca_dev)
 	int ret = 0;
 
 	switch (sd->bridge) {
+	case BRIDGE_OV511:
+	case BRIDGE_OV511PLUS:
+		ret = ov511_mode_init_regs(sd);
+		break;
 	case BRIDGE_OV518:
 	case BRIDGE_OV518PLUS:
 		ret = ov518_mode_init_regs(sd);
@@ -2403,6 +2856,56 @@ static void sd_stopN(struct gspca_dev *gspca_dev)
 	ov51x_led_control(sd, 0);
 }
 
+static void ov511_pkt_scan(struct gspca_dev *gspca_dev,
+			struct gspca_frame *frame,	/* target */
+			__u8 *in,			/* isoc packet */
+			int len)			/* iso packet length */
+{
+	struct sd *sd = (struct sd *) gspca_dev;
+
+	/* SOF/EOF packets have 1st to 8th bytes zeroed and the 9th
+	 * byte non-zero. The EOF packet has image width/height in the
+	 * 10th and 11th bytes. The 9th byte is given as follows:
+	 *
+	 * bit 7: EOF
+	 *     6: compression enabled
+	 *     5: 422/420/400 modes
+	 *     4: 422/420/400 modes
+	 *     3: 1
+	 *     2: snapshot button on
+	 *     1: snapshot frame
+	 *     0: even/odd field
+	 */
+	if (!(in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6] | in[7]) &&
+	    (in[8] & 0x08)) {
+		if (in[8] & 0x80) {
+			/* Frame end */
+			if ((in[9] + 1) * 8 != gspca_dev->width ||
+			    (in[10] + 1) * 8 != gspca_dev->height) {
+				PDEBUG(D_ERR, "Invalid frame size, got: %dx%d,"
+					" requested: %dx%d\n",
+					(in[9] + 1) * 8, (in[10] + 1) * 8,
+					gspca_dev->width, gspca_dev->height);
+				gspca_dev->last_packet_type = DISCARD_PACKET;
+				return;
+			}
+			/* Add 11 byte footer to frame, might be useful */
+			gspca_frame_add(gspca_dev, LAST_PACKET, frame, in, 11);
+			return;
+		} else {
+			/* Frame start */
+			gspca_frame_add(gspca_dev, FIRST_PACKET, frame, in, 0);
+			sd->packet_nr = 0;
+		}
+	}
+
+	/* Ignore the packet number */
+	len--;
+
+	/* intermediate packet */
+	gspca_frame_add(gspca_dev, INTER_PACKET, frame, in, len);
+}
+
 static void ov518_pkt_scan(struct gspca_dev *gspca_dev,
 			struct gspca_frame *frame,	/* target */
 			__u8 *data,			/* isoc packet */
@@ -2495,6 +2998,7 @@ static void sd_pkt_scan(struct gspca_dev *gspca_dev,
 	switch (sd->bridge) {
 	case BRIDGE_OV511:
 	case BRIDGE_OV511PLUS:
+		ov511_pkt_scan(gspca_dev, frame, data, len);
 		break;
 	case BRIDGE_OV518:
 	case BRIDGE_OV518PLUS:
@@ -2862,12 +3366,15 @@ static const __devinitdata struct usb_device_id device_table[] = {
 	{USB_DEVICE(0x045e, 0x028c), .driver_info = BRIDGE_OV519 },
 	{USB_DEVICE(0x054c, 0x0154), .driver_info = BRIDGE_OV519 },
 	{USB_DEVICE(0x054c, 0x0155), .driver_info = BRIDGE_OV519 },
+	{USB_DEVICE(0x05a9, 0x0511), .driver_info = BRIDGE_OV511 },
 	{USB_DEVICE(0x05a9, 0x0518), .driver_info = BRIDGE_OV518 },
 	{USB_DEVICE(0x05a9, 0x0519), .driver_info = BRIDGE_OV519 },
 	{USB_DEVICE(0x05a9, 0x0530), .driver_info = BRIDGE_OV519 },
 	{USB_DEVICE(0x05a9, 0x4519), .driver_info = BRIDGE_OV519 },
 	{USB_DEVICE(0x05a9, 0x8519), .driver_info = BRIDGE_OV519 },
+	{USB_DEVICE(0x05a9, 0xa511), .driver_info = BRIDGE_OV511PLUS },
 	{USB_DEVICE(0x05a9, 0xa518), .driver_info = BRIDGE_OV518PLUS },
+	{USB_DEVICE(0x0813, 0x0002), .driver_info = BRIDGE_OV511PLUS },
 	{}
 };
 
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 772d226cb5ca..8a025d510904 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -348,6 +348,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_SQ905C   v4l2_fourcc('9', '0', '5', 'C') /* compressed RGGB bayer */
 #define V4L2_PIX_FMT_PJPG     v4l2_fourcc('P', 'J', 'P', 'G') /* Pixart 73xx JPEG */
 #define V4L2_PIX_FMT_YVYU     v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */
+#define V4L2_PIX_FMT_OV511    v4l2_fourcc('O', '5', '1', '1') /* ov511 JPEG */
 #define V4L2_PIX_FMT_OV518    v4l2_fourcc('O', '5', '1', '8') /* ov518 JPEG */
 
 /*
-- 
cgit v1.2.3-71-gd317


From f29ac756a40d0f1bb07d682ea521e7b666ff06d5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 19 Jun 2009 18:27:26 +0200
Subject: perf_counter: Optimize perf_swcounter_event()

Similar to tracepoints, use an enable variable to reduce
overhead when unused.

Only look for a counter of a particular event type when we know
there is at least one in the system.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 11 ++++++++++-
 kernel/perf_counter.c        | 18 +++++++++++++++---
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 89698d8aba5c..e7213e46cf9c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -669,7 +669,16 @@ static inline int is_software_counter(struct perf_counter *counter)
 		(counter->attr.type != PERF_TYPE_HW_CACHE);
 }
 
-extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
+extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
+
+extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
+
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+{
+	if (atomic_read(&perf_swcounter_enabled[event]))
+		__perf_swcounter_event(event, nr, nmi, regs, addr);
+}
 
 extern void __perf_counter_mmap(struct vm_area_struct *vma);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1a933a221ea4..7515c7695428 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3317,8 +3317,8 @@ out:
 	put_cpu_var(perf_cpu_context);
 }
 
-void
-perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+void __perf_swcounter_event(u32 event, u64 nr, int nmi,
+			    struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data = {
 		.regs = regs,
@@ -3509,9 +3509,19 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 }
 #endif
 
+atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_counter_destroy(struct perf_counter *counter)
+{
+	u64 event = counter->attr.config;
+
+	atomic_dec(&perf_swcounter_enabled[event]);
+}
+
 static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 {
 	const struct pmu *pmu = NULL;
+	u64 event = counter->attr.config;
 
 	/*
 	 * Software counters (currently) can't in general distinguish
@@ -3520,7 +3530,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (counter->attr.config) {
+	switch (event) {
 	case PERF_COUNT_SW_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
@@ -3541,6 +3551,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_SW_CONTEXT_SWITCHES:
 	case PERF_COUNT_SW_CPU_MIGRATIONS:
+		atomic_inc(&perf_swcounter_enabled[event]);
+		counter->destroy = sw_perf_counter_destroy;
 		pmu = &perf_ops_generic;
 		break;
 	}
-- 
cgit v1.2.3-71-gd317


From d5fdd6babcfc2b0e6a8da1acf492a69fb54b4c47 Mon Sep 17 00:00:00 2001
From: Brian Haley <brian.haley@hp.com>
Date: Tue, 23 Jun 2009 04:31:07 -0700
Subject: ipv6: Use correct data types for ICMPv6 type and code

Change all the code that deals directly with ICMPv6 type and code
values to use u8 instead of a signed int as that's the actual data
type.

Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/icmpv6.h  |  6 +++---
 include/net/protocol.h  |  2 +-
 include/net/rawv6.h     |  2 +-
 include/net/xfrm.h      |  2 +-
 net/dccp/ipv6.c         |  2 +-
 net/ipv6/ah6.c          |  2 +-
 net/ipv6/esp6.c         |  2 +-
 net/ipv6/icmp.c         | 12 ++++++------
 net/ipv6/ip6_tunnel.c   | 18 +++++++++---------
 net/ipv6/ipcomp6.c      |  2 +-
 net/ipv6/mip6.c         |  2 +-
 net/ipv6/raw.c          |  4 ++--
 net/ipv6/route.c        |  2 +-
 net/ipv6/tcp_ipv6.c     |  2 +-
 net/ipv6/tunnel6.c      |  2 +-
 net/ipv6/udp.c          |  6 +++---
 net/ipv6/udp_impl.h     |  2 +-
 net/ipv6/udplite.c      |  2 +-
 net/ipv6/xfrm6_tunnel.c |  2 +-
 net/sctp/ipv6.c         |  2 +-
 20 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 10d701eec484..b6a85183c333 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -175,16 +175,16 @@ struct icmp6_filter {
 
 
 extern void				icmpv6_send(struct sk_buff *skb,
-						    int type, int code,
+						    u8 type, u8 code,
 						    __u32 info, 
 						    struct net_device *dev);
 
 extern int				icmpv6_init(void);
-extern int				icmpv6_err_convert(int type, int code,
+extern int				icmpv6_err_convert(u8 type, u8 code,
 							   int *err);
 extern void				icmpv6_cleanup(void);
 extern void				icmpv6_param_prob(struct sk_buff *skb,
-							  int code, int pos);
+							  u8 code, int pos);
 
 struct flowi;
 struct in6_addr;
diff --git a/include/net/protocol.h b/include/net/protocol.h
index ffa5b8b1f1df..1089d5aabd49 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -53,7 +53,7 @@ struct inet6_protocol
 
 	void	(*err_handler)(struct sk_buff *skb,
 			       struct inet6_skb_parm *opt,
-			       int type, int code, int offset,
+			       u8 type, u8 code, int offset,
 			       __be32 info);
 
 	int	(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index 8a22599f26ba..f6b9b830df8c 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -6,7 +6,7 @@
 #include <net/protocol.h>
 
 void raw6_icmp_error(struct sk_buff *, int nexthdr,
-		int type, int code, int inner_offset, __be32);
+		u8 type, u8 code, int inner_offset, __be32);
 int raw6_local_deliver(struct sk_buff *, int);
 
 extern int			rawv6_rcv(struct sock *sk,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 736bca450886..9e3a3f4c1f60 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1274,7 +1274,7 @@ struct xfrm_tunnel {
 struct xfrm6_tunnel {
 	int (*handler)(struct sk_buff *skb);
 	int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
-			   int type, int code, int offset, __be32 info);
+			   u8 type, u8 code, int offset, __be32 info);
 	struct xfrm6_tunnel *next;
 	int priority;
 };
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 05ea7440d9e5..3e70faab2989 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -85,7 +85,7 @@ static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb)
 }
 
 static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-			int type, int code, int offset, __be32 info)
+			u8 type, u8 code, int offset, __be32 info)
 {
 	struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
 	const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 52449f7a1b71..86f42a288c4b 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -405,7 +405,7 @@ out:
 }
 
 static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		    int type, int code, int offset, __be32 info)
+		    u8 type, u8 code, int offset, __be32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index c2f250150db1..678bb95b1525 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -354,7 +354,7 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
 }
 
 static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		     int type, int code, int offset, __be32 info)
+		     u8 type, u8 code, int offset, __be32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 36dff8807183..eab62a7a8f06 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -117,7 +117,7 @@ static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
 /*
  * Slightly more convenient version of icmpv6_send.
  */
-void icmpv6_param_prob(struct sk_buff *skb, int code, int pos)
+void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
 {
 	icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev);
 	kfree_skb(skb);
@@ -161,7 +161,7 @@ static int is_ineligible(struct sk_buff *skb)
 /*
  * Check the ICMP output rate limit
  */
-static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
+static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type,
 				     struct flowi *fl)
 {
 	struct dst_entry *dst;
@@ -305,7 +305,7 @@ static inline void mip6_addr_swap(struct sk_buff *skb) {}
 /*
  *	Send an ICMP message in response to a packet in error
  */
-void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
+void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		 struct net_device *dev)
 {
 	struct net *net = dev_net(skb->dev);
@@ -590,7 +590,7 @@ out:
 	icmpv6_xmit_unlock(sk);
 }
 
-static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info)
+static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 {
 	struct inet6_protocol *ipprot;
 	int inner_offset;
@@ -643,7 +643,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
 	struct in6_addr *saddr, *daddr;
 	struct ipv6hdr *orig_hdr;
 	struct icmp6hdr *hdr;
-	int type;
+	u8 type;
 
 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 		struct sec_path *sp = skb_sec_path(skb);
@@ -914,7 +914,7 @@ static const struct icmp6_err {
 	},
 };
 
-int icmpv6_err_convert(int type, int code, int *err)
+int icmpv6_err_convert(u8 type, u8 code, int *err)
 {
 	int fatal = 0;
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 404d16a97d5c..51f410e7775a 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -394,13 +394,13 @@ parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw)
 
 static int
 ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
-	    int *type, int *code, int *msg, __u32 *info, int offset)
+	    u8 *type, u8 *code, int *msg, __u32 *info, int offset)
 {
 	struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data;
 	struct ip6_tnl *t;
 	int rel_msg = 0;
-	int rel_type = ICMPV6_DEST_UNREACH;
-	int rel_code = ICMPV6_ADDR_UNREACH;
+	u8 rel_type = ICMPV6_DEST_UNREACH;
+	u8 rel_code = ICMPV6_ADDR_UNREACH;
 	__u32 rel_info = 0;
 	__u16 len;
 	int err = -ENOENT;
@@ -488,11 +488,11 @@ out:
 
 static int
 ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-	   int type, int code, int offset, __be32 info)
+	   u8 type, u8 code, int offset, __be32 info)
 {
 	int rel_msg = 0;
-	int rel_type = type;
-	int rel_code = code;
+	u8 rel_type = type;
+	u8 rel_code = code;
 	__u32 rel_info = ntohl(info);
 	int err;
 	struct sk_buff *skb2;
@@ -586,11 +586,11 @@ out:
 
 static int
 ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-	   int type, int code, int offset, __be32 info)
+	   u8 type, u8 code, int offset, __be32 info)
 {
 	int rel_msg = 0;
-	int rel_type = type;
-	int rel_code = code;
+	u8 rel_type = type;
+	u8 rel_code = code;
 	__u32 rel_info = ntohl(info);
 	int err;
 
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 3a0b3be7ece5..79c172f1ff01 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -51,7 +51,7 @@
 #include <linux/mutex.h>
 
 static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-				int type, int code, int offset, __be32 info)
+				u8 type, u8 code, int offset, __be32 info)
 {
 	__be32 spi;
 	struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index f995e19c87a9..f797e8c6f3b3 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -54,7 +54,7 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen)
 	return data + padlen;
 }
 
-static inline void mip6_param_prob(struct sk_buff *skb, int code, int pos)
+static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos)
 {
 	icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev);
 }
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8b0b6f948063..d6c3c1c34b2d 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -310,7 +310,7 @@ out:
 
 static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 	       struct inet6_skb_parm *opt,
-	       int type, int code, int offset, __be32 info)
+	       u8 type, u8 code, int offset, __be32 info)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -343,7 +343,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 }
 
 void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
-		int type, int code, int inner_offset, __be32 info)
+		u8 type, u8 code, int inner_offset, __be32 info)
 {
 	struct sock *sk;
 	int hash;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 658293ea05ba..1473ee0a1f51 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1865,7 +1865,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
  *	Drop the packet on the floor
  */
 
-static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
+static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
 {
 	int type;
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 53b6a4192b16..58810c65b635 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -317,7 +317,7 @@ failure:
 }
 
 static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		int type, int code, int offset, __be32 info)
+		u8 type, u8 code, int offset, __be32 info)
 {
 	struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
 	const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index 669f280989c3..633ad789effc 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -124,7 +124,7 @@ drop:
 }
 
 static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-			int type, int code, int offset, __be32 info)
+			u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_tunnel *handler;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 023beda6b224..33b59bd92c4d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -312,7 +312,7 @@ csum_copy_err:
 }
 
 void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		    int type, int code, int offset, __be32 info,
+		    u8 type, u8 code, int offset, __be32 info,
 		    struct udp_table *udptable)
 {
 	struct ipv6_pinfo *np;
@@ -346,8 +346,8 @@ out:
 }
 
 static __inline__ void udpv6_err(struct sk_buff *skb,
-				 struct inet6_skb_parm *opt, int type,
-				 int code, int offset, __be32 info     )
+				 struct inet6_skb_parm *opt, u8 type,
+				 u8 code, int offset, __be32 info     )
 {
 	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 23779208c334..6bb303471e20 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -9,7 +9,7 @@
 
 extern int  	__udp6_lib_rcv(struct sk_buff *, struct udp_table *, int );
 extern void 	__udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
-			       int , int , int , __be32 , struct udp_table *);
+			       u8 , u8 , int , __be32 , struct udp_table *);
 
 extern int	udp_v6_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index ba162a824585..4818c48688f2 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -20,7 +20,7 @@ static int udplitev6_rcv(struct sk_buff *skb)
 
 static void udplitev6_err(struct sk_buff *skb,
 			  struct inet6_skb_parm *opt,
-			  int type, int code, int offset, __be32 info)
+			  u8 type, u8 code, int offset, __be32 info)
 {
 	__udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
 }
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 80193db224d9..81a95c00e503 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -262,7 +262,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb)
 }
 
 static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-			    int type, int code, int offset, __be32 info)
+			    u8 type, u8 code, int offset, __be32 info)
 {
 	/* xfrm6_tunnel native err handling */
 	switch (type) {
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index a63de3f7f185..6a4b19094143 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -133,7 +133,7 @@ static struct notifier_block sctp_inet6addr_notifier = {
 
 /* ICMP error handler. */
 SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-			     int type, int code, int offset, __be32 info)
+			     u8 type, u8 code, int offset, __be32 info)
 {
 	struct inet6_dev *idev;
 	struct sock *sk;
-- 
cgit v1.2.3-71-gd317


From 788c7df451467df71638dd79a2d63d78c6e13b9c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Tue, 23 Jun 2009 13:49:05 +0100
Subject: hugetlb: fault flags instead of write_access

handle_mm_fault() is now passing fault flags rather than write_access
down to hugetlb_fault(), so better recognize that in hugetlb_fault(),
and in hugetlb_no_page().

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 ++--
 mm/hugetlb.c            | 17 +++++++++--------
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a05a5ef33391..2723513a5651 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -33,7 +33,7 @@ void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, int write_access);
+			unsigned long address, unsigned int flags);
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						int acctflags);
@@ -98,7 +98,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define pud_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-#define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
+#define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
 
 #define hugetlb_change_protection(vma, address, end, newprot)
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a56e6f3ce979..d0351e31f474 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1985,7 +1985,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 }
 
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep, int write_access)
+			unsigned long address, pte_t *ptep, unsigned int flags)
 {
 	struct hstate *h = hstate_vma(vma);
 	int ret = VM_FAULT_SIGBUS;
@@ -2053,7 +2053,7 @@ retry:
 	 * any allocations necessary to record that reservation occur outside
 	 * the spinlock.
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED))
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
 		if (vma_needs_reservation(h, vma, address) < 0) {
 			ret = VM_FAULT_OOM;
 			goto backout_unlocked;
@@ -2072,7 +2072,7 @@ retry:
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
 
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
 	}
@@ -2091,7 +2091,7 @@ backout_unlocked:
 }
 
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, int write_access)
+			unsigned long address, unsigned int flags)
 {
 	pte_t *ptep;
 	pte_t entry;
@@ -2112,7 +2112,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	mutex_lock(&hugetlb_instantiation_mutex);
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
-		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
 		goto out_mutex;
 	}
 
@@ -2126,7 +2126,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * page now as it is used to determine if a reservation has been
 	 * consumed.
 	 */
-	if (write_access && !pte_write(entry)) {
+	if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, address) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
@@ -2143,7 +2143,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_page_table_lock;
 
 
-	if (write_access) {
+	if (flags & FAULT_FLAG_WRITE) {
 		if (!pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
 							pagecache_page);
@@ -2152,7 +2152,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
+	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
+						flags & FAULT_FLAG_WRITE))
 		update_mmu_cache(vma, address, entry);
 
 out_page_table_lock:
-- 
cgit v1.2.3-71-gd317


From 92722b1bb1ebcba767f9c6ee499992ee33367268 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Thu, 11 Jun 2009 14:17:48 +0100
Subject: leds: Further document parameters for blink_set()

The documentation for the parameters of blink_set() was a bit hard
to find so put some where I'd expected to find it.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Richard Purdie <rpurdie@linux.intel.com>
---
 include/linux/leds.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 376fe07732ea..c7f0b148df06 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -45,7 +45,9 @@ struct led_classdev {
 	/* Get LED brightness level */
 	enum led_brightness (*brightness_get)(struct led_classdev *led_cdev);
 
-	/* Activate hardware accelerated blink */
+	/* Activate hardware accelerated blink, delays are in
+	 * milliseconds and if none is provided then a sensible default
+	 * should be chosen. */
 	int		(*blink_set)(struct led_classdev *led_cdev,
 				     unsigned long *delay_on,
 				     unsigned long *delay_off);
-- 
cgit v1.2.3-71-gd317


From 5054d39e327f76df022163a2ebd02e444c5d65f9 Mon Sep 17 00:00:00 2001
From: Antonio Ospite <ospite@studenti.unina.it>
Date: Fri, 19 Jun 2009 13:55:42 +0200
Subject: leds: LED driver for National Semiconductor LP3944 Funlight Chip

LEDs driver for National Semiconductor LP3944 Funlight Chip
http://www.national.com/pf/LP/LP3944.html

This helper chip can drive up to 8 leds, with two programmable DIM
modes; it could even be used as a gpio expander but this driver assumes
it is used as a led controller.

The DIM modes are used to set _blink_ patterns for leds, the pattern is
specified supplying two parameters:
  - period: from 0s to 1.6s
  - duty cycle: percentage of the period the led is on, from 0 to 100

LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
leds, the camera flash light and the displays backlights.

Signed-off-by: Antonio Ospite <ospite@studenti.unina.it>
Signed-off-by: Richard Purdie <rpurdie@linux.intel.com>
---
 Documentation/leds-lp3944.txt |  50 +++++
 drivers/leds/Kconfig          |  11 +
 drivers/leds/Makefile         |   1 +
 drivers/leds/leds-lp3944.c    | 466 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/leds-lp3944.h   |  53 +++++
 5 files changed, 581 insertions(+)
 create mode 100644 Documentation/leds-lp3944.txt
 create mode 100644 drivers/leds/leds-lp3944.c
 create mode 100644 include/linux/leds-lp3944.h

(limited to 'include/linux')

diff --git a/Documentation/leds-lp3944.txt b/Documentation/leds-lp3944.txt
new file mode 100644
index 000000000000..c6eda18b15ef
--- /dev/null
+++ b/Documentation/leds-lp3944.txt
@@ -0,0 +1,50 @@
+Kernel driver lp3944
+====================
+
+  * National Semiconductor LP3944 Fun-light Chip
+    Prefix: 'lp3944'
+    Addresses scanned: None (see the Notes section below)
+    Datasheet: Publicly available at the National Semiconductor website
+               http://www.national.com/pf/LP/LP3944.html
+
+Authors:
+        Antonio Ospite <ospite@studenti.unina.it>
+
+
+Description
+-----------
+The LP3944 is a helper chip that can drive up to 8 leds, with two programmable
+DIM modes; it could even be used as a gpio expander but this driver assumes it
+is used as a led controller.
+
+The DIM modes are used to set _blink_ patterns for leds, the pattern is
+specified supplying two parameters:
+  - period: from 0s to 1.6s
+  - duty cycle: percentage of the period the led is on, from 0 to 100
+
+Setting a led in DIM0 or DIM1 mode makes it blink according to the pattern.
+See the datasheet for details.
+
+LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
+leds, the camera flash light and the lcds power.
+
+
+Notes
+-----
+The chip is used mainly in embedded contexts, so this driver expects it to be
+registered using the i2c_board_info mechanism.
+
+To register the chip at address 0x60 on adapter 0, set the platform data
+according to include/linux/leds-lp3944.h, set the i2c board info:
+
+	static struct i2c_board_info __initdata a910_i2c_board_info[] = {
+		{
+			I2C_BOARD_INFO("lp3944", 0x60),
+			.platform_data = &a910_lp3944_leds,
+		},
+	};
+
+and register it in the platform init function
+
+	i2c_register_board_info(0, a910_i2c_board_info,
+			ARRAY_SIZE(a910_i2c_board_info));
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index cfcd6bf831c9..7c8e7122aaa9 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -146,6 +146,17 @@ config LEDS_GPIO_OF
 	  of_platform devices.  For instance, LEDs which are listed in a "dts"
 	  file.
 
+config LEDS_LP3944
+	tristate "LED Support for N.S. LP3944 (Fun Light) I2C chip"
+	depends on LEDS_CLASS && I2C
+	help
+	  This option enables support for LEDs connected to the National
+	  Semiconductor LP3944 Lighting Management Unit (LMU) also known as
+	  Fun Light Chip.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called leds-lp3944.
+
 config LEDS_CLEVO_MAIL
 	tristate "Mail LED on Clevo notebook"
 	depends on LEDS_CLASS && X86 && SERIO_I8042 && DMI
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 2d41c4dcf92f..e8cdcf77a4c3 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_LEDS_COBALT_RAQ)		+= leds-cobalt-raq.o
 obj-$(CONFIG_LEDS_SUNFIRE)		+= leds-sunfire.o
 obj-$(CONFIG_LEDS_PCA9532)		+= leds-pca9532.o
 obj-$(CONFIG_LEDS_GPIO)			+= leds-gpio.o
+obj-$(CONFIG_LEDS_LP3944)		+= leds-lp3944.o
 obj-$(CONFIG_LEDS_CLEVO_MAIL)		+= leds-clevo-mail.o
 obj-$(CONFIG_LEDS_HP6XX)		+= leds-hp6xx.o
 obj-$(CONFIG_LEDS_FSG)			+= leds-fsg.o
diff --git a/drivers/leds/leds-lp3944.c b/drivers/leds/leds-lp3944.c
new file mode 100644
index 000000000000..5946208ba26e
--- /dev/null
+++ b/drivers/leds/leds-lp3944.c
@@ -0,0 +1,466 @@
+/*
+ * leds-lp3944.c - driver for National Semiconductor LP3944 Funlight Chip
+ *
+ * Copyright (C) 2009 Antonio Ospite <ospite@studenti.unina.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+/*
+ * I2C driver for National Semiconductor LP3944 Funlight Chip
+ * http://www.national.com/pf/LP/LP3944.html
+ *
+ * This helper chip can drive up to 8 leds, with two programmable DIM modes;
+ * it could even be used as a gpio expander but this driver assumes it is used
+ * as a led controller.
+ *
+ * The DIM modes are used to set _blink_ patterns for leds, the pattern is
+ * specified supplying two parameters:
+ *   - period: from 0s to 1.6s
+ *   - duty cycle: percentage of the period the led is on, from 0 to 100
+ *
+ * LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
+ * leds, the camera flash light and the displays backlights.
+ */
+
+#include <linux/module.h>
+#include <linux/i2c.h>
+#include <linux/leds.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/leds-lp3944.h>
+
+/* Read Only Registers */
+#define LP3944_REG_INPUT1     0x00 /* LEDs 0-7 InputRegister (Read Only) */
+#define LP3944_REG_REGISTER1  0x01 /* None (Read Only) */
+
+#define LP3944_REG_PSC0       0x02 /* Frequency Prescaler 0 (R/W) */
+#define LP3944_REG_PWM0       0x03 /* PWM Register 0 (R/W) */
+#define LP3944_REG_PSC1       0x04 /* Frequency Prescaler 1 (R/W) */
+#define LP3944_REG_PWM1       0x05 /* PWM Register 1 (R/W) */
+#define LP3944_REG_LS0        0x06 /* LEDs 0-3 Selector (R/W) */
+#define LP3944_REG_LS1        0x07 /* LEDs 4-7 Selector (R/W) */
+
+/* These registers are not used to control leds in LP3944, they can store
+ * arbitrary values which the chip will ignore.
+ */
+#define LP3944_REG_REGISTER8  0x08
+#define LP3944_REG_REGISTER9  0x09
+
+#define LP3944_DIM0 0
+#define LP3944_DIM1 1
+
+/* period in ms */
+#define LP3944_PERIOD_MIN 0
+#define LP3944_PERIOD_MAX 1600
+
+/* duty cycle is a percentage */
+#define LP3944_DUTY_CYCLE_MIN 0
+#define LP3944_DUTY_CYCLE_MAX 100
+
+#define ldev_to_led(c)       container_of(c, struct lp3944_led_data, ldev)
+
+/* Saved data */
+struct lp3944_led_data {
+	u8 id;
+	enum lp3944_type type;
+	enum lp3944_status status;
+	struct led_classdev ldev;
+	struct i2c_client *client;
+	struct work_struct work;
+};
+
+struct lp3944_data {
+	struct mutex lock;
+	struct i2c_client *client;
+	struct lp3944_led_data leds[LP3944_LEDS_MAX];
+};
+
+static int lp3944_reg_read(struct i2c_client *client, u8 reg, u8 *value)
+{
+	int tmp;
+
+	tmp = i2c_smbus_read_byte_data(client, reg);
+	if (tmp < 0)
+		return -EINVAL;
+
+	*value = tmp;
+
+	return 0;
+}
+
+static int lp3944_reg_write(struct i2c_client *client, u8 reg, u8 value)
+{
+	return i2c_smbus_write_byte_data(client, reg, value);
+}
+
+/**
+ * Set the period for DIM status
+ *
+ * @client: the i2c client
+ * @dim: either LP3944_DIM0 or LP3944_DIM1
+ * @period: period of a blink, that is a on/off cycle, expressed in ms.
+ */
+static int lp3944_dim_set_period(struct i2c_client *client, u8 dim, u16 period)
+{
+	u8 psc_reg;
+	u8 psc_value;
+	int err;
+
+	if (dim == LP3944_DIM0)
+		psc_reg = LP3944_REG_PSC0;
+	else if (dim == LP3944_DIM1)
+		psc_reg = LP3944_REG_PSC1;
+	else
+		return -EINVAL;
+
+	/* Convert period to Prescaler value */
+	if (period > LP3944_PERIOD_MAX)
+		return -EINVAL;
+
+	psc_value = (period * 255) / LP3944_PERIOD_MAX;
+
+	err = lp3944_reg_write(client, psc_reg, psc_value);
+
+	return err;
+}
+
+/**
+ * Set the duty cycle for DIM status
+ *
+ * @client: the i2c client
+ * @dim: either LP3944_DIM0 or LP3944_DIM1
+ * @duty_cycle: percentage of a period during which a led is ON
+ */
+static int lp3944_dim_set_dutycycle(struct i2c_client *client, u8 dim,
+				    u8 duty_cycle)
+{
+	u8 pwm_reg;
+	u8 pwm_value;
+	int err;
+
+	if (dim == LP3944_DIM0)
+		pwm_reg = LP3944_REG_PWM0;
+	else if (dim == LP3944_DIM1)
+		pwm_reg = LP3944_REG_PWM1;
+	else
+		return -EINVAL;
+
+	/* Convert duty cycle to PWM value */
+	if (duty_cycle > LP3944_DUTY_CYCLE_MAX)
+		return -EINVAL;
+
+	pwm_value = (duty_cycle * 255) / LP3944_DUTY_CYCLE_MAX;
+
+	err = lp3944_reg_write(client, pwm_reg, pwm_value);
+
+	return err;
+}
+
+/**
+ * Set the led status
+ *
+ * @led: a lp3944_led_data structure
+ * @status: one of LP3944_LED_STATUS_OFF
+ *                 LP3944_LED_STATUS_ON
+ *                 LP3944_LED_STATUS_DIM0
+ *                 LP3944_LED_STATUS_DIM1
+ */
+static int lp3944_led_set(struct lp3944_led_data *led, u8 status)
+{
+	struct lp3944_data *data = i2c_get_clientdata(led->client);
+	u8 id = led->id;
+	u8 reg;
+	u8 val = 0;
+	int err;
+
+	dev_dbg(&led->client->dev, "%s: %s, status before normalization:%d\n",
+		__func__, led->ldev.name, status);
+
+	switch (id) {
+	case LP3944_LED0:
+	case LP3944_LED1:
+	case LP3944_LED2:
+	case LP3944_LED3:
+		reg = LP3944_REG_LS0;
+		break;
+	case LP3944_LED4:
+	case LP3944_LED5:
+	case LP3944_LED6:
+	case LP3944_LED7:
+		id -= LP3944_LED4;
+		reg = LP3944_REG_LS1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (status > LP3944_LED_STATUS_DIM1)
+		return -EINVAL;
+
+	/* invert only 0 and 1, leave unchanged the other values,
+	 * remember we are abusing status to set blink patterns
+	 */
+	if (led->type == LP3944_LED_TYPE_LED_INVERTED && status < 2)
+		status = 1 - status;
+
+	mutex_lock(&data->lock);
+	lp3944_reg_read(led->client, reg, &val);
+
+	val &= ~(LP3944_LED_STATUS_MASK << (id << 1));
+	val |= (status << (id << 1));
+
+	dev_dbg(&led->client->dev, "%s: %s, reg:%d id:%d status:%d val:%#x\n",
+		__func__, led->ldev.name, reg, id, status, val);
+
+	/* set led status */
+	err = lp3944_reg_write(led->client, reg, val);
+	mutex_unlock(&data->lock);
+
+	return err;
+}
+
+static int lp3944_led_set_blink(struct led_classdev *led_cdev,
+				unsigned long *delay_on,
+				unsigned long *delay_off)
+{
+	struct lp3944_led_data *led = ldev_to_led(led_cdev);
+	u16 period;
+	u8 duty_cycle;
+	int err;
+
+	/* units are in ms */
+	if (*delay_on + *delay_off > LP3944_PERIOD_MAX)
+		return -EINVAL;
+
+	if (*delay_on == 0 && *delay_off == 0) {
+		/* Special case: the leds subsystem requires a default user
+		 * friendly blink pattern for the LED.  Let's blink the led
+		 * slowly (1Hz).
+		 */
+		*delay_on = 500;
+		*delay_off = 500;
+	}
+
+	period = (*delay_on) + (*delay_off);
+
+	/* duty_cycle is the percentage of period during which the led is ON */
+	duty_cycle = 100 * (*delay_on) / period;
+
+	/* invert duty cycle for inverted leds, this has the same effect of
+	 * swapping delay_on and delay_off
+	 */
+	if (led->type == LP3944_LED_TYPE_LED_INVERTED)
+		duty_cycle = 100 - duty_cycle;
+
+	/* NOTE: using always the first DIM mode, this means that all leds
+	 * will have the same blinking pattern.
+	 *
+	 * We could find a way later to have two leds blinking in hardware
+	 * with different patterns at the same time, falling back to software
+	 * control for the other ones.
+	 */
+	err = lp3944_dim_set_period(led->client, LP3944_DIM0, period);
+	if (err)
+		return err;
+
+	err = lp3944_dim_set_dutycycle(led->client, LP3944_DIM0, duty_cycle);
+	if (err)
+		return err;
+
+	dev_dbg(&led->client->dev, "%s: OK hardware accelerated blink!\n",
+		__func__);
+
+	led->status = LP3944_LED_STATUS_DIM0;
+	schedule_work(&led->work);
+
+	return 0;
+}
+
+static void lp3944_led_set_brightness(struct led_classdev *led_cdev,
+				      enum led_brightness brightness)
+{
+	struct lp3944_led_data *led = ldev_to_led(led_cdev);
+
+	dev_dbg(&led->client->dev, "%s: %s, %d\n",
+		__func__, led_cdev->name, brightness);
+
+	led->status = brightness;
+	schedule_work(&led->work);
+}
+
+static void lp3944_led_work(struct work_struct *work)
+{
+	struct lp3944_led_data *led;
+
+	led = container_of(work, struct lp3944_led_data, work);
+	lp3944_led_set(led, led->status);
+}
+
+static int lp3944_configure(struct i2c_client *client,
+			    struct lp3944_data *data,
+			    struct lp3944_platform_data *pdata)
+{
+	int i, err = 0;
+
+	for (i = 0; i < pdata->leds_size; i++) {
+		struct lp3944_led *pled = &pdata->leds[i];
+		struct lp3944_led_data *led = &data->leds[i];
+		led->client = client;
+		led->id = i;
+
+		switch (pled->type) {
+
+		case LP3944_LED_TYPE_LED:
+		case LP3944_LED_TYPE_LED_INVERTED:
+			led->type = pled->type;
+			led->status = pled->status;
+			led->ldev.name = pled->name;
+			led->ldev.max_brightness = 1;
+			led->ldev.brightness_set = lp3944_led_set_brightness;
+			led->ldev.blink_set = lp3944_led_set_blink;
+			led->ldev.flags = LED_CORE_SUSPENDRESUME;
+
+			INIT_WORK(&led->work, lp3944_led_work);
+			err = led_classdev_register(&client->dev, &led->ldev);
+			if (err < 0) {
+				dev_err(&client->dev,
+					"couldn't register LED %s\n",
+					led->ldev.name);
+				goto exit;
+			}
+
+			/* to expose the default value to userspace */
+			led->ldev.brightness = led->status;
+
+			/* Set the default led status */
+			err = lp3944_led_set(led, led->status);
+			if (err < 0) {
+				dev_err(&client->dev,
+					"%s couldn't set STATUS %d\n",
+					led->ldev.name, led->status);
+				goto exit;
+			}
+			break;
+
+		case LP3944_LED_TYPE_NONE:
+		default:
+			break;
+
+		}
+	}
+	return 0;
+
+exit:
+	if (i > 0)
+		for (i = i - 1; i >= 0; i--)
+			switch (pdata->leds[i].type) {
+
+			case LP3944_LED_TYPE_LED:
+			case LP3944_LED_TYPE_LED_INVERTED:
+				led_classdev_unregister(&data->leds[i].ldev);
+				cancel_work_sync(&data->leds[i].work);
+				break;
+
+			case LP3944_LED_TYPE_NONE:
+			default:
+				break;
+			}
+
+	return err;
+}
+
+static int __devinit lp3944_probe(struct i2c_client *client,
+				  const struct i2c_device_id *id)
+{
+	struct lp3944_platform_data *lp3944_pdata = client->dev.platform_data;
+	struct lp3944_data *data;
+
+	if (lp3944_pdata == NULL) {
+		dev_err(&client->dev, "no platform data\n");
+		return -EINVAL;
+	}
+
+	/* Let's see whether this adapter can support what we need. */
+	if (!i2c_check_functionality(client->adapter,
+				I2C_FUNC_SMBUS_BYTE_DATA)) {
+		dev_err(&client->dev, "insufficient functionality!\n");
+		return -ENODEV;
+	}
+
+	data = kzalloc(sizeof(struct lp3944_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->client = client;
+	i2c_set_clientdata(client, data);
+
+	mutex_init(&data->lock);
+
+	dev_info(&client->dev, "lp3944 enabled\n");
+
+	lp3944_configure(client, data, lp3944_pdata);
+	return 0;
+}
+
+static int __devexit lp3944_remove(struct i2c_client *client)
+{
+	struct lp3944_platform_data *pdata = client->dev.platform_data;
+	struct lp3944_data *data = i2c_get_clientdata(client);
+	int i;
+
+	for (i = 0; i < pdata->leds_size; i++)
+		switch (data->leds[i].type) {
+		case LP3944_LED_TYPE_LED:
+		case LP3944_LED_TYPE_LED_INVERTED:
+			led_classdev_unregister(&data->leds[i].ldev);
+			cancel_work_sync(&data->leds[i].work);
+			break;
+
+		case LP3944_LED_TYPE_NONE:
+		default:
+			break;
+		}
+
+	kfree(data);
+	i2c_set_clientdata(client, NULL);
+
+	return 0;
+}
+
+/* lp3944 i2c driver struct */
+static const struct i2c_device_id lp3944_id[] = {
+	{"lp3944", 0},
+	{}
+};
+
+MODULE_DEVICE_TABLE(i2c, lp3944_id);
+
+static struct i2c_driver lp3944_driver = {
+	.driver   = {
+		   .name = "lp3944",
+	},
+	.probe    = lp3944_probe,
+	.remove   = __devexit_p(lp3944_remove),
+	.id_table = lp3944_id,
+};
+
+static int __init lp3944_module_init(void)
+{
+	return i2c_add_driver(&lp3944_driver);
+}
+
+static void __exit lp3944_module_exit(void)
+{
+	i2c_del_driver(&lp3944_driver);
+}
+
+module_init(lp3944_module_init);
+module_exit(lp3944_module_exit);
+
+MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
+MODULE_DESCRIPTION("LP3944 Fun Light Chip");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/leds-lp3944.h b/include/linux/leds-lp3944.h
new file mode 100644
index 000000000000..afc9f9fd70f5
--- /dev/null
+++ b/include/linux/leds-lp3944.h
@@ -0,0 +1,53 @@
+/*
+ * leds-lp3944.h - platform data structure for lp3944 led controller
+ *
+ * Copyright (C) 2009 Antonio Ospite <ospite@studenti.unina.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __LINUX_LEDS_LP3944_H
+#define __LINUX_LEDS_LP3944_H
+
+#include <linux/leds.h>
+#include <linux/workqueue.h>
+
+#define LP3944_LED0 0
+#define LP3944_LED1 1
+#define LP3944_LED2 2
+#define LP3944_LED3 3
+#define LP3944_LED4 4
+#define LP3944_LED5 5
+#define LP3944_LED6 6
+#define LP3944_LED7 7
+#define LP3944_LEDS_MAX 8
+
+#define LP3944_LED_STATUS_MASK	0x03
+enum lp3944_status {
+	LP3944_LED_STATUS_OFF  = 0x0,
+	LP3944_LED_STATUS_ON   = 0x1,
+	LP3944_LED_STATUS_DIM0 = 0x2,
+	LP3944_LED_STATUS_DIM1 = 0x3
+};
+
+enum lp3944_type {
+	LP3944_LED_TYPE_NONE,
+	LP3944_LED_TYPE_LED,
+	LP3944_LED_TYPE_LED_INVERTED,
+};
+
+struct lp3944_led {
+	char *name;
+	enum lp3944_type type;
+	enum lp3944_status status;
+};
+
+struct lp3944_platform_data {
+	struct lp3944_led leds[LP3944_LEDS_MAX];
+	u8 leds_size;
+};
+
+#endif /* __LINUX_LEDS_LP3944_H */
-- 
cgit v1.2.3-71-gd317


From ed88bae6918fa990cbfe47316bd0f790121aaf00 Mon Sep 17 00:00:00 2001
From: Trent Piepho <xyzzy@speakeasy.org>
Date: Tue, 12 May 2009 15:33:12 -0700
Subject: leds: Add options to have GPIO LEDs start on or keep their state

There already is a "default-on" trigger but there are problems with it.

For one, it's an inefficient way to do it and requires led trigger support
to be compiled in.

But the real reason is that it produces a glitch on the LED.  The GPIO is
allocated with the LED *off*, then *later* when the trigger runs it is
turned back on.  If the LED was already on via the GPIO's reset default or
action of the firmware, this produces a glitch where the LED goes from on
to off to on.  While normally this is fast enough that it wouldn't be
noticeable to a human observer, there are still serious problems.

One is that there may be something else on the GPIO line, like a hardware
alarm or watchdog, that is fast enough to notice the glitch.

Another is that the kernel may panic before the LED is turned back on, thus
hanging with the LED in the wrong state.  This is not just speculation, but
actually happened to me with an embedded system that has an LED which
should turn off when the kernel finishes booting, which was left in the
incorrect state due to a bug in the OF LED binding code.

We also let GPIO LEDs get their initial value from whatever the current
state of the GPIO line is.  On some systems the LEDs are put into some
state by the firmware or hardware before Linux boots, and it is desired to
have them keep this state which is otherwise unknown to Linux.

This requires that the underlying GPIO driver support reading the value of
output GPIOs.  Some drivers support this and some do not.

The platform device binding gains a field in the platform data
"default_state" that controls this.  There are three constants defined to
select from on, off, or keeping the current state.  The OpenFirmware
binding uses a property named "default-state" that can be set to "on",
"off", or "keep".  The default if the property isn't present is off.

Signed-off-by: Trent Piepho <xyzzy@speakeasy.org>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Sean MacLennan <smaclennan@pikatech.com>
Signed-off-by: Richard Purdie <rpurdie@linux.intel.com>
---
 Documentation/powerpc/dts-bindings/gpio/led.txt | 17 ++++++++++++++++-
 drivers/leds/leds-gpio.c                        | 20 +++++++++++++++++---
 include/linux/leds.h                            |  9 +++++++--
 3 files changed, 40 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/powerpc/dts-bindings/gpio/led.txt b/Documentation/powerpc/dts-bindings/gpio/led.txt
index 4fe14deedc0a..064db928c3c1 100644
--- a/Documentation/powerpc/dts-bindings/gpio/led.txt
+++ b/Documentation/powerpc/dts-bindings/gpio/led.txt
@@ -16,10 +16,17 @@ LED sub-node properties:
   string defining the trigger assigned to the LED.  Current triggers are:
     "backlight" - LED will act as a back-light, controlled by the framebuffer
 		  system
-    "default-on" - LED will turn on
+    "default-on" - LED will turn on, but see "default-state" below
     "heartbeat" - LED "double" flashes at a load average based rate
     "ide-disk" - LED indicates disk activity
     "timer" - LED flashes at a fixed, configurable rate
+- default-state:  (optional) The initial state of the LED.  Valid
+  values are "on", "off", and "keep".  If the LED is already on or off
+  and the default-state property is set to the same value, then no
+  glitch should be produced where the LED momentarily turns off (or
+  on).  The "keep" setting will keep the LED at whatever its current
+  state is, without producing a glitch.  The default is off if this
+  property is not present.
 
 Examples:
 
@@ -30,14 +37,22 @@ leds {
 		gpios = <&mcu_pio 0 1>; /* Active low */
 		linux,default-trigger = "ide-disk";
 	};
+
+	fault {
+		gpios = <&mcu_pio 1 0>;
+		/* Keep LED on if BIOS detected hardware fault */
+		default-state = "keep";
+	};
 };
 
 run-control {
 	compatible = "gpio-leds";
 	red {
 		gpios = <&mpc8572 6 0>;
+		default-state = "off";
 	};
 	green {
 		gpios = <&mpc8572 7 0>;
+		default-state = "on";
 	};
 }
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index 76895e691042..6b06638eb5b4 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -76,7 +76,7 @@ static int __devinit create_gpio_led(const struct gpio_led *template,
 	struct gpio_led_data *led_dat, struct device *parent,
 	int (*blink_set)(unsigned, unsigned long *, unsigned long *))
 {
-	int ret;
+	int ret, state;
 
 	/* skip leds that aren't available */
 	if (!gpio_is_valid(template->gpio)) {
@@ -99,11 +99,15 @@ static int __devinit create_gpio_led(const struct gpio_led *template,
 		led_dat->cdev.blink_set = gpio_blink_set;
 	}
 	led_dat->cdev.brightness_set = gpio_led_set;
-	led_dat->cdev.brightness = LED_OFF;
+	if (template->default_state == LEDS_GPIO_DEFSTATE_KEEP)
+		state = !!gpio_get_value(led_dat->gpio) ^ led_dat->active_low;
+	else
+		state = (template->default_state == LEDS_GPIO_DEFSTATE_ON);
+	led_dat->cdev.brightness = state ? LED_FULL : LED_OFF;
 	if (!template->retain_state_suspended)
 		led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME;
 
-	ret = gpio_direction_output(led_dat->gpio, led_dat->active_low);
+	ret = gpio_direction_output(led_dat->gpio, led_dat->active_low ^ state);
 	if (ret < 0)
 		goto err;
 
@@ -223,12 +227,22 @@ static int __devinit of_gpio_leds_probe(struct of_device *ofdev,
 	memset(&led, 0, sizeof(led));
 	for_each_child_of_node(np, child) {
 		enum of_gpio_flags flags;
+		const char *state;
 
 		led.gpio = of_get_gpio_flags(child, 0, &flags);
 		led.active_low = flags & OF_GPIO_ACTIVE_LOW;
 		led.name = of_get_property(child, "label", NULL) ? : child->name;
 		led.default_trigger =
 			of_get_property(child, "linux,default-trigger", NULL);
+		state = of_get_property(child, "default-state", NULL);
+		if (state) {
+			if (!strcmp(state, "keep"))
+				led.default_state = LEDS_GPIO_DEFSTATE_KEEP;
+			else if(!strcmp(state, "on"))
+				led.default_state = LEDS_GPIO_DEFSTATE_ON;
+			else
+				led.default_state = LEDS_GPIO_DEFSTATE_OFF;
+		}
 
 		ret = create_gpio_led(&led, &pdata->led_data[pdata->num_leds++],
 				      &ofdev->dev, NULL);
diff --git a/include/linux/leds.h b/include/linux/leds.h
index c7f0b148df06..62af62915cf7 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -143,9 +143,14 @@ struct gpio_led {
 	const char *name;
 	const char *default_trigger;
 	unsigned 	gpio;
-	u8 		active_low : 1;
-	u8		retain_state_suspended : 1;
+	unsigned	active_low : 1;
+	unsigned	retain_state_suspended : 1;
+	unsigned	default_state : 2;
+	/* default_state should be one of LEDS_GPIO_DEFSTATE_(ON|OFF|KEEP) */
 };
+#define LEDS_GPIO_DEFSTATE_OFF	0
+#define LEDS_GPIO_DEFSTATE_ON	1
+#define LEDS_GPIO_DEFSTATE_KEEP	2
 
 struct gpio_led_platform_data {
 	int 		num_leds;
-- 
cgit v1.2.3-71-gd317


From a1dd8c617217322614f0465ae347895c4b58e1ab Mon Sep 17 00:00:00 2001
From: Richard Purdie <rpurdie@linux.intel.com>
Date: Mon, 22 Jun 2009 14:54:13 +0100
Subject: leds: Further document blink_set

Further document blink_set function pointer

Signed-off-by: Richard Purdie <rpurdie@linux.intel.com>
---
 include/linux/leds.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 62af62915cf7..d8bf9665e70c 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -47,7 +47,8 @@ struct led_classdev {
 
 	/* Activate hardware accelerated blink, delays are in
 	 * miliseconds and if none is provided then a sensible default
-	 * should be chosen. */
+	 * should be chosen. The call can adjust the timings if it can't
+	 * match the values specified exactly. */
 	int		(*blink_set)(struct led_classdev *led_cdev,
 				     unsigned long *delay_on,
 				     unsigned long *delay_off);
-- 
cgit v1.2.3-71-gd317


From a5c9b696ec109bb54d547fdb437a7a0c2d514670 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Tue, 23 Jun 2009 12:36:58 -0700
Subject: mm: pass mm to grab_swap_token

If a kthread happens to use get_user_pages() on an mm (as KSM does),
there's a chance that it will end up trying to read in a swap page, then
oops in grab_swap_token() because the kthread has no mm: GUP passes down
the right mm, so grab_swap_token() ought to be using it.

We have not identified a stronger case than KSM's daemon (not yet in
mainline), but the issue must have come up before, since RHEL has included
a fix for this for years (though a different fix, they just back out of
grab_swap_token if current->mm is unset: which is what we first proposed,
but using the right mm here seems more correct).

Reported-by: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 12 ++++++------
 mm/memory.c          |  2 +-
 mm/thrash.c          | 32 +++++++++++++++-----------------
 3 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c88b36665f79..7c15334f3ff2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -298,8 +298,8 @@ extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
-extern struct mm_struct * swap_token_mm;
-extern void grab_swap_token(void);
+extern struct mm_struct *swap_token_mm;
+extern void grab_swap_token(struct mm_struct *);
 extern void __put_swap_token(struct mm_struct *);
 
 static inline int has_swap_token(struct mm_struct *mm)
@@ -419,10 +419,10 @@ static inline swp_entry_t get_swap_page(void)
 }
 
 /* linux/mm/thrash.c */
-#define put_swap_token(x) do { } while(0)
-#define grab_swap_token()  do { } while(0)
-#define has_swap_token(x) 0
-#define disable_swap_token() do { } while(0)
+#define put_swap_token(mm)	do { } while (0)
+#define grab_swap_token(mm)	do { } while (0)
+#define has_swap_token(mm)	0
+#define disable_swap_token()	do { } while (0)
 
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
diff --git a/mm/memory.c b/mm/memory.c
index 50da9511aa77..f46ac18ba231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2519,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
-		grab_swap_token(); /* Contend for token _before_ read-in */
+		grab_swap_token(mm); /* Contend for token _before_ read-in */
 		page = swapin_readahead(entry,
 					GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!page) {
diff --git a/mm/thrash.c b/mm/thrash.c
index c4c5205a9c35..2372d4ed5dd8 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock);
 struct mm_struct *swap_token_mm;
 static unsigned int global_faults;
 
-void grab_swap_token(void)
+void grab_swap_token(struct mm_struct *mm)
 {
 	int current_interval;
 
 	global_faults++;
 
-	current_interval = global_faults - current->mm->faultstamp;
+	current_interval = global_faults - mm->faultstamp;
 
 	if (!spin_trylock(&swap_token_lock))
 		return;
 
 	/* First come first served */
 	if (swap_token_mm == NULL) {
-		current->mm->token_priority = current->mm->token_priority + 2;
-		swap_token_mm = current->mm;
+		mm->token_priority = mm->token_priority + 2;
+		swap_token_mm = mm;
 		goto out;
 	}
 
-	if (current->mm != swap_token_mm) {
-		if (current_interval < current->mm->last_interval)
-			current->mm->token_priority++;
+	if (mm != swap_token_mm) {
+		if (current_interval < mm->last_interval)
+			mm->token_priority++;
 		else {
-			if (likely(current->mm->token_priority > 0))
-				current->mm->token_priority--;
+			if (likely(mm->token_priority > 0))
+				mm->token_priority--;
 		}
 		/* Check if we deserve the token */
-		if (current->mm->token_priority >
-				swap_token_mm->token_priority) {
-			current->mm->token_priority += 2;
-			swap_token_mm = current->mm;
+		if (mm->token_priority > swap_token_mm->token_priority) {
+			mm->token_priority += 2;
+			swap_token_mm = mm;
 		}
 	} else {
 		/* Token holder came in again! */
-		current->mm->token_priority += 2;
+		mm->token_priority += 2;
 	}
 
 out:
-	current->mm->faultstamp = global_faults;
-	current->mm->last_interval = current_interval;
+	mm->faultstamp = global_faults;
+	mm->last_interval = current_interval;
 	spin_unlock(&swap_token_lock);
-return;
 }
 
 /* Called on process exit. */
-- 
cgit v1.2.3-71-gd317


From 01ff53f416757da416413bc32229770a8448b6ef Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 23 Jun 2009 12:37:01 -0700
Subject: rmap: fixup page_referenced() for nommu systems

After the recent changes that went into mm/vmscan.c to overhaul stuff, we
ended up with these warnings on no-mmu systems:

  mm/vmscan.c: In function `shrink_page_list':
  mm/vmscan.c:580: warning: unused variable `vm_flags'
  mm/vmscan.c: In function `shrink_active_list':
  mm/vmscan.c:1294: warning: `vm_flags' may be used uninitialized in this function
  mm/vmscan.c:1242: note: `vm_flags' was declared here

This is because the no-mmu function defines page_referenced() to work on
the first argument only (the page).  It does not clear the vm_flags given
to it because for no-mmu systems, they never actually get utilized.  Since
that is no longer strictly true, we need to set vm_flags to 0 like
everyone else so gcc can do proper dead code elimination without annoying
us with unused warnings.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: David Howells <dhowells@redhat.com>
Acked-by: David McCullough <davidm@snapgear.com>
Cc: Greg Ungerer <gerg@uclinux.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 216d024f830d..bf116d0dbf23 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -118,7 +118,14 @@ int try_to_munlock(struct page *);
 #define anon_vma_prepare(vma)	(0)
 #define anon_vma_link(vma)	do {} while (0)
 
-#define page_referenced(page, locked, cnt, flags) TestClearPageReferenced(page)
+static inline int page_referenced(struct page *page, int is_locked,
+				  struct mem_cgroup *cnt,
+				  unsigned long *vm_flags)
+{
+	*vm_flags = 0;
+	return TestClearPageReferenced(page);
+}
+
 #define try_to_unmap(page, refs) SWAP_FAIL
 
 static inline int page_mkclean(struct page *page)
-- 
cgit v1.2.3-71-gd317


From f007e99c8e2e322b8331aba72414715119a2920d Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Sat, 23 May 2009 00:41:15 +0800
Subject: Intel-IOMMU, intr-remap: source-id checking

To support domain-isolation usages, the platform hardware must be
capable of uniquely identifying the requestor (source-id) for each
interrupt message. Without source-id checking for interrupt remapping,
a rogue guest/VM with assigned devices can launch interrupt attacks
to bring down another guest/VM or the VMM itself.

This patch adds source-id checking for interrupt remapping, and then
really isolates interrupts for guests/VMs with assigned devices.

Because the PCI subsystem is not yet initialized when IOAPIC entries
are set up, use read_pci_config_byte to access PCI config space directly.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 arch/x86/kernel/apic/io_apic.c |   6 +++
 drivers/pci/intr_remapping.c   | 120 +++++++++++++++++++++++++++++++++++++++--
 drivers/pci/intr_remapping.h   |   2 +
 include/linux/dmar.h           |  11 ++++
 4 files changed, 136 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b7a79207295e..4d0216fcb36c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1414,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq,
 		irte.vector = vector;
 		irte.dest_id = IRTE_DEST(destination);
 
+		/* Set source-id of interrupt request */
+		set_ioapic_sid(&irte, apic_id);
+
 		modify_irte(irq, &irte);
 
 		ir_entry->index2 = (index >> 15) & 0x1;
@@ -3290,6 +3293,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 		irte.vector = cfg->vector;
 		irte.dest_id = IRTE_DEST(dest);
 
+		/* Set source-id of interrupt request */
+		set_msi_sid(&irte, pdev);
+
 		modify_irte(irq, &irte);
 
 		msg->address_hi = MSI_ADDR_BASE_HI;
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 44025a0c2bb6..4f5b8712931f 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -10,6 +10,8 @@
 #include <linux/intel-iommu.h>
 #include "intr_remapping.h"
 #include <acpi/acpi.h>
+#include <asm/pci-direct.h>
+#include "pci.h"
 
 static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
 static int ir_ioapic_num;
@@ -418,6 +420,91 @@ int free_irte(int irq)
 	return rc;
 }
 
+/*
+ * source validation type
+ */
+#define SVT_NO_VERIFY		0x0  /* no verification is required */
+#define SVT_VERIFY_SID_SQ	0x1  /* verify using SID and SQ fiels */
+#define SVT_VERIFY_BUS		0x2  /* verify bus of request-id */
+
+/*
+ * source-id qualifier
+ */
+#define SQ_ALL_16	0x0  /* verify all 16 bits of request-id */
+#define SQ_13_IGNORE_1	0x1  /* verify most significant 13 bits, ignore
+			      * the third least significant bit
+			      */
+#define SQ_13_IGNORE_2	0x2  /* verify most significant 13 bits, ignore
+			      * the second and third least significant bits
+			      */
+#define SQ_13_IGNORE_3	0x3  /* verify most significant 13 bits, ignore
+			      * the least three significant bits
+			      */
+
+/*
+ * set SVT, SQ and SID fields of irte to verify
+ * source ids of interrupt requests
+ */
+static void set_irte_sid(struct irte *irte, unsigned int svt,
+			 unsigned int sq, unsigned int sid)
+{
+	irte->svt = svt;
+	irte->sq = sq;
+	irte->sid = sid;
+}
+
+int set_ioapic_sid(struct irte *irte, int apic)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	for (i = 0; i < MAX_IO_APICS; i++) {
+		if (ir_ioapic[i].id == apic) {
+			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+			break;
+		}
+	}
+
+	if (sid == 0) {
+		pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic);
+		return -1;
+	}
+
+	set_irte_sid(irte, 1, 0, sid);
+
+	return 0;
+}
+
+int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+	struct pci_dev *bridge;
+
+	if (!irte || !dev)
+		return -1;
+
+	/* PCIe device or Root Complex integrated PCI device */
+	if (dev->is_pcie || !dev->bus->parent) {
+		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+			     (dev->bus->number << 8) | dev->devfn);
+		return 0;
+	}
+
+	bridge = pci_find_upstream_pcie_bridge(dev);
+	if (bridge) {
+		if (bridge->is_pcie) /* this is a PCIE-to-PCI/PCIX bridge */
+			set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
+				(bridge->bus->number << 8) | dev->bus->number);
+		else /* this is a legacy PCI bridge */
+			set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+				(bridge->bus->number << 8) | bridge->devfn);
+	}
+
+	return 0;
+}
+
 static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
 {
 	u64 addr;
@@ -624,6 +711,35 @@ error:
 	return -1;
 }
 
+static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+
+	ir_ioapic[ir_ioapic_num].bus   = bus;
+	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_ioapic[ir_ioapic_num].iommu = iommu;
+	ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
+	ir_ioapic_num++;
+}
+
 static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
 				 struct intel_iommu *iommu)
 {
@@ -648,9 +764,7 @@ static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
 			       " 0x%Lx\n", scope->enumeration_id,
 			       drhd->address);
 
-			ir_ioapic[ir_ioapic_num].iommu = iommu;
-			ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
-			ir_ioapic_num++;
+			ir_parse_one_ioapic_scope(scope, iommu);
 		}
 		start += scope->length;
 	}
diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h
index ca48f0df8ac9..63a263c18415 100644
--- a/drivers/pci/intr_remapping.h
+++ b/drivers/pci/intr_remapping.h
@@ -3,6 +3,8 @@
 struct ioapic_scope {
 	struct intel_iommu *iommu;
 	unsigned int id;
+	unsigned int bus;	/* PCI bus number */
+	unsigned int devfn;	/* PCI devfn number */
 };
 
 #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 1731fb5fd775..4a2b162c256a 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -126,6 +126,8 @@ extern int free_irte(int irq);
 extern int irq_remapped(int irq);
 extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
 extern struct intel_iommu *map_ioapic_to_ir(int apic);
+extern int set_ioapic_sid(struct irte *irte, int apic);
+extern int set_msi_sid(struct irte *irte, struct pci_dev *dev);
 #else
 static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 {
@@ -156,6 +158,15 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic)
 {
 	return NULL;
 }
+static inline int set_ioapic_sid(struct irte *irte, int apic)
+{
+	return 0;
+}
+static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+	return 0;
+}
+
 #define irq_remapped(irq)		(0)
 #define enable_intr_remapping(mode)	(-1)
 #define disable_intr_remapping()	(0)
-- 
cgit v1.2.3-71-gd317


From 9d9609851003ebed15957f0f2ce18492739ee124 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 14:31:37 -0400
Subject: Audit: clean up all op= output to include string quoting

In a number of places in the audit system we send an op= followed by a string
that includes spaces.  Somehow this works but it's just wrong.  This patch
moves all of those that I could find to be quoted.

Example:

Change From: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1
subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op=remove rule
key="number2" list=4 res=0

Change To: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1
subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op="remove rule"
key="number2" list=4 res=0

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h |  3 +++
 kernel/audit.c        |  9 +++++++++
 kernel/audit_tree.c   | 10 ++++------
 kernel/audit_watch.c  |  6 +-----
 kernel/auditfilter.c  | 12 +++++-------
 kernel/auditsc.c      |  8 ++------
 6 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 4fa2810b675e..3c7a358241a7 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -599,6 +599,8 @@ extern void		    audit_log_untrustedstring(struct audit_buffer *ab,
 extern void		    audit_log_d_path(struct audit_buffer *ab,
 					     const char *prefix,
 					     struct path *path);
+extern void		    audit_log_key(struct audit_buffer *ab,
+					  char *key);
 extern void		    audit_log_lost(const char *message);
 extern int		    audit_update_lsm_rules(void);
 
@@ -621,6 +623,7 @@ extern int audit_enabled;
 #define audit_log_n_untrustedstring(a,n,s) do { ; } while (0)
 #define audit_log_untrustedstring(a,s) do { ; } while (0)
 #define audit_log_d_path(b, p, d) do { ; } while (0)
+#define audit_log_key(b, k) do { ; } while (0)
 #define audit_enabled 0
 #endif
 #endif
diff --git a/kernel/audit.c b/kernel/audit.c
index e07ad2340dbe..6194c50e2039 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1450,6 +1450,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	kfree(pathname);
 }
 
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+	audit_log_format(ab, " key=");
+	if (key)
+		audit_log_untrustedstring(ab, key);
+	else
+		audit_log_format(ab, "(null)");
+}
+
 /**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..3ff0731284a1 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -441,13 +441,11 @@ static void kill_rules(struct audit_tree *tree)
 		if (rule->tree) {
 			/* not a half-baked one */
 			ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-			audit_log_format(ab, "op=remove rule dir=");
+			audit_log_format(ab, "op=");
+			audit_log_string(ab, "remove rule");
+			audit_log_format(ab, " dir=");
 			audit_log_untrustedstring(ab, rule->tree->pathname);
-			if (rule->filterkey) {
-				audit_log_format(ab, " key=");
-				audit_log_untrustedstring(ab, rule->filterkey);
-			} else
-				audit_log_format(ab, " key=(null)");
+			audit_log_key(ab, rule->filterkey);
 			audit_log_format(ab, " list=%d res=1", rule->listnr);
 			audit_log_end(ab);
 			rule->tree = NULL;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index b49ab019fdff..0e96dbc60ea9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -234,11 +234,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
 		audit_log_string(ab, op);
 		audit_log_format(ab, " path=");
 		audit_log_untrustedstring(ab, w->path);
-		if (r->filterkey) {
-			audit_log_format(ab, " key=");
-			audit_log_untrustedstring(ab, r->filterkey);
-		} else
-			audit_log_format(ab, " key=(null)");
+		audit_log_key(ab, r->filterkey);
 		audit_log_format(ab, " list=%d res=1", r->listnr);
 		audit_log_end(ab);
 	}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 21b623595aad..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1079,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
 			security_release_secctx(ctx, len);
 		}
 	}
-	audit_log_format(ab, " op=%s rule key=", action);
-	if (rule->filterkey)
-		audit_log_untrustedstring(ab, rule->filterkey);
-	else
-		audit_log_format(ab, "(null)");
+	audit_log_format(ab, " op=");
+	audit_log_string(ab, action);
+	audit_log_key(ab, rule->filterkey);
 	audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
 	audit_log_end(ab);
 }
@@ -1147,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 			return PTR_ERR(entry);
 
 		err = audit_add_rule(entry);
-		audit_log_rule_change(loginuid, sessionid, sid, "add",
+		audit_log_rule_change(loginuid, sessionid, sid, "add rule",
 				      &entry->rule, !err);
 
 		if (err)
@@ -1163,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 			return PTR_ERR(entry);
 
 		err = audit_del_rule(entry);
-		audit_log_rule_change(loginuid, sessionid, sid, "remove",
+		audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
 				      &entry->rule, !err);
 
 		audit_free_rule(entry);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 0b862cac6ca2..2de95d1582bc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1137,7 +1137,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		if (has_cntl)
 			audit_log_n_hex(*ab, buf, to_send);
 		else
-			audit_log_format(*ab, "\"%s\"", buf);
+			audit_log_string(*ab, buf);
 
 		p += to_send;
 		len_left -= to_send;
@@ -1372,11 +1372,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 
 	audit_log_task_info(ab, tsk);
-	if (context->filterkey) {
-		audit_log_format(ab, " key=");
-		audit_log_untrustedstring(ab, context->filterkey);
-	} else
-		audit_log_format(ab, " key=(null)");
+	audit_log_key(ab, context->filterkey);
 	audit_log_end(ab);
 
 	for (aux = context->aux; aux; aux = aux->next) {
-- 
cgit v1.2.3-71-gd317


From 346c17a6cf60375323adfaa4b8a9d841049f890e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 22 Jun 2009 07:38:26 +0000
Subject: ide: relax DMA info validity checking

There are some broken devices that report multiple DMA xfer modes
enabled at once (ATA spec doesn't allow it) but otherwise work fine
with DMA so just delete ide_id_dma_bug().

[ As discovered by detective work by Frans and Bart, due to how
  handling of the ID block was handled before commit c419993
  ("ide-iops: only clear DMA words on setting DMA mode") this
  check was always seeing zeros in the fields or other similar
  garbage.  Therefore this check wasn't actually checking anything.
  Now that the tests actually check the real bits, all we see are
  devices that trigger the check yet work perfectly fine, therefore
  killing this useless check is the best thing to do. -DaveM ]

Reported-by: Frans Pop <elendil@planet.nl>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ide/ide-dma.c  | 21 ---------------------
 drivers/ide/ide-iops.c |  3 ---
 include/linux/ide.h    |  2 --
 3 files changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 219e6fb78dc6..ee58c88dee5a 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -361,9 +361,6 @@ static int ide_tune_dma(ide_drive_t *drive)
 	if (__ide_dma_bad_drive(drive))
 		return 0;
 
-	if (ide_id_dma_bug(drive))
-		return 0;
-
 	if (hwif->host_flags & IDE_HFLAG_TRUST_BIOS_FOR_DMA)
 		return config_drive_for_dma(drive);
 
@@ -394,24 +391,6 @@ static int ide_dma_check(ide_drive_t *drive)
 	return -1;
 }
 
-int ide_id_dma_bug(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-
-	if (id[ATA_ID_FIELD_VALID] & 4) {
-		if ((id[ATA_ID_UDMA_MODES] >> 8) &&
-		    (id[ATA_ID_MWDMA_MODES] >> 8))
-			goto err_out;
-	} else if ((id[ATA_ID_MWDMA_MODES] >> 8) &&
-		   (id[ATA_ID_SWDMA_MODES] >> 8))
-		goto err_out;
-
-	return 0;
-err_out:
-	printk(KERN_ERR "%s: bad DMA info in identify block\n", drive->name);
-	return 1;
-}
-
 int ide_set_dma(ide_drive_t *drive)
 {
 	int rc;
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index fa047150a1c6..917186ec4966 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -329,9 +329,6 @@ int ide_driveid_update(ide_drive_t *drive)
 
 	kfree(id);
 
-	if ((drive->dev_flags & IDE_DFLAG_USING_DMA) && ide_id_dma_bug(drive))
-		ide_dma_off(drive);
-
 	return 1;
 out_err:
 	if (rc == 2)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 95c6e00a72e8..cf1f3888067c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1361,7 +1361,6 @@ int ide_in_drive_list(u16 *, const struct drive_list_entry *);
 #ifdef CONFIG_BLK_DEV_IDEDMA
 int ide_dma_good_drive(ide_drive_t *);
 int __ide_dma_bad_drive(ide_drive_t *);
-int ide_id_dma_bug(ide_drive_t *);
 
 u8 ide_find_dma_mode(ide_drive_t *, u8);
 
@@ -1402,7 +1401,6 @@ void ide_dma_lost_irq(ide_drive_t *);
 ide_startstop_t ide_dma_timeout_retry(ide_drive_t *, int);
 
 #else
-static inline int ide_id_dma_bug(ide_drive_t *drive) { return 0; }
 static inline u8 ide_find_dma_mode(ide_drive_t *drive, u8 speed) { return 0; }
 static inline u8 ide_max_dma_mode(ide_drive_t *drive) { return 0; }
 static inline void ide_dma_off_quietly(ide_drive_t *drive) { ; }
-- 
cgit v1.2.3-71-gd317


From 507e123151149e578c9aae33eb876c49824da5f8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 23 Jun 2009 17:38:15 +0200
Subject: timer stats: Optimize by adding quick check to avoid function calls

When the kernel is configured with CONFIG_TIMER_STATS but timer
stats are runtime disabled we still get calls to
__timer_stats_timer_set_start_info which initializes some
fields in the corresponding struct timer_list.

So add some quick checks in the timer stats setup functions
to avoid function calls to __timer_stats_timer_set_start_info
when timer stats are disabled.

In an artificial workload that does nothing but playing ping
pong with a single tcp packet via loopback this decreases cpu
consumption by 1 - 1.5%.

This is part of a modified function trace output on SLES11:

 perl-2497  [00] 28630647177732388 [+  125]: sk_reset_timer <-tcp_v4_rcv
 perl-2497  [00] 28630647177732513 [+  125]: mod_timer <-sk_reset_timer
 perl-2497  [00] 28630647177732638 [+  125]: __timer_stats_timer_set_start_info <-mod_timer
 perl-2497  [00] 28630647177732763 [+  125]: __mod_timer <-mod_timer
 perl-2497  [00] 28630647177732888 [+  125]: __timer_stats_timer_set_start_info <-__mod_timer
 perl-2497  [00] 28630647177733013 [+   93]: lock_timer_base <-__mod_timer

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mustafa Mesanovic <mustafa.mesanovic@de.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>
LKML-Reference: <20090623153811.GA4641@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h   |  5 +++++
 include/linux/timer.h     |  4 ++++
 kernel/time/timer_stats.c | 16 ++++++++--------
 kernel/timer.c            |  2 ++
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 7400900de94a..54648e625efd 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -21,6 +21,7 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/percpu.h>
+#include <linux/timer.h>
 
 
 struct hrtimer_clock_base;
@@ -447,6 +448,8 @@ extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 
 static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
 {
+	if (likely(!timer->start_pid))
+		return;
 	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
 				 timer->function, timer->start_comm, 0);
 }
@@ -456,6 +459,8 @@ extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer,
 
 static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
 {
+	if (likely(!timer_stats_active))
+		return;
 	__timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0));
 }
 
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ccf882eed8f8..be62ec2ebea5 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -190,6 +190,8 @@ extern unsigned long get_next_timer_interrupt(unsigned long now);
  */
 #ifdef CONFIG_TIMER_STATS
 
+extern int timer_stats_active;
+
 #define TIMER_STATS_FLAG_DEFERRABLE	0x1
 
 extern void init_timer_stats(void);
@@ -203,6 +205,8 @@ extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
 
 static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
 {
+	if (likely(!timer_stats_active))
+		return;
 	__timer_stats_timer_set_start_info(timer, __builtin_return_address(0));
 }
 
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
 /*
  * Collection status, active/inactive:
  */
-static int __read_mostly active;
+int __read_mostly timer_stats_active;
 
 /*
  * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	struct entry *entry, input;
 	unsigned long flags;
 
-	if (likely(!active))
+	if (likely(!timer_stats_active))
 		return;
 
 	lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	input.timer_flag = timer_flag;
 
 	spin_lock_irqsave(lock, flags);
-	if (!active)
+	if (!timer_stats_active)
 		goto out_unlock;
 
 	entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
 	/*
 	 * If still active then calculate up to now:
 	 */
-	if (active)
+	if (timer_stats_active)
 		time_stop = ktime_get();
 
 	time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
 	mutex_lock(&show_mutex);
 	switch (ctl[0]) {
 	case '0':
-		if (active) {
-			active = 0;
+		if (timer_stats_active) {
+			timer_stats_active = 0;
 			time_stop = ktime_get();
 			sync_access();
 		}
 		break;
 	case '1':
-		if (!active) {
+		if (!timer_stats_active) {
 			reset_entries();
 			time_start = ktime_get();
 			smp_mb();
-			active = 1;
+			timer_stats_active = 1;
 		}
 		break;
 	default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..0b36b9e5cc8b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
 {
 	unsigned int flag = 0;
 
+	if (likely(!timer->start_site))
+		return;
 	if (unlikely(tbase_get_deferrable(timer->base)))
 		flag |= TIMER_STATS_FLAG_DEFERRABLE;
 
-- 
cgit v1.2.3-71-gd317


From 3e63cbb1efca7dd3137de1bb475e2e068e38ef23 Mon Sep 17 00:00:00 2001
From: Ankit Jain <me@ankitjain.org>
Date: Fri, 19 Jun 2009 14:28:07 -0400
Subject: fs: Add new pre-allocation ioctls to vfs for compatibility with
 legacy xfs ioctls

This patch adds ioctls to vfs for compatibility with legacy XFS
pre-allocation ioctls (XFS_IOC_*RESVP*). The implementation
effectively invokes sys_fallocate for the new ioctls.
It also handles the compat_ioctl case.
Note: These legacy ioctls are also implemented by OCFS2.

[AV: folded fixes from hch]

Signed-off-by: Ankit Jain <me@ankitjain.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat_ioctl.c      | 48 +++++++++++++++++++++++++++++++++++++++++
 fs/ioctl.c             | 35 ++++++++++++++++++++++++++++++
 fs/open.c              | 58 +++++++++++++++++++++++++-------------------------
 include/linux/falloc.h | 21 ++++++++++++++++++
 include/linux/fs.h     |  6 ++++++
 5 files changed, 139 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c135202c38b3..626c7483b4de 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -31,6 +31,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/vt.h>
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/ppp_defs.h>
@@ -1779,6 +1780,41 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return sys_ioctl(fd, cmd, (unsigned long)tn);
 }
 
+/* on ia32 l_start is on a 32-bit boundary */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+struct space_resv_32 {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start	__attribute__((packed));
+			/* len == 0 means until end of file */
+	__s64		l_len __attribute__((packed));
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area */
+};
+
+#define FS_IOC_RESVSP_32		_IOW ('X', 40, struct space_resv_32)
+#define FS_IOC_RESVSP64_32	_IOW ('X', 42, struct space_resv_32)
+
+/* just account for different alignment */
+static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
+{
+	struct space_resv_32	__user *p32 = (void __user *)arg;
+	struct space_resv	__user *p = compat_alloc_user_space(sizeof(*p));
+
+	if (copy_in_user(&p->l_type,	&p32->l_type,	sizeof(s16)) ||
+	    copy_in_user(&p->l_whence,	&p32->l_whence, sizeof(s16)) ||
+	    copy_in_user(&p->l_start,	&p32->l_start,	sizeof(s64)) ||
+	    copy_in_user(&p->l_len,	&p32->l_len,	sizeof(s64)) ||
+	    copy_in_user(&p->l_sysid,	&p32->l_sysid,	sizeof(s32)) ||
+	    copy_in_user(&p->l_pid,	&p32->l_pid,	sizeof(u32)) ||
+	    copy_in_user(&p->l_pad,	&p32->l_pad,	4*sizeof(u32)))
+		return -EFAULT;
+
+	return ioctl_preallocate(file, p);
+}
+#endif
+
 
 typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int,
 					unsigned long, struct file *);
@@ -2756,6 +2792,18 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 	case FIOQSIZE:
 		break;
 
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+	case FS_IOC_RESVSP_32:
+	case FS_IOC_RESVSP64_32:
+		error = compat_ioctl_preallocate(filp, arg);
+		goto out_fput;
+#else
+	case FS_IOC_RESVSP:
+	case FS_IOC_RESVSP64:
+		error = ioctl_preallocate(filp, (void __user *)arg);
+		goto out_fput;
+#endif
+
 	case FIBMAP:
 	case FIGETBSZ:
 	case FIONREAD:
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 001f8d3118f2..5612880fcbe7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/uaccess.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
+#include <linux/falloc.h>
 
 #include <asm/ioctls.h>
 
@@ -403,6 +404,37 @@ EXPORT_SYMBOL(generic_block_fiemap);
 
 #endif  /*  CONFIG_BLOCK  */
 
+/*
+ * This provides compatibility with legacy XFS pre-allocation ioctls
+ * which predate the fallocate syscall.
+ *
+ * Only the l_start, l_len and l_whence fields of the 'struct space_resv'
+ * are used here; the rest are ignored.
+ */
+int ioctl_preallocate(struct file *filp, void __user *argp)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct space_resv sr;
+
+	if (copy_from_user(&sr, argp, sizeof(sr)))
+		return -EFAULT;
+
+	switch (sr.l_whence) {
+	case SEEK_SET:
+		break;
+	case SEEK_CUR:
+		sr.l_start += filp->f_pos;
+		break;
+	case SEEK_END:
+		sr.l_start += i_size_read(inode);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
+}
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -414,6 +446,9 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
 		return ioctl_fibmap(filp, p);
 	case FIONREAD:
 		return put_user(i_size_read(inode) - filp->f_pos, p);
+	case FS_IOC_RESVSP:
+	case FS_IOC_RESVSP64:
+		return ioctl_preallocate(filp, p);
 	}
 
 	return vfs_ioctl(filp, cmd, arg);
diff --git a/fs/open.c b/fs/open.c
index 7200e23d9258..dd98e8076024 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -378,63 +378,63 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
 #endif
 #endif /* BITS_PER_LONG == 32 */
 
-SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
+
+int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
-	struct file *file;
-	struct inode *inode;
-	long ret = -EINVAL;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	long ret;
 
 	if (offset < 0 || len <= 0)
-		goto out;
+		return -EINVAL;
 
 	/* Return error if mode is not supported */
-	ret = -EOPNOTSUPP;
 	if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
-		goto out;
+		return -EOPNOTSUPP;
 
-	ret = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
 	if (!(file->f_mode & FMODE_WRITE))
-		goto out_fput;
+		return -EBADF;
 	/*
 	 * Revalidate the write permissions, in case security policy has
 	 * changed since the files were opened.
 	 */
 	ret = security_file_permission(file, MAY_WRITE);
 	if (ret)
-		goto out_fput;
+		return ret;
 
-	inode = file->f_path.dentry->d_inode;
-
-	ret = -ESPIPE;
 	if (S_ISFIFO(inode->i_mode))
-		goto out_fput;
+		return -ESPIPE;
 
-	ret = -ENODEV;
 	/*
 	 * Let individual file system decide if it supports preallocation
 	 * for directories or not.
 	 */
 	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
-		goto out_fput;
+		return -ENODEV;
 
-	ret = -EFBIG;
 	/* Check for wrap through zero too */
 	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
-		goto out_fput;
+		return -EFBIG;
 
-	if (inode->i_op->fallocate)
-		ret = inode->i_op->fallocate(inode, mode, offset, len);
-	else
-		ret = -EOPNOTSUPP;
+	if (!inode->i_op->fallocate)
+		return -EOPNOTSUPP;
 
-out_fput:
-	fput(file);
-out:
-	return ret;
+	return inode->i_op->fallocate(inode, mode, offset, len);
 }
+
+SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
+{
+	struct file *file;
+	int error = -EBADF;
+
+	file = fget(fd);
+	if (file) {
+		error = do_fallocate(file, mode, offset, len);
+		fput(file);
+	}
+
+	return error;
+}
+
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
 {
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 8e912ab6a072..3c155107d61f 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -3,4 +3,25 @@
 
 #define FALLOC_FL_KEEP_SIZE	0x01 /* default is extend size */
 
+#ifdef __KERNEL__
+
+/*
+ * Space reservation ioctls and argument structure
+ * are designed to be compatible with the legacy XFS ioctls.
+ */
+struct space_resv {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserved area */
+};
+
+#define FS_IOC_RESVSP		_IOW('X', 40, struct space_resv)
+#define FS_IOC_RESVSP64		_IOW('X', 42, struct space_resv)
+
+#endif /* __KERNEL__ */
+
 #endif /* _FALLOC_H_ */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1ff5e4e01952..79e302ddde0f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1906,6 +1906,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 
 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
 		       struct file *filp);
+extern int do_fallocate(struct file *file, int mode, loff_t offset,
+			loff_t len);
 extern long do_sys_open(int dfd, const char __user *filename, int flags,
 			int mode);
 extern struct file *filp_open(const char *, int, int);
@@ -1914,6 +1916,10 @@ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
 extern int filp_close(struct file *, fl_owner_t id);
 extern char * getname(const char __user *);
 
+/* fs/ioctl.c */
+
+extern int ioctl_preallocate(struct file *filp, void __user *argp);
+
 /* fs/dcache.c */
 extern void __init vfs_caches_init_early(void);
 extern void __init vfs_caches_init(unsigned long);
-- 
cgit v1.2.3-71-gd317


From f19d4a8fa6f9b6ccf54df0971c97ffcaa390b7b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 19:50:45 -0400
Subject: add caching of ACLs in struct inode

No helpers, no conversions yet.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 10 ++++++++++
 include/linux/fs.h |  7 +++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index f643be565df8..e193cd592fa8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,7 @@
 #include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
+#include <linux/posix_acl.h>
 
 /*
  * This is needed for the following functions:
@@ -189,6 +190,9 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	}
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
+#ifdef CONFIG_FS_POSIX_ACL
+	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+#endif
 
 #ifdef CONFIG_FSNOTIFY
 	inode->i_fsnotify_mask = 0;
@@ -227,6 +231,12 @@ void destroy_inode(struct inode *inode)
 	ima_inode_free(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
+#ifdef CONFIG_FS_POSIX_ACL
+	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_acl);
+	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_default_acl);
+#endif
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 79e302ddde0f..0872372184fe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -710,6 +710,9 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 #define i_size_ordered_init(inode) do { } while (0)
 #endif
 
+struct posix_acl;
+#define ACL_NOT_CACHED ((void *)(-1))
+
 struct inode {
 	struct hlist_node	i_hash;
 	struct list_head	i_list;
@@ -772,6 +775,10 @@ struct inode {
 	atomic_t		i_writecount;
 #ifdef CONFIG_SECURITY
 	void			*i_security;
+#endif
+#ifdef CONFIG_FS_POSIX_ACL
+	struct posix_acl	*i_acl;
+	struct posix_acl	*i_default_acl;
 #endif
 	void			*i_private; /* fs or device private pointer */
 };
-- 
cgit v1.2.3-71-gd317


From 6582a0e6f6bc7bf64817b9e1a424782855292ab0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 19:53:58 -0400
Subject: switch ext3 to inode->i_acl

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/acl.c             | 22 ++++++++++------------
 fs/ext3/acl.h             |  4 ----
 fs/ext3/inode.c           |  4 ----
 fs/ext3/super.c           | 16 ----------------
 include/linux/ext3_fs_i.h |  4 ----
 5 files changed, 10 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e0c745451715..a9707689d9e1 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -134,7 +134,7 @@ ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
 	if (acl) {
 		spin_lock(&inode->i_lock);
 		acl = *i_acl;
-		if (acl != EXT3_ACL_NOT_CACHED)
+		if (acl != ACL_NOT_CACHED)
 			acl = posix_acl_dup(acl);
 		spin_unlock(&inode->i_lock);
 	}
@@ -147,7 +147,7 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
                   struct posix_acl *acl)
 {
 	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT3_ACL_NOT_CACHED)
+	if (*i_acl != ACL_NOT_CACHED)
 		posix_acl_release(*i_acl);
 	*i_acl = posix_acl_dup(acl);
 	spin_unlock(&inode->i_lock);
@@ -161,7 +161,6 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 static struct posix_acl *
 ext3_get_acl(struct inode *inode, int type)
 {
-	struct ext3_inode_info *ei = EXT3_I(inode);
 	int name_index;
 	char *value = NULL;
 	struct posix_acl *acl;
@@ -172,15 +171,15 @@ ext3_get_acl(struct inode *inode, int type)
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			acl = ext3_iget_acl(inode, &ei->i_acl);
-			if (acl != EXT3_ACL_NOT_CACHED)
+			acl = ext3_iget_acl(inode, &inode->i_acl);
+			if (acl != ACL_NOT_CACHED)
 				return acl;
 			name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
 			break;
 
 		case ACL_TYPE_DEFAULT:
-			acl = ext3_iget_acl(inode, &ei->i_default_acl);
-			if (acl != EXT3_ACL_NOT_CACHED)
+			acl = ext3_iget_acl(inode, &inode->i_default_acl);
+			if (acl != ACL_NOT_CACHED)
 				return acl;
 			name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
 			break;
@@ -206,11 +205,11 @@ ext3_get_acl(struct inode *inode, int type)
 	if (!IS_ERR(acl)) {
 		switch(type) {
 			case ACL_TYPE_ACCESS:
-				ext3_iset_acl(inode, &ei->i_acl, acl);
+				ext3_iset_acl(inode, &inode->i_acl, acl);
 				break;
 
 			case ACL_TYPE_DEFAULT:
-				ext3_iset_acl(inode, &ei->i_default_acl, acl);
+				ext3_iset_acl(inode, &inode->i_default_acl, acl);
 				break;
 		}
 	}
@@ -226,7 +225,6 @@ static int
 ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	     struct posix_acl *acl)
 {
-	struct ext3_inode_info *ei = EXT3_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
@@ -274,11 +272,11 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	if (!error) {
 		switch(type) {
 			case ACL_TYPE_ACCESS:
-				ext3_iset_acl(inode, &ei->i_acl, acl);
+				ext3_iset_acl(inode, &inode->i_acl, acl);
 				break;
 
 			case ACL_TYPE_DEFAULT:
-				ext3_iset_acl(inode, &ei->i_default_acl, acl);
+				ext3_iset_acl(inode, &inode->i_default_acl, acl);
 				break;
 		}
 	}
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 42da16b8cac0..07d15a3a5969 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -53,10 +53,6 @@ static inline int ext3_acl_count(size_t size)
 
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 
-/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT3_ACL_NOT_CACHED ((void *)-1)
-
 /* acl.c */
 extern int ext3_permission (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 05dea8132fc0..5f51fed5c750 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2752,10 +2752,6 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	ei = EXT3_I(inode);
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	ei->i_acl = EXT3_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 
 	ret = __ext3_get_inode_loc(inode, &iloc, 0);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 601e881e6105..524b349c6299 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -464,10 +464,6 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	ei->i_acl = EXT3_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
@@ -518,18 +514,6 @@ static void destroy_inodecache(void)
 static void ext3_clear_inode(struct inode *inode)
 {
 	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	if (EXT3_I(inode)->i_acl &&
-			EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_acl);
-		EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
-	}
-	if (EXT3_I(inode)->i_default_acl &&
-			EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_default_acl);
-		EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
-	}
-#endif
 	ext3_discard_reservation(inode);
 	EXT3_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 7894dd0f3b77..ca1bfe90004f 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -103,10 +103,6 @@ struct ext3_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	struct posix_acl	*i_acl;
-	struct posix_acl	*i_default_acl;
-#endif
 
 	struct list_head i_orphan;	/* unlinked but open inodes */
 
-- 
cgit v1.2.3-71-gd317


From 7a77b15d9294749809de918e24bebc39e0fbc9ab Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 21:01:13 -0400
Subject: switch reiserfs to usual conventions for caching ACLs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/super.c          | 12 ++++++------
 fs/reiserfs/xattr_acl.c      | 21 ++++++++-------------
 include/linux/reiserfs_acl.h |  4 ++--
 3 files changed, 16 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2969773cfc22..b194451fc04b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -530,8 +530,8 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	inode_init_once(&ei->vfs_inode);
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	ei->i_acl_access = NULL;
-	ei->i_acl_default = NULL;
+	ei->i_acl_access = ACL_NOT_CACHED;
+	ei->i_acl_default = ACL_NOT_CACHED;
 #endif
 }
 
@@ -586,14 +586,14 @@ static void reiserfs_clear_inode(struct inode *inode)
 	struct posix_acl *acl;
 
 	acl = REISERFS_I(inode)->i_acl_access;
-	if (acl && !IS_ERR(acl))
+	if (acl && acl != ACL_NOT_CACHED)
 		posix_acl_release(acl);
-	REISERFS_I(inode)->i_acl_access = NULL;
+	REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED;
 
 	acl = REISERFS_I(inode)->i_acl_default;
-	if (acl && !IS_ERR(acl))
+	if (acl && acl != ACL_NOT_CACHED)
 		posix_acl_release(acl);
-	REISERFS_I(inode)->i_acl_default = NULL;
+	REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED;
 }
 #else
 #define reiserfs_clear_inode NULL
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index a1a7e3530e17..7b3aeb9327d3 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -192,19 +192,19 @@ static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl,
 			    struct posix_acl *acl)
 {
 	spin_lock(&inode->i_lock);
-	if (*i_acl != ERR_PTR(-ENODATA))
+	if (*i_acl != ACL_NOT_CACHED)
 		posix_acl_release(*i_acl);
-	*i_acl = acl ? posix_acl_dup(acl) : ERR_PTR(-ENODATA);
+	*i_acl = posix_acl_dup(acl);
 	spin_unlock(&inode->i_lock);
 }
 
 static inline struct posix_acl *iget_acl(struct inode *inode,
 					 struct posix_acl **i_acl)
 {
-	struct posix_acl *acl = ERR_PTR(-ENODATA);
+	struct posix_acl *acl = ACL_NOT_CACHED;
 
 	spin_lock(&inode->i_lock);
-	if (*i_acl != ERR_PTR(-ENODATA))
+	if (*i_acl != ACL_NOT_CACHED)
 		acl = posix_acl_dup(*i_acl);
 	spin_unlock(&inode->i_lock);
 
@@ -239,15 +239,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 	}
 
 	acl = iget_acl(inode, p_acl);
-	if (acl && !IS_ERR(acl))
+	if (acl != ACL_NOT_CACHED)
 		return acl;
-	else if (PTR_ERR(acl) == -ENODATA)
-		return NULL;
 
 	size = reiserfs_xattr_get(inode, name, NULL, 0);
 	if (size < 0) {
 		if (size == -ENODATA || size == -ENOSYS) {
-			*p_acl = ERR_PTR(-ENODATA);
+			*p_acl = NULL;
 			return NULL;
 		}
 		return ERR_PTR(size);
@@ -262,7 +260,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 		/* This shouldn't actually happen as it should have
 		   been caught above.. but just in case */
 		acl = NULL;
-		*p_acl = ERR_PTR(-ENODATA);
+		*p_acl = acl;
 	} else if (retval < 0) {
 		acl = ERR_PTR(retval);
 	} else {
@@ -379,11 +377,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 	}
 
 	acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT);
-	if (IS_ERR(acl)) {
-		if (PTR_ERR(acl) == -ENODATA)
-			goto apply_umask;
+	if (IS_ERR(acl))
 		return PTR_ERR(acl);
-	}
 
 	if (acl) {
 		struct posix_acl *acl_copy;
diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h
index 8cc65757e47a..8f4d8d718b10 100644
--- a/include/linux/reiserfs_acl.h
+++ b/include/linux/reiserfs_acl.h
@@ -58,12 +58,12 @@ extern struct xattr_handler reiserfs_posix_acl_access_handler;
 
 static inline void reiserfs_init_acl_access(struct inode *inode)
 {
-	REISERFS_I(inode)->i_acl_access = NULL;
+	REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED;
 }
 
 static inline void reiserfs_init_acl_default(struct inode *inode)
 {
-	REISERFS_I(inode)->i_acl_default = NULL;
+	REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED;
 }
 #else
 
-- 
cgit v1.2.3-71-gd317


From 281eede0328c84a8f20e0e85b807d5b51c3de4f2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 21:07:04 -0400
Subject: switch reiserfs to inode->i_acl

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/inode.c           |  4 ----
 fs/reiserfs/super.c           | 24 ------------------------
 fs/reiserfs/xattr_acl.c       | 10 ++++------
 include/linux/reiserfs_acl.h  | 17 -----------------
 include/linux/reiserfs_fs_i.h |  4 ----
 5 files changed, 4 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6fd0f47e45db..a14d6cd9eeda 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1131,8 +1131,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
 	REISERFS_I(inode)->i_trans_id = 0;
 	REISERFS_I(inode)->i_jl = NULL;
 	mutex_init(&(REISERFS_I(inode)->i_mmap));
-	reiserfs_init_acl_access(inode);
-	reiserfs_init_acl_default(inode);
 	reiserfs_init_xattr_rwsem(inode);
 
 	if (stat_data_v1(ih)) {
@@ -1834,8 +1832,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
 	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
 	mutex_init(&(REISERFS_I(inode)->i_mmap));
-	reiserfs_init_acl_access(inode);
-	reiserfs_init_acl_default(inode);
 	reiserfs_init_xattr_rwsem(inode);
 
 	/* key to search for correct place for new stat data */
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b194451fc04b..d3aeb061612b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -529,10 +529,6 @@ static void init_once(void *foo)
 
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	inode_init_once(&ei->vfs_inode);
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	ei->i_acl_access = ACL_NOT_CACHED;
-	ei->i_acl_default = ACL_NOT_CACHED;
-#endif
 }
 
 static int init_inodecache(void)
@@ -580,25 +576,6 @@ static void reiserfs_dirty_inode(struct inode *inode)
 	reiserfs_write_unlock(inode->i_sb);
 }
 
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-static void reiserfs_clear_inode(struct inode *inode)
-{
-	struct posix_acl *acl;
-
-	acl = REISERFS_I(inode)->i_acl_access;
-	if (acl && acl != ACL_NOT_CACHED)
-		posix_acl_release(acl);
-	REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED;
-
-	acl = REISERFS_I(inode)->i_acl_default;
-	if (acl && acl != ACL_NOT_CACHED)
-		posix_acl_release(acl);
-	REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED;
-}
-#else
-#define reiserfs_clear_inode NULL
-#endif
-
 #ifdef CONFIG_QUOTA
 static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
 				    size_t, loff_t);
@@ -612,7 +589,6 @@ static const struct super_operations reiserfs_sops = {
 	.write_inode = reiserfs_write_inode,
 	.dirty_inode = reiserfs_dirty_inode,
 	.delete_inode = reiserfs_delete_inode,
-	.clear_inode = reiserfs_clear_inode,
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
 	.sync_fs = reiserfs_sync_fs,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 7b3aeb9327d3..b6e473faa8b8 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -223,16 +223,15 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 	struct posix_acl *acl, **p_acl;
 	int size;
 	int retval;
-	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &reiserfs_i->i_acl_access;
+		p_acl = &inode->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &reiserfs_i->i_acl_default;
+		p_acl = &inode->i_default_acl;
 		break;
 	default:
 		return ERR_PTR(-EINVAL);
@@ -288,7 +287,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 	struct posix_acl **p_acl;
 	size_t size = 0;
 	int error;
-	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
@@ -296,7 +294,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &reiserfs_i->i_acl_access;
+		p_acl = &inode->i_acl;
 		if (acl) {
 			mode_t mode = inode->i_mode;
 			error = posix_acl_equiv_mode(acl, &mode);
@@ -311,7 +309,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 		break;
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &reiserfs_i->i_acl_default;
+		p_acl = &inode->i_default_acl;
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EACCES : 0;
 		break;
diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h
index 8f4d8d718b10..b4448853900e 100644
--- a/include/linux/reiserfs_acl.h
+++ b/include/linux/reiserfs_acl.h
@@ -56,15 +56,6 @@ int reiserfs_cache_default_acl(struct inode *dir);
 extern struct xattr_handler reiserfs_posix_acl_default_handler;
 extern struct xattr_handler reiserfs_posix_acl_access_handler;
 
-static inline void reiserfs_init_acl_access(struct inode *inode)
-{
-	REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED;
-}
-
-static inline void reiserfs_init_acl_default(struct inode *inode)
-{
-	REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED;
-}
 #else
 
 #define reiserfs_cache_default_acl(inode) 0
@@ -86,12 +77,4 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 {
 	return 0;
 }
-
-static inline void reiserfs_init_acl_access(struct inode *inode)
-{
-}
-
-static inline void reiserfs_init_acl_default(struct inode *inode)
-{
-}
 #endif
diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h
index 76360b36ac33..89f4d3abbf5a 100644
--- a/include/linux/reiserfs_fs_i.h
+++ b/include/linux/reiserfs_fs_i.h
@@ -54,10 +54,6 @@ struct reiserfs_inode_info {
 	unsigned int i_trans_id;
 	struct reiserfs_journal_list *i_jl;
 	struct mutex i_mmap;
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	struct posix_acl *i_acl_access;
-	struct posix_acl *i_acl_default;
-#endif
 #ifdef CONFIG_REISERFS_FS_XATTR
 	struct rw_semaphore i_xattr_sem;
 #endif
-- 
cgit v1.2.3-71-gd317


From 06b16e9f68edaa1e71aee943d3c030bcf7380af1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 19:56:00 -0400
Subject: switch shmem to inode->i_acl

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/shmem_fs.h |  8 --------
 mm/shmem.c               |  9 ++++-----
 mm/shmem_acl.c           | 29 ++++++-----------------------
 3 files changed, 10 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index fd83f2584b15..abff6c9b413c 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -19,10 +19,6 @@ struct shmem_inode_info {
 	swp_entry_t		i_direct[SHMEM_NR_DIRECT]; /* first blocks */
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct inode		vfs_inode;
-#ifdef CONFIG_TMPFS_POSIX_ACL
-	struct posix_acl	*i_acl;
-	struct posix_acl	*i_default_acl;
-#endif
 };
 
 struct shmem_sb_info {
@@ -45,7 +41,6 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
 #ifdef CONFIG_TMPFS_POSIX_ACL
 int shmem_permission(struct inode *, int);
 int shmem_acl_init(struct inode *, struct inode *);
-void shmem_acl_destroy_inode(struct inode *);
 
 extern struct xattr_handler shmem_xattr_acl_access_handler;
 extern struct xattr_handler shmem_xattr_acl_default_handler;
@@ -57,9 +52,6 @@ static inline int shmem_acl_init(struct inode *inode, struct inode *dir)
 {
 	return 0;
 }
-static inline void shmem_acl_destroy_inode(struct inode *inode)
-{
-}
 #endif  /* CONFIG_TMPFS_POSIX_ACL */
 
 #endif
diff --git a/mm/shmem.c b/mm/shmem.c
index e89d7ec18eda..5f2019fc7895 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2379,6 +2379,10 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 	if (!p)
 		return NULL;
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	p->vfs_inode.i_acl = NULL;
+	p->vfs_inode.i_default_acl = NULL;
+#endif
 	return &p->vfs_inode;
 }
 
@@ -2388,7 +2392,6 @@ static void shmem_destroy_inode(struct inode *inode)
 		/* only struct inode is valid if it's an inline symlink */
 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
 	}
-	shmem_acl_destroy_inode(inode);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
@@ -2397,10 +2400,6 @@ static void init_once(void *foo)
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 
 	inode_init_once(&p->vfs_inode);
-#ifdef CONFIG_TMPFS_POSIX_ACL
-	p->i_acl = NULL;
-	p->i_default_acl = NULL;
-#endif
 }
 
 static int init_inodecache(void)
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 8e5aadd7dcd6..606a8e757a42 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type)
 	spin_lock(&inode->i_lock);
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
+			acl = posix_acl_dup(inode->i_acl);
 			break;
 
 		case ACL_TYPE_DEFAULT:
-			acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
+			acl = posix_acl_dup(inode->i_default_acl);
 			break;
 	}
 	spin_unlock(&inode->i_lock);
@@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	spin_lock(&inode->i_lock);
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			free = SHMEM_I(inode)->i_acl;
-			SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
+			free = inode->i_acl;
+			inode->i_acl = posix_acl_dup(acl);
 			break;
 
 		case ACL_TYPE_DEFAULT:
-			free = SHMEM_I(inode)->i_default_acl;
-			SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
+			free = inode->i_default_acl;
+			inode->i_default_acl = posix_acl_dup(acl);
 			break;
 	}
 	spin_unlock(&inode->i_lock);
@@ -154,23 +154,6 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
 	return generic_acl_init(inode, dir, &shmem_acl_ops);
 }
 
-/**
- * shmem_acl_destroy_inode  -  destroy acls hanging off the in-memory inode
- *
- * This is done before destroying the actual inode.
- */
-
-void
-shmem_acl_destroy_inode(struct inode *inode)
-{
-	if (SHMEM_I(inode)->i_acl)
-		posix_acl_release(SHMEM_I(inode)->i_acl);
-	SHMEM_I(inode)->i_acl = NULL;
-	if (SHMEM_I(inode)->i_default_acl)
-		posix_acl_release(SHMEM_I(inode)->i_default_acl);
-	SHMEM_I(inode)->i_default_acl = NULL;
-}
-
 /**
  * shmem_check_acl  -  check_acl() callback for generic_permission()
  */
-- 
cgit v1.2.3-71-gd317


From 073aaa1b142461d91f83da66db1184d7c1b1edea Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 9 Jun 2009 12:11:54 -0400
Subject: helpers for acl caching + switch to those

helpers: get_cached_acl(inode, type), set_cached_acl(inode, type, acl),
forget_cached_acl(inode, type).

ubifs/xattr.c needed includes reordered, the rest is a plain switchover.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/acl.c            | 44 +++++--------------------
 fs/ext2/acl.c             | 79 ++++++++++----------------------------------
 fs/ext3/acl.c             | 83 +++++++++++------------------------------------
 fs/ext4/acl.c             | 65 +++++--------------------------------
 fs/jffs2/acl.c            | 60 +++++++---------------------------
 fs/jfs/acl.c              | 32 ++++++++----------
 fs/jfs/xattr.c            | 10 ++----
 fs/reiserfs/xattr_acl.c   | 49 ++++++----------------------
 fs/ubifs/xattr.c          |  2 +-
 include/linux/posix_acl.h | 64 ++++++++++++++++++++++++++++++++++++
 10 files changed, 155 insertions(+), 333 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6db8a42a3e5e..f128427b995b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -29,51 +29,28 @@
 
 #ifdef CONFIG_FS_POSIX_ACL
 
-static void btrfs_update_cached_acl(struct inode *inode,
-				    struct posix_acl **p_acl,
-				    struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*p_acl && *p_acl != ACL_NOT_CACHED)
-		posix_acl_release(*p_acl);
-	*p_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
 	int size;
 	const char *name;
 	char *value = NULL;
-	struct posix_acl *acl = NULL, **p_acl;
+	struct posix_acl *acl;
+
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &inode->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &inode->i_default_acl;
 		break;
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 
-	/* Handle the cached NULL acl case without locking */
-	acl = ACCESS_ONCE(*p_acl);
-	if (!acl)
-		return acl;
-
-	spin_lock(&inode->i_lock);
-	acl = *p_acl;
-	if (acl != ACL_NOT_CACHED)
-		acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-
-	if (acl != ACL_NOT_CACHED)
-		return acl;
-
 	size = __btrfs_getxattr(inode, name, "", 0);
 	if (size > 0) {
 		value = kzalloc(size, GFP_NOFS);
@@ -82,13 +59,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		size = __btrfs_getxattr(inode, name, value, size);
 		if (size > 0) {
 			acl = posix_acl_from_xattr(value, size);
-			btrfs_update_cached_acl(inode, p_acl, acl);
+			set_cached_acl(inode, type, acl);
 		}
 		kfree(value);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT?  I think nobody */
 		acl = NULL;
-		btrfs_update_cached_acl(inode, p_acl, acl);
+		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
@@ -121,7 +98,6 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	int ret, size = 0;
 	const char *name;
-	struct posix_acl **p_acl;
 	char *value = NULL;
 	mode_t mode;
 
@@ -141,13 +117,11 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		ret = 0;
 		inode->i_mode = mode;
 		name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &inode->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EINVAL : 0;
 		name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &inode->i_default_acl;
 		break;
 	default:
 		return -EINVAL;
@@ -172,7 +146,7 @@ out:
 	kfree(value);
 
 	if (!ret)
-		btrfs_update_cached_acl(inode, p_acl, acl);
+		set_cached_acl(inode, type, acl);
 
 	return ret;
 }
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d2ffddc12117..d636e1297cad 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -125,30 +125,6 @@ fail:
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct posix_acl *
-ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
-static inline void
-ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-		   struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * inode->i_mutex: don't care
  */
@@ -163,23 +139,19 @@ ext2_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;
 
-	switch(type) {
-		case ACL_TYPE_ACCESS:
-			acl = ext2_iget_acl(inode, &inode->i_acl);
-			if (acl != ACL_NOT_CACHED)
-				return acl;
-			name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			acl = ext2_iget_acl(inode, &inode->i_default_acl);
-			if (acl != ACL_NOT_CACHED)
-				return acl;
-			name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
-			break;
-
-		default:
-			return ERR_PTR(-EINVAL);
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
 	}
 	retval = ext2_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
@@ -196,17 +168,9 @@ ext2_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl)) {
-		switch(type) {
-			case ACL_TYPE_ACCESS:
-				ext2_iset_acl(inode, &inode->i_acl, acl);
-				break;
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
-			case ACL_TYPE_DEFAULT:
-				ext2_iset_acl(inode, &inode->i_default_acl, acl);
-				break;
-		}
-	}
 	return acl;
 }
 
@@ -261,17 +225,8 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	error = ext2_xattr_set(inode, name_index, "", value, size, 0);
 
 	kfree(value);
-	if (!error) {
-		switch(type) {
-			case ACL_TYPE_ACCESS:
-				ext2_iset_acl(inode, &inode->i_acl, acl);
-				break;
-
-			case ACL_TYPE_DEFAULT:
-				ext2_iset_acl(inode, &inode->i_default_acl, acl);
-				break;
-		}
-	}
+	if (!error)
+		set_cached_acl(inode, type, acl);
 	return error;
 }
 
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index a9707689d9e1..e167bae37ef0 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -126,33 +126,6 @@ fail:
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct posix_acl *
-ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = ACCESS_ONCE(*i_acl);
-
-	if (acl) {
-		spin_lock(&inode->i_lock);
-		acl = *i_acl;
-		if (acl != ACL_NOT_CACHED)
-			acl = posix_acl_dup(acl);
-		spin_unlock(&inode->i_lock);
-	}
-
-	return acl;
-}
-
-static inline void
-ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-                  struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Inode operation get_posix_acl().
  *
@@ -169,24 +142,21 @@ ext3_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;
 
-	switch(type) {
-		case ACL_TYPE_ACCESS:
-			acl = ext3_iget_acl(inode, &inode->i_acl);
-			if (acl != ACL_NOT_CACHED)
-				return acl;
-			name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			acl = ext3_iget_acl(inode, &inode->i_default_acl);
-			if (acl != ACL_NOT_CACHED)
-				return acl;
-			name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
-			break;
-
-		default:
-			return ERR_PTR(-EINVAL);
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
 	}
+
 	retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
 		value = kmalloc(retval, GFP_NOFS);
@@ -202,17 +172,9 @@ ext3_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl)) {
-		switch(type) {
-			case ACL_TYPE_ACCESS:
-				ext3_iset_acl(inode, &inode->i_acl, acl);
-				break;
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
-			case ACL_TYPE_DEFAULT:
-				ext3_iset_acl(inode, &inode->i_default_acl, acl);
-				break;
-		}
-	}
 	return acl;
 }
 
@@ -269,17 +231,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 				      value, size, 0);
 
 	kfree(value);
-	if (!error) {
-		switch(type) {
-			case ACL_TYPE_ACCESS:
-				ext3_iset_acl(inode, &inode->i_acl, acl);
-				break;
 
-			case ACL_TYPE_DEFAULT:
-				ext3_iset_acl(inode, &inode->i_default_acl, acl);
-				break;
-		}
-	}
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
 	return error;
 }
 
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 0084e3a19d86..f6d8967149ca 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -126,33 +126,6 @@ fail:
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct posix_acl *
-ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = ACCESS_ONCE(*i_acl);
-
-	if (acl) {
-		spin_lock(&inode->i_lock);
-		acl = *i_acl;
-		if (acl != ACL_NOT_CACHED)
-			acl = posix_acl_dup(acl);
-		spin_unlock(&inode->i_lock);
-	}
-
-	return acl;
-}
-
-static inline void
-ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-		struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Inode operation get_posix_acl().
  *
@@ -169,23 +142,19 @@ ext4_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		acl = ext4_iget_acl(inode, &inode->i_acl);
-		if (acl != ACL_NOT_CACHED)
-			return acl;
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
 		break;
-
 	case ACL_TYPE_DEFAULT:
-		acl = ext4_iget_acl(inode, &inode->i_default_acl);
-		if (acl != ACL_NOT_CACHED)
-			return acl;
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
 		break;
-
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 	retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
@@ -202,17 +171,9 @@ ext4_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl)) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			ext4_iset_acl(inode, &inode->i_acl, acl);
-			break;
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
-		case ACL_TYPE_DEFAULT:
-			ext4_iset_acl(inode, &inode->i_default_acl, acl);
-			break;
-		}
-	}
 	return acl;
 }
 
@@ -269,17 +230,9 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 				      value, size, 0);
 
 	kfree(value);
-	if (!error) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			ext4_iset_acl(inode, &inode->i_acl, acl);
-			break;
+	if (!error)
+		set_cached_acl(inode, type, acl);
 
-		case ACL_TYPE_DEFAULT:
-			ext4_iset_acl(inode, &inode->i_default_acl, acl);
-			break;
-		}
-	}
 	return error;
 }
 
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index ac16589ebbd1..edd2ad6416d8 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -156,47 +156,25 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
 	return ERR_PTR(-EINVAL);
 }
 
-static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-	return acl;
-}
-
-static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 {
 	struct posix_acl *acl;
 	char *value = NULL;
 	int rc, xprefix;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		acl = jffs2_iget_acl(inode, &inode->i_acl);
-		if (acl != ACL_NOT_CACHED)
-			return acl;
 		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		acl = jffs2_iget_acl(inode, &inode->i_default_acl);
-		if (acl != ACL_NOT_CACHED)
-			return acl;
 		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
 		break;
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 	rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
 	if (rc > 0) {
@@ -214,16 +192,8 @@ static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 	}
 	if (value)
 		kfree(value);
-	if (!IS_ERR(acl)) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			jffs2_iset_acl(inode, &inode->i_acl, acl);
-			break;
-		case ACL_TYPE_DEFAULT:
-			jffs2_iset_acl(inode, &inode->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 	return acl;
 }
 
@@ -283,16 +253,8 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		return -EINVAL;
 	}
 	rc = __jffs2_set_acl(inode, xprefix, acl);
-	if (!rc) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			jffs2_iset_acl(inode, &inode->i_acl, acl);
-			break;
-		case ACL_TYPE_DEFAULT:
-			jffs2_iset_acl(inode, &inode->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!rc)
+		set_cached_acl(inode, type, acl);
 	return rc;
 }
 
@@ -336,7 +298,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 		*i_mode &= ~current_umask();
 	} else {
 		if (S_ISDIR(*i_mode))
-			jffs2_iset_acl(inode, &inode->i_default_acl, acl);
+			set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
 
 		clone = posix_acl_clone(acl, GFP_KERNEL);
 		if (!clone)
@@ -347,7 +309,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 			return rc;
 		}
 		if (rc > 0)
-			jffs2_iset_acl(inode, &inode->i_acl, clone);
+			set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
 
 		posix_acl_release(clone);
 	}
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 5fcfc9857c11..f272bf032e1e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -31,26 +31,24 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 {
 	struct posix_acl *acl;
 	char *ea_name;
-	struct posix_acl **p_acl;
 	int size;
 	char *value = NULL;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch(type) {
 		case ACL_TYPE_ACCESS:
 			ea_name = POSIX_ACL_XATTR_ACCESS;
-			p_acl = &inode->i_acl;
 			break;
 		case ACL_TYPE_DEFAULT:
 			ea_name = POSIX_ACL_XATTR_DEFAULT;
-			p_acl = &inode->i_default_acl;
 			break;
 		default:
 			return ERR_PTR(-EINVAL);
 	}
 
-	if (*p_acl != ACL_NOT_CACHED)
-		return posix_acl_dup(*p_acl);
-
 	size = __jfs_getxattr(inode, ea_name, NULL, 0);
 
 	if (size > 0) {
@@ -61,17 +59,18 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 	}
 
 	if (size < 0) {
-		if (size == -ENODATA) {
-			*p_acl = NULL;
+		if (size == -ENODATA)
 			acl = NULL;
-		} else
+		else
 			acl = ERR_PTR(size);
 	} else {
 		acl = posix_acl_from_xattr(value, size);
-		if (!IS_ERR(acl))
-			*p_acl = posix_acl_dup(acl);
 	}
 	kfree(value);
+	if (!IS_ERR(acl)) {
+		set_cached_acl(inode, type, acl);
+		posix_acl_release(acl);
+	}
 	return acl;
 }
 
@@ -79,7 +78,6 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 		       struct posix_acl *acl)
 {
 	char *ea_name;
-	struct posix_acl **p_acl;
 	int rc;
 	int size = 0;
 	char *value = NULL;
@@ -90,11 +88,9 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 	switch(type) {
 		case ACL_TYPE_ACCESS:
 			ea_name = POSIX_ACL_XATTR_ACCESS;
-			p_acl = &inode->i_acl;
 			break;
 		case ACL_TYPE_DEFAULT:
 			ea_name = POSIX_ACL_XATTR_DEFAULT;
-			p_acl = &inode->i_default_acl;
 			if (!S_ISDIR(inode->i_mode))
 				return acl ? -EACCES : 0;
 			break;
@@ -114,11 +110,9 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 out:
 	kfree(value);
 
-	if (!rc) {
-		if (*p_acl && (*p_acl != ACL_NOT_CACHED))
-			posix_acl_release(*p_acl);
-		*p_acl = posix_acl_dup(acl);
-	}
+	if (!rc)
+		set_cached_acl(inode, type, acl);
+
 	return rc;
 }
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index f6e90e343593..fad364548bc9 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -727,10 +727,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		/*
 		 * We're changing the ACL.  Get rid of the cached one
 		 */
-		acl =inode->i_acl;
-		if (acl != ACL_NOT_CACHED)
-			posix_acl_release(acl);
-		inode->i_acl = ACL_NOT_CACHED;
+		forget_cached_acl(inode, ACL_TYPE_ACCESS);
 
 		return 0;
 	} else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
@@ -746,10 +743,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		/*
 		 * We're changing the default ACL.  Get rid of the cached one
 		 */
-		acl = inode->i_default_acl;
-		if (acl && (acl != ACL_NOT_CACHED))
-			posix_acl_release(acl);
-		inode->i_default_acl = ACL_NOT_CACHED;
+		forget_cached_acl(inode, ACL_TYPE_DEFAULT);
 
 		return 0;
 	}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index b6e473faa8b8..35d6e672a279 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -188,29 +188,6 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
 	return ERR_PTR(-EINVAL);
 }
 
-static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl,
-			    struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
-static inline struct posix_acl *iget_acl(struct inode *inode,
-					 struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
 /*
  * Inode operation get_posix_acl().
  *
@@ -220,31 +197,29 @@ static inline struct posix_acl *iget_acl(struct inode *inode,
 struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 {
 	char *name, *value;
-	struct posix_acl *acl, **p_acl;
+	struct posix_acl *acl;
 	int size;
 	int retval;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &inode->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &inode->i_default_acl;
 		break;
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 
-	acl = iget_acl(inode, p_acl);
-	if (acl != ACL_NOT_CACHED)
-		return acl;
-
 	size = reiserfs_xattr_get(inode, name, NULL, 0);
 	if (size < 0) {
 		if (size == -ENODATA || size == -ENOSYS) {
-			*p_acl = NULL;
+			set_cached_acl(inode, type, NULL);
 			return NULL;
 		}
 		return ERR_PTR(size);
@@ -259,14 +234,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 		/* This shouldn't actually happen as it should have
 		   been caught above.. but just in case */
 		acl = NULL;
-		*p_acl = acl;
 	} else if (retval < 0) {
 		acl = ERR_PTR(retval);
 	} else {
 		acl = posix_acl_from_disk(value, retval);
-		if (!IS_ERR(acl))
-			iset_acl(inode, p_acl, acl);
 	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	kfree(value);
 	return acl;
@@ -284,7 +258,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 {
 	char *name;
 	void *value = NULL;
-	struct posix_acl **p_acl;
 	size_t size = 0;
 	int error;
 
@@ -294,7 +267,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &inode->i_acl;
 		if (acl) {
 			mode_t mode = inode->i_mode;
 			error = posix_acl_equiv_mode(acl, &mode);
@@ -309,7 +281,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 		break;
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &inode->i_default_acl;
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EACCES : 0;
 		break;
@@ -342,7 +313,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 	kfree(value);
 
 	if (!error)
-		iset_acl(inode, p_acl, acl);
+		set_cached_acl(inode, type, acl);
 
 	return error;
 }
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index cfd31e229c89..adafcf556531 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -55,9 +55,9 @@
  * ACL support is not implemented.
  */
 
+#include "ubifs.h"
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
-#include "ubifs.h"
 
 /*
  * Limit the number of extended attributes per inode so that the total size
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 4bc241290c24..0cdba01b7756 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -83,4 +83,68 @@ extern int posix_acl_chmod_masq(struct posix_acl *, mode_t);
 extern struct posix_acl *get_posix_acl(struct inode *, int);
 extern int set_posix_acl(struct inode *, int, struct posix_acl *);
 
+static inline struct posix_acl *get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl **p, *acl;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		p = &inode->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		p = &inode->i_default_acl;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	acl = ACCESS_ONCE(*p);
+	if (acl) {
+		spin_lock(&inode->i_lock);
+		acl = *p;
+		if (acl != ACL_NOT_CACHED)
+			acl = posix_acl_dup(acl);
+		spin_unlock(&inode->i_lock);
+	}
+	return acl;
+}
+
+static inline void set_cached_acl(struct inode *inode,
+				  int type,
+				  struct posix_acl *acl)
+{
+	struct posix_acl *old = NULL;
+	spin_lock(&inode->i_lock);
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		old = inode->i_acl;
+		inode->i_acl = posix_acl_dup(acl);
+		break;
+	case ACL_TYPE_DEFAULT:
+		old = inode->i_default_acl;
+		inode->i_default_acl = posix_acl_dup(acl);
+		break;
+	}
+	spin_unlock(&inode->i_lock);
+	if (old != ACL_NOT_CACHED)
+		posix_acl_release(old);
+}
+
+static inline void forget_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *old = NULL;
+	spin_lock(&inode->i_lock);
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		old = inode->i_acl;
+		inode->i_acl = ACL_NOT_CACHED;
+		break;
+	case ACL_TYPE_DEFAULT:
+		old = inode->i_default_acl;
+		inode->i_default_acl = ACL_NOT_CACHED;
+		break;
+	}
+	spin_unlock(&inode->i_lock);
+	if (old != ACL_NOT_CACHED)
+		posix_acl_release(old);
+}
+
 #endif  /* __LINUX_POSIX_ACL_H */
-- 
cgit v1.2.3-71-gd317


From 641cf4a668e9e69d2bc061e953422ff72a91f86e Mon Sep 17 00:00:00 2001
From: Markus Trippelsdorf <markus@trippelsdorf.de>
Date: Wed, 24 Jun 2009 22:28:52 +0200
Subject: inline functions left without protection of ifdef (acl)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/posix_acl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 0cdba01b7756..c513466c7dc7 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -83,6 +83,7 @@ extern int posix_acl_chmod_masq(struct posix_acl *, mode_t);
 extern struct posix_acl *get_posix_acl(struct inode *, int);
 extern int set_posix_acl(struct inode *, int, struct posix_acl *);
 
+#ifdef CONFIG_FS_POSIX_ACL
 static inline struct posix_acl *get_cached_acl(struct inode *inode, int type)
 {
 	struct posix_acl **p, *acl;
@@ -146,5 +147,5 @@ static inline void forget_cached_acl(struct inode *inode, int type)
 	if (old != ACL_NOT_CACHED)
 		posix_acl_release(old);
 }
-
+#endif
 #endif  /* __LINUX_POSIX_ACL_H */
-- 
cgit v1.2.3-71-gd317


From 72c04902d1e27c8a324014cff1d4475c11b1cecd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 24 Jun 2009 16:58:48 -0400
Subject: Get "no acls for this inode" right, fix shmem breakage

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c          | 6 ++----
 fs/jffs2/acl.c            | 3 +--
 include/linux/posix_acl.h | 9 +++++++++
 mm/shmem.c                | 5 +----
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 78ad38ddd01f..dbe1aabf96cd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2122,10 +2122,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	 * any xattrs or acls
 	 */
 	maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
-	if (!maybe_acls) {
-		inode->i_acl = NULL;
-		inode->i_default_acl = NULL;
-	}
+	if (!maybe_acls)
+		cache_no_acl(inode);
 
 	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
 						alloc_group_block, 0);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index edd2ad6416d8..8fcb6239218e 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -284,8 +284,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 	struct posix_acl *acl, *clone;
 	int rc;
 
-	inode->i_default_acl = NULL;
-	inode->i_acl = NULL;
+	cache_no_acl(inode);
 
 	if (S_ISLNK(*i_mode))
 		return 0;	/* Symlink always has no-ACL */
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index c513466c7dc7..065a3652a3ea 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -148,4 +148,13 @@ static inline void forget_cached_acl(struct inode *inode, int type)
 		posix_acl_release(old);
 }
 #endif
+
+static inline void cache_no_acl(struct inode *inode)
+{
+#ifdef CONFIG_FS_POSIX_ACL
+	inode->i_acl = NULL;
+	inode->i_default_acl = NULL;
+#endif
+}
+
 #endif  /* __LINUX_POSIX_ACL_H */
diff --git a/mm/shmem.c b/mm/shmem.c
index 5f2019fc7895..d713239ce2ce 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1558,6 +1558,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
 		spin_lock_init(&info->lock);
 		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->swaplist);
+		cache_no_acl(inode);
 
 		switch (mode & S_IFMT) {
 		default:
@@ -2379,10 +2380,6 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 	if (!p)
 		return NULL;
-#ifdef CONFIG_TMPFS_POSIX_ACL
-	p->vfs_inode.i_acl = NULL;
-	p->vfs_inode.i_default_acl = NULL;
-#endif
 	return &p->vfs_inode;
 }
 
-- 
cgit v1.2.3-71-gd317


From 9d73777e500929b71dcfed16eec05f6760e345a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 25 Jun 2009 11:58:55 +0200
Subject: clarify get_user_pages() prototype

Currently the 4th parameter of get_user_pages() is called len, but it's
in pages, not bytes. Rename the thing to nr_pages to avoid future
confusion.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/memory.c        | 26 ++++++++++++--------------
 mm/nommu.c         | 12 +++++-------
 3 files changed, 18 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d006e93d5c93..ba3a7cb1eaa0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -826,7 +826,7 @@ extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-			unsigned long start, int len, int write, int force,
+			unsigned long start, int nr_pages, int write, int force,
 			struct page **pages, struct vm_area_struct **vmas);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
diff --git a/mm/memory.c b/mm/memory.c
index f46ac18ba231..65216194eb8d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1207,8 +1207,8 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int len, int flags,
-		struct page **pages, struct vm_area_struct **vmas)
+		     unsigned long start, int nr_pages, int flags,
+		     struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
 	unsigned int vm_flags = 0;
@@ -1217,7 +1217,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 
-	if (len <= 0)
+	if (nr_pages <= 0)
 		return 0;
 	/* 
 	 * Require read or write permissions.
@@ -1269,7 +1269,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				vmas[i] = gate_vma;
 			i++;
 			start += PAGE_SIZE;
-			len--;
+			nr_pages--;
 			continue;
 		}
 
@@ -1280,7 +1280,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 		if (is_vm_hugetlb_page(vma)) {
 			i = follow_hugetlb_page(mm, vma, pages, vmas,
-						&start, &len, i, write);
+						&start, &nr_pages, i, write);
 			continue;
 		}
 
@@ -1357,9 +1357,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				vmas[i] = vma;
 			i++;
 			start += PAGE_SIZE;
-			len--;
-		} while (len && start < vma->vm_end);
-	} while (len);
+			nr_pages--;
+		} while (nr_pages && start < vma->vm_end);
+	} while (nr_pages);
 	return i;
 }
 
@@ -1368,7 +1368,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  * @tsk:	task_struct of target task
  * @mm:		mm_struct of target mm
  * @start:	starting user address
- * @len:	number of pages from start to pin
+ * @nr_pages:	number of pages from start to pin
  * @write:	whether pages will be written to by the caller
  * @force:	whether to force write access even if user mapping is
  *		readonly. This will result in the page being COWed even
@@ -1380,7 +1380,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  *		Or NULL if the caller does not require them.
  *
  * Returns number of pages pinned. This may be fewer than the number
- * requested. If len is 0 or negative, returns 0. If no pages
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
  * were pinned, returns -errno. Each page returned must be released
  * with a put_page() call when it is finished with. vmas will only
  * remain valid while mmap_sem is held.
@@ -1414,7 +1414,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  * See also get_user_pages_fast, for performance critical applications.
  */
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int len, int write, int force,
+		unsigned long start, int nr_pages, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int flags = 0;
@@ -1424,9 +1424,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (force)
 		flags |= GUP_FLAGS_FORCE;
 
-	return __get_user_pages(tsk, mm,
-				start, len, flags,
-				pages, vmas);
+	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
 
 EXPORT_SYMBOL(get_user_pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fd2ad5da98e..bf0cc762a7d2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -173,8 +173,8 @@ unsigned int kobjsize(const void *objp)
 }
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int len, int flags,
-		struct page **pages, struct vm_area_struct **vmas)
+		     unsigned long start, int nr_pages, int flags,
+		     struct page **pages, struct vm_area_struct **vmas)
 {
 	struct vm_area_struct *vma;
 	unsigned long vm_flags;
@@ -189,7 +189,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 
-	for (i = 0; i < len; i++) {
+	for (i = 0; i < nr_pages; i++) {
 		vma = find_vma(mm, start);
 		if (!vma)
 			goto finish_or_fault;
@@ -224,7 +224,7 @@ finish_or_fault:
  * - don't permit access to VMAs that don't support it, such as I/O mappings
  */
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-	unsigned long start, int len, int write, int force,
+	unsigned long start, int nr_pages, int write, int force,
 	struct page **pages, struct vm_area_struct **vmas)
 {
 	int flags = 0;
@@ -234,9 +234,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (force)
 		flags |= GUP_FLAGS_FORCE;
 
-	return __get_user_pages(tsk, mm,
-				start, len, flags,
-				pages, vmas);
+	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
 EXPORT_SYMBOL(get_user_pages);
 
-- 
cgit v1.2.3-71-gd317


From 41f95331b972a039f519ae0c70f051b7121f7346 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Jun 2009 17:55:18 +0200
Subject: perf_counter: Split the mmap control page in two parts

Since there are two distinct sections to the control page,
move them apart so that possible extensions don't overlap.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e7213e46cf9c..489d5cbfbcca 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -233,6 +233,12 @@ struct perf_counter_mmap_page {
 	__u32	index;			/* hardware counter identifier */
 	__s64	offset;			/* add to hardware counter value */
 
+		/*
+		 * Hole for extension of the self monitor capabilities
+		 */
+
+	__u64	__reserved[125];	/* align to 1k */
+
 	/*
 	 * Control data for the mmap() data buffer.
 	 *
-- 
cgit v1.2.3-71-gd317


From 7f8b4e4e0988dadfd22330fd147ad2453e19f510 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 22 Jun 2009 14:34:35 +0200
Subject: perf_counter: Add scale information to the mmap control page

Add the needed time scale to the self-profile mmap information.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 4 +++-
 kernel/perf_counter.c        | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 489d5cbfbcca..bcbf1c43ed42 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -232,12 +232,14 @@ struct perf_counter_mmap_page {
 	__u32	lock;			/* seqlock for synchronization */
 	__u32	index;			/* hardware counter identifier */
 	__s64	offset;			/* add to hardware counter value */
+	__u64	time_enabled;		/* time counter active */
+	__u64	time_running;		/* time counter on cpu */
 
 		/*
 		 * Hole for extension of the self monitor capabilities
 		 */
 
-	__u64	__reserved[125];	/* align to 1k */
+	__u64	__reserved[123];	/* align to 1k */
 
 	/*
 	 * Control data for the mmap() data buffer.
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c2b19c111718..23614adab475 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1782,6 +1782,12 @@ void perf_counter_update_userpage(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
 
+	userpg->time_enabled = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+
+	userpg->time_running = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+
 	barrier();
 	++userpg->lock;
 	preempt_enable();
-- 
cgit v1.2.3-71-gd317


From 38b200d67636a30cb8dc1508137908e7a649b5c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Jun 2009 20:13:11 +0200
Subject: perf_counter: Add PERF_EVENT_READ

Provide a read() like event which can be used to log the
counter value at specific sites such as child->parent
folding on exit.

In order to be useful, we log the counter parent ID, not the
actual counter ID, since userspace can only relate parent
IDs to perf_counter_attr constructs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 12 ++++++++
 kernel/perf_counter.c        | 72 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 80 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index bcbf1c43ed42..6a384f04755a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -334,6 +334,18 @@ enum perf_event_type {
 	 */
 	PERF_EVENT_FORK			= 7,
 
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u32				pid, tid;
+	 * 	u64				value;
+	 * 	{ u64		time_enabled; 	} && PERF_FORMAT_ENABLED
+	 * 	{ u64		time_running; 	} && PERF_FORMAT_RUNNING
+	 * 	{ u64		parent_id;	} && PERF_FORMAT_ID
+	 * };
+	 */
+	PERF_EVENT_READ			= 8,
+
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_SAMPLE_*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 02994a719e27..a72c20e91953 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2623,6 +2623,66 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	perf_output_end(&handle);
 }
 
+/*
+ * read event
+ */
+
+struct perf_read_event {
+	struct perf_event_header	header;
+
+	u32				pid;
+	u32				tid;
+	u64				value;
+	u64				format[3];
+};
+
+static void
+perf_counter_read_event(struct perf_counter *counter,
+			struct task_struct *task)
+{
+	struct perf_output_handle handle;
+	struct perf_read_event event = {
+		.header = {
+			.type = PERF_EVENT_READ,
+			.misc = 0,
+			.size = sizeof(event) - sizeof(event.format),
+		},
+		.pid = perf_counter_pid(counter, task),
+		.tid = perf_counter_tid(counter, task),
+		.value = atomic64_read(&counter->count),
+	};
+	int ret, i = 0;
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		event.header.size += sizeof(u64);
+		event.format[i++] = counter->total_time_enabled;
+	}
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		event.header.size += sizeof(u64);
+		event.format[i++] = counter->total_time_running;
+	}
+
+	if (counter->attr.read_format & PERF_FORMAT_ID) {
+		u64 id;
+
+		event.header.size += sizeof(u64);
+		if (counter->parent)
+			id = counter->parent->id;
+		else
+			id = counter->id;
+
+		event.format[i++] = id;
+	}
+
+	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
+	if (ret)
+		return;
+
+	perf_output_copy(&handle, &event, event.header.size);
+	perf_output_end(&handle);
+}
+
 /*
  * fork tracking
  */
@@ -3985,10 +4045,13 @@ static int inherit_group(struct perf_counter *parent_counter,
 }
 
 static void sync_child_counter(struct perf_counter *child_counter,
-			       struct perf_counter *parent_counter)
+			       struct task_struct *child)
 {
+	struct perf_counter *parent_counter = child_counter->parent;
 	u64 child_val;
 
+	perf_counter_read_event(child_counter, child);
+
 	child_val = atomic64_read(&child_counter->count);
 
 	/*
@@ -4017,7 +4080,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
 
 static void
 __perf_counter_exit_task(struct perf_counter *child_counter,
-			 struct perf_counter_context *child_ctx)
+			 struct perf_counter_context *child_ctx,
+			 struct task_struct *child)
 {
 	struct perf_counter *parent_counter;
 
@@ -4031,7 +4095,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
 	 * counters need to be zapped - but otherwise linger.
 	 */
 	if (parent_counter) {
-		sync_child_counter(child_counter, parent_counter);
+		sync_child_counter(child_counter, child);
 		free_counter(child_counter);
 	}
 }
@@ -4093,7 +4157,7 @@ void perf_counter_exit_task(struct task_struct *child)
 again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
 				 list_entry)
-		__perf_counter_exit_task(child_counter, child_ctx);
+		__perf_counter_exit_task(child_counter, child_ctx, child);
 
 	/*
 	 * If the last counter was a group counter, it will have appended all
-- 
cgit v1.2.3-71-gd317


From bfbd3381e63aa2a14c6706afb50ce4630aa0d9a2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 24 Jun 2009 21:11:59 +0200
Subject: perf_counter: Implement more accurate per task statistics

With the introduction of PERF_EVENT_READ we have the
possibility to provide accurate counter values for
individual tasks in a task hierarchy.

However, due to the lazy context switching used for similar
counter contexts our current per task counts are way off.

In order to maintain some of the lazy switch benefits we
don't disable it out-right, but simply iterate the active
counters and flip the values between the contexts.

This only reads the counters but does not need to reprogram
the full PMU.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 ++-
 kernel/perf_counter.c        | 83 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 83 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6a384f04755a..de70a10b5ec8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -178,8 +178,9 @@ struct perf_counter_attr {
 				mmap           :  1, /* include mmap data     */
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
+				inherit_stat   :  1, /* per task counts       */
 
-				__reserved_1   : 53;
+				__reserved_1   : 52;
 
 	__u32			wakeup_events;	/* wakeup every n events */
 	__u32			__reserved_2;
@@ -602,6 +603,7 @@ struct perf_counter_context {
 	int				nr_counters;
 	int				nr_active;
 	int				is_active;
+	int				nr_stat;
 	atomic_t			refcount;
 	struct task_struct		*task;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a72c20e91953..385ca51c6e60 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -236,6 +236,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 	ctx->nr_counters++;
+	if (counter->attr.inherit_stat)
+		ctx->nr_stat++;
 }
 
 /*
@@ -250,6 +252,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	if (list_empty(&counter->list_entry))
 		return;
 	ctx->nr_counters--;
+	if (counter->attr.inherit_stat)
+		ctx->nr_stat--;
 
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
@@ -1006,6 +1010,76 @@ static int context_equiv(struct perf_counter_context *ctx1,
 		&& !ctx1->pin_count && !ctx2->pin_count;
 }
 
+static void __perf_counter_read(void *counter);
+
+static void __perf_counter_sync_stat(struct perf_counter *counter,
+				     struct perf_counter *next_counter)
+{
+	u64 value;
+
+	if (!counter->attr.inherit_stat)
+		return;
+
+	/*
+	 * Update the counter value, we cannot use perf_counter_read()
+	 * because we're in the middle of a context switch and have IRQs
+	 * disabled, which upsets smp_call_function_single(), however
+	 * we know the counter must be on the current CPU, therefore we
+	 * don't need to use it.
+	 */
+	switch (counter->state) {
+	case PERF_COUNTER_STATE_ACTIVE:
+		__perf_counter_read(counter);
+		break;
+
+	case PERF_COUNTER_STATE_INACTIVE:
+		update_counter_times(counter);
+		break;
+
+	default:
+		break;
+	}
+
+	/*
+	 * In order to keep per-task stats reliable we need to flip the counter
+	 * values when we flip the contexts.
+	 */
+	value = atomic64_read(&next_counter->count);
+	value = atomic64_xchg(&counter->count, value);
+	atomic64_set(&next_counter->count, value);
+
+	/*
+	 * XXX also sync time_enabled and time_running ?
+	 */
+}
+
+#define list_next_entry(pos, member) \
+	list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_counter_sync_stat(struct perf_counter_context *ctx,
+				   struct perf_counter_context *next_ctx)
+{
+	struct perf_counter *counter, *next_counter;
+
+	if (!ctx->nr_stat)
+		return;
+
+	counter = list_first_entry(&ctx->event_list,
+				   struct perf_counter, event_entry);
+
+	next_counter = list_first_entry(&next_ctx->event_list,
+					struct perf_counter, event_entry);
+
+	while (&counter->event_entry != &ctx->event_list &&
+	       &next_counter->event_entry != &next_ctx->event_list) {
+		/* Walk both lists pairwise; stop when either reaches its head. */
+		__perf_counter_sync_stat(counter, next_counter);
+		/* Each cursor must advance along its OWN list. */
+		counter = list_next_entry(counter, event_entry);
+		next_counter = list_next_entry(next_counter, event_entry);
+	}
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -1061,6 +1135,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
 			ctx->task = next;
 			next_ctx->task = task;
 			do_switch = 0;
+
+			perf_counter_sync_stat(ctx, next_ctx);
 		}
 		spin_unlock(&next_ctx->lock);
 		spin_unlock(&ctx->lock);
@@ -1350,7 +1426,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 /*
  * Cross CPU call to read the hardware counter
  */
-static void __read(void *info)
+static void __perf_counter_read(void *info)
 {
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
@@ -1372,7 +1448,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
-					 __read, counter, 1);
+					 __perf_counter_read, counter, 1);
 	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 		update_counter_times(counter);
 	}
@@ -4050,7 +4126,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	struct perf_counter *parent_counter = child_counter->parent;
 	u64 child_val;
 
-	perf_counter_read_event(child_counter, child);
+	if (child_counter->attr.inherit_stat)
+		perf_counter_read_event(child_counter, child);
 
 	child_val = atomic64_read(&child_counter->count);
 
-- 
cgit v1.2.3-71-gd317


From e6e18ec79b023d5fe84226cef533cf0e3770ce93 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Jun 2009 11:27:12 +0200
Subject: perf_counter: Rework the sample ABI

The PERF_EVENT_READ implementation made me realize we don't
actually need the sample_type in the output sample, since
we already have that in the perf_counter_attr information.

Therefore, remove the PERF_EVENT_MISC_OVERFLOW bit and the
event->type overloading, and simply put counter overflow
samples in a PERF_EVENT_SAMPLE type.

This also fixes the issue that event->type was only 32-bit
and sample_type had 64 usable bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h  | 10 +++++-----
 kernel/perf_counter.c         | 36 +++++++++++++++---------------------
 tools/perf/builtin-annotate.c |  8 ++++----
 tools/perf/builtin-report.c   | 32 +++++++++++++++++++-------------
 tools/perf/builtin-top.c      | 11 ++++++-----
 5 files changed, 49 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index de70a10b5ec8..3078e23c91eb 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -262,7 +262,6 @@ struct perf_counter_mmap_page {
 #define PERF_EVENT_MISC_KERNEL			(1 << 0)
 #define PERF_EVENT_MISC_USER			(2 << 0)
 #define PERF_EVENT_MISC_HYPERVISOR		(3 << 0)
-#define PERF_EVENT_MISC_OVERFLOW		(1 << 2)
 
 struct perf_event_header {
 	__u32	type;
@@ -348,9 +347,6 @@ enum perf_event_type {
 	PERF_EVENT_READ			= 8,
 
 	/*
-	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
-	 * will be PERF_SAMPLE_*
-	 *
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *
@@ -358,8 +354,9 @@ enum perf_event_type {
 	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
 	 *	{ u64			time;     } && PERF_SAMPLE_TIME
 	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
-	 *	{ u64			config;   } && PERF_SAMPLE_CONFIG
+	 *	{ u64			id;	  } && PERF_SAMPLE_ID
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
+	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
 	 *
 	 *	{ u64			nr;
 	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
@@ -368,6 +365,9 @@ enum perf_event_type {
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
 	 * };
 	 */
+	PERF_EVENT_SAMPLE		= 9,
+
+	PERF_EVENT_MAX,			/* non-ABI */
 };
 
 enum perf_callchain_context {
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 385ca51c6e60..f2f232696587 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2575,15 +2575,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u32 cpu, reserved;
 	} cpu_entry;
 
-	header.type = 0;
+	header.type = PERF_EVENT_SAMPLE;
 	header.size = sizeof(header);
 
-	header.misc = PERF_EVENT_MISC_OVERFLOW;
+	header.misc = 0;
 	header.misc |= perf_misc_flags(data->regs);
 
 	if (sample_type & PERF_SAMPLE_IP) {
 		ip = perf_instruction_pointer(data->regs);
-		header.type |= PERF_SAMPLE_IP;
 		header.size += sizeof(ip);
 	}
 
@@ -2592,7 +2591,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		tid_entry.pid = perf_counter_pid(counter, current);
 		tid_entry.tid = perf_counter_tid(counter, current);
 
-		header.type |= PERF_SAMPLE_TID;
 		header.size += sizeof(tid_entry);
 	}
 
@@ -2602,34 +2600,25 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		 */
 		time = sched_clock();
 
-		header.type |= PERF_SAMPLE_TIME;
 		header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_ADDR) {
-		header.type |= PERF_SAMPLE_ADDR;
+	if (sample_type & PERF_SAMPLE_ADDR)
 		header.size += sizeof(u64);
-	}
 
-	if (sample_type & PERF_SAMPLE_ID) {
-		header.type |= PERF_SAMPLE_ID;
+	if (sample_type & PERF_SAMPLE_ID)
 		header.size += sizeof(u64);
-	}
 
 	if (sample_type & PERF_SAMPLE_CPU) {
-		header.type |= PERF_SAMPLE_CPU;
 		header.size += sizeof(cpu_entry);
 
 		cpu_entry.cpu = raw_smp_processor_id();
 	}
 
-	if (sample_type & PERF_SAMPLE_PERIOD) {
-		header.type |= PERF_SAMPLE_PERIOD;
+	if (sample_type & PERF_SAMPLE_PERIOD)
 		header.size += sizeof(u64);
-	}
 
 	if (sample_type & PERF_SAMPLE_GROUP) {
-		header.type |= PERF_SAMPLE_GROUP;
 		header.size += sizeof(u64) +
 			counter->nr_siblings * sizeof(group_entry);
 	}
@@ -2639,10 +2628,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
-
-			header.type |= PERF_SAMPLE_CALLCHAIN;
 			header.size += callchain_size;
-		}
+		} else
+			header.size += sizeof(u64);
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2693,8 +2681,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if (callchain)
-		perf_output_copy(&handle, callchain, callchain_size);
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+		if (callchain)
+			perf_output_copy(&handle, callchain, callchain_size);
+		else {
+			u64 nr = 0;
+			perf_output_put(&handle, nr);
+		}
+	}
 
 	perf_output_end(&handle);
 }
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 7e58e3ad1508..722c0f54e549 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -855,7 +855,7 @@ static unsigned long total = 0,
 		     total_unknown = 0;
 
 static int
-process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	char level;
 	int show = 0;
@@ -1013,10 +1013,10 @@ process_period_event(event_t *event, unsigned long offset, unsigned long head)
 static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
-		return process_overflow_event(event, offset, head);
-
 	switch (event->header.type) {
+	case PERF_EVENT_SAMPLE:
+		return process_sample_event(event, offset, head);
+
 	case PERF_EVENT_MMAP:
 		return process_mmap_event(event, offset, head);
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index e575f3039766..ec5361c67bf5 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -53,6 +53,8 @@ static regex_t		parent_regex;
 
 static int		exclude_other = 1;
 
+static u64		sample_type;
+
 struct ip_event {
 	struct perf_event_header header;
 	u64 ip;
@@ -1135,7 +1137,7 @@ static int validate_chain(struct ip_callchain *chain, event_t *event)
 }
 
 static int
-process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	char level;
 	int show = 0;
@@ -1147,12 +1149,12 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	void *more_data = event->ip.__more_data;
 	struct ip_callchain *chain = NULL;
 
-	if (event->header.type & PERF_SAMPLE_PERIOD) {
+	if (sample_type & PERF_SAMPLE_PERIOD) {
 		period = *(u64 *)more_data;
 		more_data += sizeof(u64);
 	}
 
-	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
+	dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
@@ -1160,7 +1162,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 		(void *)(long)ip,
 		(long long)period);
 
-	if (event->header.type & PERF_SAMPLE_CALLCHAIN) {
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		int i;
 
 		chain = (void *)more_data;
@@ -1352,10 +1354,10 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	trace_event(event);
 
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
-		return process_overflow_event(event, offset, head);
-
 	switch (event->header.type) {
+	case PERF_EVENT_SAMPLE:
+		return process_sample_event(event, offset, head);
+
 	case PERF_EVENT_MMAP:
 		return process_mmap_event(event, offset, head);
 
@@ -1388,18 +1390,21 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 
 static struct perf_header	*header;
 
-static int perf_header__has_sample(u64 sample_mask)
+static u64 perf_header__sample_type(void)
 {
+	u64 sample_type = 0;
 	int i;
 
 	for (i = 0; i < header->attrs; i++) {
 		struct perf_header_attr *attr = header->attr[i];
 
-		if (!(attr->attr.sample_type & sample_mask))
-			return 0;
+		if (!sample_type)
+			sample_type = attr->attr.sample_type;
+		else if (sample_type != attr->attr.sample_type)
+			die("non matching sample_type");
 	}
 
-	return 1;
+	return sample_type;
 }
 
 static int __cmd_report(void)
@@ -1437,8 +1442,9 @@ static int __cmd_report(void)
 	header = perf_header__read(input);
 	head = header->data_offset;
 
-	if (sort__has_parent &&
-	    !perf_header__has_sample(PERF_SAMPLE_CALLCHAIN)) {
+	sample_type = perf_header__sample_type();
+
+	if (sort__has_parent && !(sample_type & PERF_SAMPLE_CALLCHAIN)) {
 		fprintf(stderr, "selected --sort parent, but no callchain data\n");
 		exit(-1);
 	}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 5352b5e352ed..cf0d21f1ae10 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -392,11 +392,11 @@ static void record_ip(u64 ip, int counter)
 	samples--;
 }
 
-static void process_event(u64 ip, int counter)
+static void process_event(u64 ip, int counter, int user)
 {
 	samples++;
 
-	if (ip < min_ip || ip > max_ip) {
+	if (user) {
 		userspace_samples++;
 		return;
 	}
@@ -509,9 +509,10 @@ static void mmap_read_counter(struct mmap_data *md)
 
 		old += size;
 
-		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-			if (event->header.type & PERF_SAMPLE_IP)
-				process_event(event->ip.ip, md->counter);
+		if (event->header.type == PERF_EVENT_SAMPLE) {
+			int user =
+	(event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER;
+			process_event(event->ip.ip, md->counter, user);
 		}
 	}
 
-- 
cgit v1.2.3-71-gd317


From 5211a242d0cbdded372aee59da18f80552b0a80a Mon Sep 17 00:00:00 2001
From: Kurt Garloff <garloff@suse.de>
Date: Wed, 24 Jun 2009 14:32:11 -0700
Subject: x86: Add sysctl to allow panic on IOCK NMI error

This patch introduces a new sysctl:

    /proc/sys/kernel/panic_on_io_nmi

which defaults to 0 (off).

When enabled, the kernel panics when the kernel receives an NMI
caused by an IO error.

The IO error triggered NMI indicates a serious system
condition, which could result in IO data corruption. Rather
than continuing, panicking and dumping might be a better choice,
so one can figure out what's causing the IO error.

This could be especially important to companies running IO
intensive applications where corruption must be avoided, e.g. a
bank's databases.

[ SuSE has been shipping it for a while, it was done at the
  request of a large database vendor, for their users. ]

Signed-off-by: Kurt Garloff <garloff@suse.de>
Signed-off-by: Roberto Angelino <robertangelino@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
LKML-Reference: <20090624213211.GA11291@kroah.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/dumpstack.c | 1 +
 arch/x86/kernel/traps.c     | 3 +++
 include/linux/kernel.h      | 1 +
 kernel/sysctl.c             | 8 ++++++++
 4 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 95ea5fa7d444..c8405718a4c3 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,6 +22,7 @@
 #include "dumpstack.h"
 
 int panic_on_unrecovered_nmi;
+int panic_on_io_nmi;
 unsigned int code_bytes = 64;
 int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
 static int die_counter;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a0f48f5671c0..5204332f475d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
 	show_registers(regs);
 
+	if (panic_on_io_nmi)
+		panic("NMI IOCK error: Not continuing");
+
 	/* Re-enable the IOCK line, wait for a few seconds */
 	reason = (reason & 0xf) | 8;
 	outb(reason, 0x61);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index fac104e7186a..d6320a3e8def 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -303,6 +303,7 @@ extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in
 extern int panic_timeout;
 extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
+extern int panic_on_io_nmi;
 extern const char *print_tainted(void);
 extern void add_taint(unsigned flag);
 extern int test_taint(unsigned flag);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b5..fba42eda8de2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "panic_on_io_nmi",
+		.data		= &panic_on_io_nmi,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= KERN_BOOTLOADER_TYPE,
 		.procname	= "bootloader_type",
-- 
cgit v1.2.3-71-gd317


From 5e955245d6cf49c5ed26c7add7392ff5a6762bf4 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 23 Jun 2009 11:27:27 +0000
Subject: ide: always kill the whole request on error

* Use blk_rq_bytes() instead of obsolete ide_rq_bytes() in ide_kill_rq()
  and ide_floppy_do_request() for failed requests.
  [ bugfix part ]

* Use blk_rq_bytes() instead of obsolete ide_rq_bytes() in ide_do_devset()
  and ide_complete_drive_reset().  Then remove ide_rq_bytes().
  [ cleanup part ]

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ide/ide-devsets.c |  2 +-
 drivers/ide/ide-eh.c      |  2 +-
 drivers/ide/ide-floppy.c  |  2 +-
 drivers/ide/ide-io.c      | 14 ++------------
 include/linux/ide.h       |  1 -
 5 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index 5bf958e5b1d5..1099bf7cf968 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -183,6 +183,6 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
 	err = setfunc(drive, *(int *)&rq->cmd[1]);
 	if (err)
 		rq->errors = err;
-	ide_complete_rq(drive, err, ide_rq_bytes(rq));
+	ide_complete_rq(drive, err, blk_rq_bytes(rq));
 	return ide_stopped;
 }
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 2b9141979613..e9abf2c3c335 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -149,7 +149,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 	if (rq && blk_special_request(rq) && rq->cmd[0] == REQ_DRIVE_RESET) {
 		if (err <= 0 && rq->errors == 0)
 			rq->errors = -EIO;
-		ide_complete_rq(drive, err ? err : 0, ide_rq_bytes(rq));
+		ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq));
 	}
 }
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 8b3f204f7d73..fefbdfc8db06 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -293,7 +293,7 @@ out_end:
 	drive->failed_pc = NULL;
 	if (blk_fs_request(rq) == 0 && rq->errors == 0)
 		rq->errors = -EIO;
-	ide_complete_rq(drive, -EIO, ide_rq_bytes(rq));
+	ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
 	return ide_stopped;
 }
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 95db5f03f6a2..d5f3c77beadd 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -112,16 +112,6 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
 	}
 }
 
-/* obsolete, blk_rq_bytes() should be used instead */
-unsigned int ide_rq_bytes(struct request *rq)
-{
-	if (blk_pc_request(rq))
-		return blk_rq_bytes(rq);
-	else
-		return blk_rq_cur_sectors(rq) << 9;
-}
-EXPORT_SYMBOL_GPL(ide_rq_bytes);
-
 int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes)
 {
 	ide_hwif_t *hwif = drive->hwif;
@@ -152,14 +142,14 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 
 	if ((media == ide_floppy || media == ide_tape) && drv_req) {
 		rq->errors = 0;
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
 	} else {
 		if (media == ide_tape)
 			rq->errors = IDE_DRV_ERROR_GENERAL;
 		else if (blk_fs_request(rq) == 0 && rq->errors == 0)
 			rq->errors = -EIO;
-		ide_complete_rq(drive, -EIO, ide_rq_bytes(rq));
 	}
+
+	ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
 }
 
 static void ide_tf_set_specify_cmd(ide_drive_t *drive, struct ide_taskfile *tf)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index cf1f3888067c..c6af7c44d46c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1062,7 +1062,6 @@ int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned l
 extern int ide_vlb_clk;
 extern int ide_pci_clk;
 
-unsigned int ide_rq_bytes(struct request *);
 int ide_end_rq(ide_drive_t *, struct request *, int, unsigned int);
 void ide_kill_rq(ide_drive_t *, struct request *);
 
-- 
cgit v1.2.3-71-gd317


From 84261923d3dddb766736023bead6fa07b7e218d5 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 17 Jun 2009 10:53:47 -0300
Subject: KVM: protect concurrent make_all_cpus_request

make_all_cpus_request contains a race condition which can
trigger a false "request completed" status, as follows:

CPU0                                              CPU1

if (test_and_set_bit(req,&vcpu->requests))
   ....                                        	   if (test_and_set_bit(req,&vcpu->requests))
   ..                                                  return
proceed to smp_call_function_many(wait=1)

Use a spinlock to serialize concurrent CPUs.

Cc: stable@kernel.org
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index aacc5449f586..16713dc672e4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -125,6 +125,7 @@ struct kvm_kernel_irq_routing_entry {
 struct kvm {
 	struct mutex lock; /* protects the vcpus array and APIC accesses */
 	spinlock_t mmu_lock;
+	spinlock_t requests_lock;
 	struct rw_semaphore slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	int nmemslots;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 013a5b3e9f75..2884baf1d5f9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -746,6 +746,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 		cpumask_clear(cpus);
 
 	me = get_cpu();
+	spin_lock(&kvm->requests_lock);
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		vcpu = kvm->vcpus[i];
 		if (!vcpu)
@@ -762,6 +763,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 		smp_call_function_many(cpus, ack_flush, NULL, 1);
 	else
 		called = false;
+	spin_unlock(&kvm->requests_lock);
 	put_cpu();
 	free_cpumask_var(cpus);
 	return called;
@@ -982,6 +984,7 @@ static struct kvm *kvm_create_vm(void)
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
 	spin_lock_init(&kvm->mmu_lock);
+	spin_lock_init(&kvm->requests_lock);
 	kvm_io_bus_init(&kvm->pio_bus);
 	mutex_init(&kvm->lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
-- 
cgit v1.2.3-71-gd317


From 94e5d714f604d4cb4cb13163f01ede278e69258b Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Fri, 26 Jun 2009 14:05:27 -0400
Subject: integrity: add ima_counts_put (updated)

This patch fixes an imbalance message as reported by J.R. Okajima.
The IMA file counters are incremented in ima_path_check. If the
actual open fails, such as ETXTBSY, decrement the counters to
prevent unnecessary imbalance messages.

Reported-by: J.R. Okajima <hooanon05@yahoo.co.jp>
Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/namei.c                        |  7 +++++++
 include/linux/ima.h               |  6 ++++++
 security/integrity/ima/ima_main.c | 29 ++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 5b961eb71cbf..f3c5b278895a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1761,6 +1761,10 @@ do_last:
 			goto exit;
 		}
 		filp = nameidata_to_filp(&nd, open_flag);
+		if (IS_ERR(filp))
+			ima_counts_put(&nd.path,
+				       acc_mode & (MAY_READ | MAY_WRITE |
+						   MAY_EXEC));
 		mnt_drop_write(nd.path.mnt);
 		if (nd.root.mnt)
 			path_put(&nd.root);
@@ -1817,6 +1821,9 @@ ok:
 		goto exit;
 	}
 	filp = nameidata_to_filp(&nd, open_flag);
+	if (IS_ERR(filp))
+		ima_counts_put(&nd.path,
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
 	/*
 	 * It is now safe to drop the mnt write
 	 * because the filp has had a write taken
diff --git a/include/linux/ima.h b/include/linux/ima.h
index b1b827d091a9..0e3f2a4c25f6 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -24,6 +24,7 @@ extern int ima_path_check(struct path *path, int mask, int update_counts);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern void ima_counts_get(struct file *file);
+extern void ima_counts_put(struct path *path, int mask);
 
 #else
 static inline int ima_bprm_check(struct linux_binprm *bprm)
@@ -60,5 +61,10 @@ static inline void ima_counts_get(struct file *file)
 {
 	return;
 }
+
+static inline void ima_counts_put(struct path *path, int mask)
+{
+	return;
+}
 #endif /* CONFIG_IMA_H */
 #endif /* _LINUX_IMA_H */
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 6f611874d10e..101c512564ec 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -238,7 +238,34 @@ out:
 }
 
 /*
- * ima_opens_get - increment file counts
+ * ima_counts_put - decrement file counts
+ *
+ * File counts are incremented in ima_path_check. On file open
+ * error, such as ETXTBSY, decrement the counts to prevent
+ * unnecessary imbalance messages.
+ */
+void ima_counts_put(struct path *path, int mask)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct ima_iint_cache *iint;
+
+	if (!ima_initialized || !S_ISREG(inode->i_mode))
+		return;
+	iint = ima_iint_find_insert_get(inode);
+	if (!iint)
+		return;
+
+	mutex_lock(&iint->mutex);
+	iint->opencount--;
+	if ((mask & MAY_WRITE) || (mask == 0))
+		iint->writecount--;
+	else if (mask & (MAY_READ | MAY_EXEC))
+		iint->readcount--;
+	mutex_unlock(&iint->mutex);
+}
+
+/*
+ * ima_counts_get - increment file counts
  *
  * - for IPC shm and shmat file.
  * - for nfsd exported files.
-- 
cgit v1.2.3-71-gd317


From 8a3af79361e85db6fec4173ef1916322471c19e3 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 29 Jun 2009 14:28:27 +0200
Subject: netfilter: headers_check fix: linux/netfilter/xt_osf.h

fix the following 'make headers_check' warnings:

  usr/include/linux/netfilter/xt_osf.h:40: found __[us]{8,16,32,64} type without #include <linux/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/xt_osf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_osf.h b/include/linux/netfilter/xt_osf.h
index fd2272e0959a..18afa495f973 100644
--- a/include/linux/netfilter/xt_osf.h
+++ b/include/linux/netfilter/xt_osf.h
@@ -20,6 +20,8 @@
 #ifndef _XT_OSF_H
 #define _XT_OSF_H
 
+#include <linux/types.h>
+
 #define MAXGENRELEN		32
 
 #define XT_OSF_GENRE		(1<<0)
-- 
cgit v1.2.3-71-gd317


From d6d3f08b0fd998b647a05540cedd11a067b72867 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Mon, 29 Jun 2009 14:31:46 +0200
Subject: netfilter: xtables: conntrack match revision 2

As reported by Philip, the UNTRACKED state bit does not fit within
the 8-bit state_mask member. Enlarge state_mask and give status_mask
a few more bits too.

Reported-by: Philip Craig <philipc@snapgear.com>
References: http://markmail.org/thread/b7eg6aovfh4agyz7
Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/xt_conntrack.h | 13 +++++++
 net/netfilter/xt_conntrack.c           | 66 ++++++++++++++++++++++++++++++----
 2 files changed, 73 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h
index 3430c7751948..7ae05338e94c 100644
--- a/include/linux/netfilter/xt_conntrack.h
+++ b/include/linux/netfilter/xt_conntrack.h
@@ -81,4 +81,17 @@ struct xt_conntrack_mtinfo1 {
 	__u8 state_mask, status_mask;
 };
 
+struct xt_conntrack_mtinfo2 {
+	union nf_inet_addr origsrc_addr, origsrc_mask;
+	union nf_inet_addr origdst_addr, origdst_mask;
+	union nf_inet_addr replsrc_addr, replsrc_mask;
+	union nf_inet_addr repldst_addr, repldst_mask;
+	__u32 expires_min, expires_max;
+	__u16 l4proto;
+	__be16 origsrc_port, origdst_port;
+	__be16 replsrc_port, repldst_port;
+	__u16 match_flags, invert_flags;
+	__u16 state_mask, status_mask;
+};
+
 #endif /*_XT_CONNTRACK_H*/
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 0b7139f3dd78..fc581800698e 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -129,7 +129,7 @@ conntrack_addrcmp(const union nf_inet_addr *kaddr,
 
 static inline bool
 conntrack_mt_origsrc(const struct nf_conn *ct,
-                     const struct xt_conntrack_mtinfo1 *info,
+                     const struct xt_conntrack_mtinfo2 *info,
 		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
@@ -138,7 +138,7 @@ conntrack_mt_origsrc(const struct nf_conn *ct,
 
 static inline bool
 conntrack_mt_origdst(const struct nf_conn *ct,
-                     const struct xt_conntrack_mtinfo1 *info,
+                     const struct xt_conntrack_mtinfo2 *info,
 		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3,
@@ -147,7 +147,7 @@ conntrack_mt_origdst(const struct nf_conn *ct,
 
 static inline bool
 conntrack_mt_replsrc(const struct nf_conn *ct,
-                     const struct xt_conntrack_mtinfo1 *info,
+                     const struct xt_conntrack_mtinfo2 *info,
 		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3,
@@ -156,7 +156,7 @@ conntrack_mt_replsrc(const struct nf_conn *ct,
 
 static inline bool
 conntrack_mt_repldst(const struct nf_conn *ct,
-                     const struct xt_conntrack_mtinfo1 *info,
+                     const struct xt_conntrack_mtinfo2 *info,
 		     u_int8_t family)
 {
 	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3,
@@ -164,7 +164,7 @@ conntrack_mt_repldst(const struct nf_conn *ct,
 }
 
 static inline bool
-ct_proto_port_check(const struct xt_conntrack_mtinfo1 *info,
+ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
                     const struct nf_conn *ct)
 {
 	const struct nf_conntrack_tuple *tuple;
@@ -204,7 +204,7 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo1 *info,
 static bool
 conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_conntrack_mtinfo1 *info = par->matchinfo;
+	const struct xt_conntrack_mtinfo2 *info = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
 	unsigned int statebit;
@@ -278,6 +278,16 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	return true;
 }
 
+static bool
+conntrack_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct xt_conntrack_mtinfo2 *const *info = par->matchinfo;
+	struct xt_match_param newpar = *par;
+
+	newpar.matchinfo = *info;
+	return conntrack_mt(skb, &newpar);
+}
+
 static bool conntrack_mt_check(const struct xt_mtchk_param *par)
 {
 	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
@@ -288,11 +298,45 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
+static bool conntrack_mt_check_v1(const struct xt_mtchk_param *par)
+{
+	struct xt_conntrack_mtinfo1 *info = par->matchinfo;
+	struct xt_conntrack_mtinfo2 *up;
+	int ret = conntrack_mt_check(par);
+
+	if (ret < 0)
+		return ret;
+
+	up = kmalloc(sizeof(*up), GFP_KERNEL);
+	if (up == NULL) {
+		nf_ct_l3proto_module_put(par->family);
+		return -ENOMEM;
+	}
+
+	/*
+	 * The strategy here is to minimize the overhead of v1 matching,
+	 * by prebuilding a v2 struct and putting the pointer into the
+	 * v1 dataspace.
+	 */
+	memcpy(up, info, offsetof(typeof(*info), state_mask));
+	up->state_mask  = info->state_mask;
+	up->status_mask = info->status_mask;
+	*(void **)info  = up;
+	return true;
+}
+
 static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	nf_ct_l3proto_module_put(par->family);
 }
 
+static void conntrack_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+	struct xt_conntrack_mtinfo2 **info = par->matchinfo;
+	kfree(*info);
+	conntrack_mt_destroy(par);
+}
+
 #ifdef CONFIG_COMPAT
 struct compat_xt_conntrack_info
 {
@@ -363,6 +407,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
 		.revision   = 1,
 		.family     = NFPROTO_UNSPEC,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
+		.match      = conntrack_mt_v1,
+		.checkentry = conntrack_mt_check_v1,
+		.destroy    = conntrack_mt_destroy_v1,
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "conntrack",
+		.revision   = 2,
+		.family     = NFPROTO_UNSPEC,
+		.matchsize  = sizeof(struct xt_conntrack_mtinfo2),
 		.match      = conntrack_mt,
 		.checkentry = conntrack_mt_check,
 		.destroy    = conntrack_mt_destroy,
-- 
cgit v1.2.3-71-gd317


From e6ce3066010a21bde961d8f8cefe0b69cae78a0f Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Mon, 29 Jun 2009 14:31:58 +0800
Subject: fs: allow d_instantiate to be called with negative parent dentry

The new fsnotify infrastructure (starting at 90586523) causes an oops in
spufs, where we populate a directory with files before instantiating the
directory itself. The new changes seem to have introduced an assumption
that a dentry's parent will be positive when instantiating.

This change makes it once again possible to d_instantiate a dentry
with a negative parent, and brings __fsnotify_d_instantiate() into
line with inotify_d_instantiate(), which already has this NULL check.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/fsnotify_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 44848aa830dc..6c3de999fb34 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -280,7 +280,7 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
 	assert_spin_locked(&dentry->d_lock);
 
 	parent = dentry->d_parent;
-	if (fsnotify_inode_watches_children(parent->d_inode))
+	if (parent->d_inode && fsnotify_inode_watches_children(parent->d_inode))
 		dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
 	else
 		dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
-- 
cgit v1.2.3-71-gd317


From 2bf427b25b79eb7cea27963a66c3d4684cae0e0c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Jun 2009 19:20:42 -0700
Subject: ide: fix resume for CONFIG_BLK_DEV_IDEACPI=y

commit 2f0d0fd2a605666d38e290c5c0d2907484352dc4 ("ide-acpi: cleanup
do_drive_get_GTF()") didn't account for the lack of hwif->acpidata
check in generic_ide_suspend() [ indirect user of do_drive_get_GTF()
through ide_acpi_exec_tfs() ] resulting in broken resume when ACPI
support is enabled but ACPI data is unavailable.

Fix it by adding ide_port_acpi() helper for checking if port needs
ACPI handling and cleaning generic_ide_{suspend,resume}() to use it
instead of hiding hwif->acpidata and ide_noacpi checks in IDE ACPI
helpers (this should help in preventing similar bugs in the future).

While at it:
- kill superfluous debugging printks in ide_acpi_{get,push}_timing()

Reported-and-tested-by: Etienne Basset <etienne.basset@numericable.fr>
Also-reported-and-tested-by: Jeff Chua <jeff.chua.linux@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ide/ide-acpi.c | 37 +++++++------------------------------
 drivers/ide/ide-pm.c   | 30 ++++++++++++++++++------------
 include/linux/ide.h    |  2 ++
 3 files changed, 27 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c
index 77f79d26b264..c509c9916464 100644
--- a/drivers/ide/ide-acpi.c
+++ b/drivers/ide/ide-acpi.c
@@ -92,6 +92,11 @@ int ide_acpi_init(void)
 	return 0;
 }
 
+bool ide_port_acpi(ide_hwif_t *hwif)
+{
+	return ide_noacpi == 0 && hwif->acpidata;
+}
+
 /**
  * ide_get_dev_handle - finds acpi_handle and PCI device.function
  * @dev: device to locate
@@ -352,9 +357,6 @@ int ide_acpi_exec_tfs(ide_drive_t *drive)
 	unsigned long	gtf_address;
 	unsigned long	obj_loc;
 
-	if (ide_noacpi)
-		return 0;
-
 	DEBPRINT("call get_GTF, drive=%s port=%d\n", drive->name, drive->dn);
 
 	ret = do_drive_get_GTF(drive, &gtf_length, &gtf_address, &obj_loc);
@@ -389,16 +391,6 @@ void ide_acpi_get_timing(ide_hwif_t *hwif)
 	struct acpi_buffer	output;
 	union acpi_object 	*out_obj;
 
-	if (ide_noacpi)
-		return;
-
-	DEBPRINT("ENTER:\n");
-
-	if (!hwif->acpidata) {
-		DEBPRINT("no ACPI data for %s\n", hwif->name);
-		return;
-	}
-
 	/* Setting up output buffer for _GTM */
 	output.length = ACPI_ALLOCATE_BUFFER;
 	output.pointer = NULL;	/* ACPI-CA sets this; save/free it later */
@@ -479,16 +471,6 @@ void ide_acpi_push_timing(ide_hwif_t *hwif)
 	struct ide_acpi_drive_link	*master = &hwif->acpidata->master;
 	struct ide_acpi_drive_link	*slave = &hwif->acpidata->slave;
 
-	if (ide_noacpi)
-		return;
-
-	DEBPRINT("ENTER:\n");
-
-	if (!hwif->acpidata) {
-		DEBPRINT("no ACPI data for %s\n", hwif->name);
-		return;
-	}
-
 	/* Give the GTM buffer + drive Identify data to the channel via the
 	 * _STM method: */
 	/* setup input parameters buffer for _STM */
@@ -527,16 +509,11 @@ void ide_acpi_set_state(ide_hwif_t *hwif, int on)
 	ide_drive_t *drive;
 	int i;
 
-	if (ide_noacpi || ide_noacpi_psx)
+	if (ide_noacpi_psx)
 		return;
 
 	DEBPRINT("ENTER:\n");
 
-	if (!hwif->acpidata) {
-		DEBPRINT("no ACPI data for %s\n", hwif->name);
-		return;
-	}
-
 	/* channel first and then drives for power on and verse versa for power off */
 	if (on)
 		acpi_bus_set_power(hwif->acpidata->obj_handle, ACPI_STATE_D0);
@@ -616,7 +593,7 @@ void ide_acpi_port_init_devices(ide_hwif_t *hwif)
 				 drive->name, err);
 	}
 
-	if (!ide_acpionboot) {
+	if (ide_noacpi || ide_acpionboot == 0) {
 		DEBPRINT("ACPI methods disabled on boot\n");
 		return;
 	}
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index c14ca144cffe..ad7be2669dcb 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -10,9 +10,11 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	struct request_pm_state rqpm;
 	int ret;
 
-	/* call ACPI _GTM only once */
-	if ((drive->dn & 1) == 0 || pair == NULL)
-		ide_acpi_get_timing(hwif);
+	if (ide_port_acpi(hwif)) {
+		/* call ACPI _GTM only once */
+		if ((drive->dn & 1) == 0 || pair == NULL)
+			ide_acpi_get_timing(hwif);
+	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
@@ -26,9 +28,11 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	ret = blk_execute_rq(drive->queue, NULL, rq, 0);
 	blk_put_request(rq);
 
-	/* call ACPI _PS3 only after both devices are suspended */
-	if (ret == 0 && ((drive->dn & 1) || pair == NULL))
-		ide_acpi_set_state(hwif, 0);
+	if (ret == 0 && ide_port_acpi(hwif)) {
+		/* call ACPI _PS3 only after both devices are suspended */
+		if ((drive->dn & 1) || pair == NULL)
+			ide_acpi_set_state(hwif, 0);
+	}
 
 	return ret;
 }
@@ -42,13 +46,15 @@ int generic_ide_resume(struct device *dev)
 	struct request_pm_state rqpm;
 	int err;
 
-	/* call ACPI _PS0 / _STM only once */
-	if ((drive->dn & 1) == 0 || pair == NULL) {
-		ide_acpi_set_state(hwif, 1);
-		ide_acpi_push_timing(hwif);
-	}
+	if (ide_port_acpi(hwif)) {
+		/* call ACPI _PS0 / _STM only once */
+		if ((drive->dn & 1) == 0 || pair == NULL) {
+			ide_acpi_set_state(hwif, 1);
+			ide_acpi_push_timing(hwif);
+		}
 
-	ide_acpi_exec_tfs(drive);
+		ide_acpi_exec_tfs(drive);
+	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c6af7c44d46c..edc93a6d931d 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1419,6 +1419,7 @@ static inline void ide_dma_unmap_sg(ide_drive_t *drive,
 
 #ifdef CONFIG_BLK_DEV_IDEACPI
 int ide_acpi_init(void);
+bool ide_port_acpi(ide_hwif_t *hwif);
 extern int ide_acpi_exec_tfs(ide_drive_t *drive);
 extern void ide_acpi_get_timing(ide_hwif_t *hwif);
 extern void ide_acpi_push_timing(ide_hwif_t *hwif);
@@ -1427,6 +1428,7 @@ void ide_acpi_port_init_devices(ide_hwif_t *);
 extern void ide_acpi_set_state(ide_hwif_t *hwif, int on);
 #else
 static inline int ide_acpi_init(void) { return 0; }
+static inline bool ide_port_acpi(ide_hwif_t *hwif) { return 0; }
 static inline int ide_acpi_exec_tfs(ide_drive_t *drive) { return 0; }
 static inline void ide_acpi_get_timing(ide_hwif_t *hwif) { ; }
 static inline void ide_acpi_push_timing(ide_hwif_t *hwif) { ; }
-- 
cgit v1.2.3-71-gd317


From 57e7986ed142417498155ebcd5eaf617ac37136d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 30 Jun 2009 16:07:19 +1000
Subject: perf_counter: Provide a way to enable counters on exec

This provides a way to mark a counter to be enabled on the next
exec. This is useful for measuring the total activity of a
program without including overhead from the process that
launches it.

This also changes the perf stat command to use this new
facility.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <19017.43927.838745.689203@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  3 ++-
 kernel/perf_counter.c        | 50 ++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/builtin-stat.c    |  6 +++---
 3 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3078e23c91eb..5e970c7d3fd5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -179,8 +179,9 @@ struct perf_counter_attr {
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 				inherit_stat   :  1, /* per task counts       */
+				enable_on_exec :  1, /* next exec enables     */
 
-				__reserved_1   : 52;
+				__reserved_1   : 51;
 
 	__u32			wakeup_events;	/* wakeup every n events */
 	__u32			__reserved_2;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 66ab1e9d1294..d55a50da2347 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1428,6 +1428,53 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 		perf_counter_task_sched_in(curr, cpu);
 }
 
+/*
+ * Enable all of a task's counters that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_counter_enable_on_exec(struct task_struct *task)
+{
+	struct perf_counter_context *ctx;
+	struct perf_counter *counter;
+	unsigned long flags;
+	int enabled = 0;
+
+	local_irq_save(flags);
+	ctx = task->perf_counter_ctxp;
+	if (!ctx || !ctx->nr_counters)
+		goto out;
+
+	__perf_counter_task_sched_out(ctx);
+
+	spin_lock(&ctx->lock);
+
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (!counter->attr.enable_on_exec)
+			continue;
+		counter->attr.enable_on_exec = 0;
+		if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+			continue;
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled =
+			ctx->time - counter->total_time_enabled;
+		enabled = 1;
+	}
+
+	/*
+	 * Unclone this context if we enabled any counter.
+	 */
+	if (enabled && ctx->parent_ctx) {
+		put_ctx(ctx->parent_ctx);
+		ctx->parent_ctx = NULL;
+	}
+
+	spin_unlock(&ctx->lock);
+
+	perf_counter_task_sched_in(task, smp_processor_id());
+ out:
+	local_irq_restore(flags);
+}
+
 /*
  * Cross CPU call to read the hardware counter
  */
@@ -2949,6 +2996,9 @@ void perf_counter_comm(struct task_struct *task)
 {
 	struct perf_comm_event comm_event;
 
+	if (task->perf_counter_ctxp)
+		perf_counter_enable_on_exec(task);
+
 	if (!atomic_read(&nr_comm_counters))
 		return;
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 201ef2367dcb..2e03524a1de0 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -116,8 +116,9 @@ static void create_perf_stat_counter(int counter, int pid)
 					fd[cpu][counter], strerror(errno));
 		}
 	} else {
-		attr->inherit	= inherit;
-		attr->disabled	= 1;
+		attr->inherit	     = inherit;
+		attr->disabled	     = 1;
+		attr->enable_on_exec = 1;
 
 		fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0);
 		if (fd[0][counter] < 0 && verbose)
@@ -262,7 +263,6 @@ static int run_perf_stat(int argc, const char **argv)
 	 * Enable counters and exec the command:
 	 */
 	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
 
 	close(go_pipe[1]);
 	wait(&status);
-- 
cgit v1.2.3-71-gd317


From e0a43ddcc08c34dbd666d93600fd23914505f4aa Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 30 Jun 2009 20:12:23 +0200
Subject: fuse: allow umask processing in userspace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch lets filesystems handle masking the file mode on creation.
This is needed if the filesystem is using ACLs.

 - The CREATE, MKDIR and MKNOD requests are extended with a "umask"
   parameter.

 - A new FUSE_DONT_MASK flag is added to the INIT request/reply.  With
   this the filesystem may request that the create mode is not masked.

CC: Jean-Pierre André <jean-pierre.andre@wanadoo.fr>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c        | 20 +++++++++++++++++---
 fs/fuse/fuse_i.h     |  3 +++
 fs/fuse/inode.c      |  9 ++++++++-
 include/linux/fuse.h | 20 ++++++++++++++++++--
 4 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b3089a083d30..6b700734e519 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -375,7 +375,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req;
 	struct fuse_req *forget_req;
-	struct fuse_open_in inarg;
+	struct fuse_create_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
 	struct fuse_file *ff;
@@ -399,15 +399,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (!ff)
 		goto out_put_request;
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	flags &= ~O_NOCTTY;
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outentry, 0, sizeof(outentry));
 	inarg.flags = flags;
 	inarg.mode = mode;
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_CREATE;
 	req->in.h.nodeid = get_node_id(dir);
 	req->in.numargs = 2;
-	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
+						sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->in.args[1].size = entry->d_name.len + 1;
 	req->in.args[1].value = entry->d_name.name;
@@ -546,12 +551,17 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
 	inarg.rdev = new_encode_dev(rdev);
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_MKNOD;
 	req->in.numargs = 2;
-	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
+						sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->in.args[1].size = entry->d_name.len + 1;
 	req->in.args[1].value = entry->d_name.name;
@@ -578,8 +588,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_MKDIR;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index aaf2f9ff970e..ede4f77b2d6c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -446,6 +446,9 @@ struct fuse_conn {
 	/** Do multi-page cached writes */
 	unsigned big_writes:1;
 
+	/** Don't apply umask to creation modes */
+	unsigned dont_mask:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d8673ccf90b7..6cc501bd0187 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -725,6 +725,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 			}
 			if (arg->flags & FUSE_BIG_WRITES)
 				fc->big_writes = 1;
+			if (arg->flags & FUSE_DONT_MASK)
+				fc->dont_mask = 1;
 		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 			fc->no_lock = 1;
@@ -748,7 +750,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
 	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
-		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES;
+		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -864,6 +866,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto err_put_conn;
 
+	/* Handle umasking inside the fuse code */
+	if (sb->s_flags & MS_POSIXACL)
+		fc->dont_mask = 1;
+	sb->s_flags |= MS_POSIXACL;
+
 	fc->release = fuse_free_conn;
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index d41ed593f79f..e2b816a62488 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -25,6 +25,9 @@
  *  - add IOCTL message
  *  - add unsolicited notification support
  *  - add POLL message and NOTIFY_POLL notification
+ *
+ * 7.12
+ *  - add umask flag to input argument of open, mknod and mkdir
  */
 
 #ifndef _LINUX_FUSE_H
@@ -36,7 +39,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 11
+#define FUSE_KERNEL_MINOR_VERSION 12
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -112,6 +115,7 @@ struct fuse_file_lock {
  * INIT request/reply flags
  *
  * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
+ * FUSE_DONT_MASK: don't apply umask to file mode on create operations
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -119,6 +123,7 @@ struct fuse_file_lock {
 #define FUSE_ATOMIC_O_TRUNC	(1 << 3)
 #define FUSE_EXPORT_SUPPORT	(1 << 4)
 #define FUSE_BIG_WRITES		(1 << 5)
+#define FUSE_DONT_MASK		(1 << 6)
 
 /**
  * CUSE INIT request/reply flags
@@ -262,14 +267,18 @@ struct fuse_attr_out {
 	struct fuse_attr attr;
 };
 
+#define FUSE_COMPAT_MKNOD_IN_SIZE 8
+
 struct fuse_mknod_in {
 	__u32	mode;
 	__u32	rdev;
+	__u32	umask;
+	__u32	padding;
 };
 
 struct fuse_mkdir_in {
 	__u32	mode;
-	__u32	padding;
+	__u32	umask;
 };
 
 struct fuse_rename_in {
@@ -300,8 +309,15 @@ struct fuse_setattr_in {
 };
 
 struct fuse_open_in {
+	__u32	flags;
+	__u32	unused;
+};
+
+struct fuse_create_in {
 	__u32	flags;
 	__u32	mode;
+	__u32	umask;
+	__u32	padding;
 };
 
 struct fuse_open_out {
-- 
cgit v1.2.3-71-gd317


From 3b463ae0c6264f70e5d4c0a9c46af20fed43c96e Mon Sep 17 00:00:00 2001
From: John Muir <muirj@nortel.com>
Date: Sun, 31 May 2009 11:13:57 -0400
Subject: fuse: invalidation reverse calls

Add notification messages that allow the filesystem to invalidate VFS
caches.

Two notifications are added:

 1) inode invalidation

   - invalidate cached attributes
   - invalidate a range of pages in the page cache (this is optional)

 2) dentry invalidation

   - try to invalidate a subtree in the dentry cache

Care must be taken while accessing the 'struct super_block' for the
mount, as it can go away while an invalidation is in progress.  To
prevent this, introduce a rw-semaphore, that is taken for read during
the invalidation and taken for write in the ->kill_sb callback.

Cc: Csaba Henk <csaba@gluster.com>
Cc: Anand Avati <avati@zresearch.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c        | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/dir.c        | 37 ++++++++++++++++++++++++
 fs/fuse/fuse_i.h     | 24 ++++++++++++++++
 fs/fuse/inode.c      | 59 ++++++++++++++++++++++++++++++++++++--
 include/linux/fuse.h | 16 +++++++++++
 5 files changed, 214 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8a11a8c67c42..f58ecbc416c8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -849,6 +849,81 @@ err:
 	return err;
 }
 
+static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_inode_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+				       outarg.off, outarg.len);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_entry_out outarg;
+	int err = -EINVAL;
+	char buf[FUSE_NAME_MAX+1];
+	struct qstr name;
+
+	if (size < sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+
+	err = -ENAMETOOLONG;
+	if (outarg.namelen > FUSE_NAME_MAX)
+		goto err;
+
+	name.name = buf;
+	name.len = outarg.namelen;
+	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+	buf[outarg.namelen] = 0;
+	name.hash = full_name_hash(name.name, name.len);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -856,6 +931,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_POLL:
 		return fuse_notify_poll(fc, size, cs);
 
+	case FUSE_NOTIFY_INVAL_INODE:
+		return fuse_notify_inval_inode(fc, size, cs);
+
+	case FUSE_NOTIFY_INVAL_ENTRY:
+		return fuse_notify_inval_entry(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 6b700734e519..e703654e7f40 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -859,6 +859,43 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 	return err;
 }
 
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+			     struct qstr *name)
+{
+	int err = -ENOTDIR;
+	struct inode *parent;
+	struct dentry *dir;
+	struct dentry *entry;
+
+	parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+	if (!parent)
+		return -ENOENT;
+
+	mutex_lock(&parent->i_mutex);
+	if (!S_ISDIR(parent->i_mode))
+		goto unlock;
+
+	err = -ENOENT;
+	dir = d_find_alias(parent);
+	if (!dir)
+		goto unlock;
+
+	entry = d_lookup(dir, name);
+	dput(dir);
+	if (!entry)
+		goto unlock;
+
+	fuse_invalidate_attr(parent);
+	fuse_invalidate_entry(entry);
+	dput(entry);
+	err = 0;
+
+ unlock:
+	mutex_unlock(&parent->i_mutex);
+	iput(parent);
+	return err;
+}
+
 /*
  * Calling into a user-controlled filesystem gives the filesystem
  * daemon ptrace-like capabilities over the requester process.  This
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ede4f77b2d6c..52b641fc0faf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -484,6 +484,12 @@ struct fuse_conn {
 
 	/** Called on final put */
 	void (*release)(struct fuse_conn *);
+
+	/** Super block for this connection. */
+	struct super_block *sb;
+
+	/** Read/write semaphore to hold when accessing sb. */
+	struct rw_semaphore killsb;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -511,6 +517,11 @@ extern const struct file_operations fuse_dev_operations;
 
 extern const struct dentry_operations fuse_dentry_operations;
 
+/**
+ * Inode to nodeid comparison.
+ */
+int fuse_inode_eq(struct inode *inode, void *_nodeidp);
+
 /**
  * Get a filled in inode
  */
@@ -711,6 +722,19 @@ void fuse_release_nowrite(struct inode *inode);
 
 u64 fuse_get_attr_version(struct fuse_conn *fc);
 
+/**
+ * File-system tells the kernel to invalidate cache for the given node id.
+ */
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+			     loff_t offset, loff_t len);
+
+/**
+ * File-system tells the kernel to invalidate parent attributes and
+ * the dentry matching parent/name.
+ */
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+			     struct qstr *name);
+
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
 ssize_t fuse_direct_io(struct file *file, const char __user *buf,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6cc501bd0187..f91ccc4a189d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -206,7 +206,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 		BUG();
 }
 
-static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+int fuse_inode_eq(struct inode *inode, void *_nodeidp)
 {
 	u64 nodeid = *(u64 *) _nodeidp;
 	if (get_node_id(inode) == nodeid)
@@ -257,6 +257,31 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 	return inode;
 }
 
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+			     loff_t offset, loff_t len)
+{
+	struct inode *inode;
+	pgoff_t pg_start;
+	pgoff_t pg_end;
+
+	inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		return -ENOENT;
+
+	fuse_invalidate_attr(inode);
+	if (offset >= 0) {
+		pg_start = offset >> PAGE_CACHE_SHIFT;
+		if (len <= 0)
+			pg_end = -1;
+		else
+			pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pg_start, pg_end);
+	}
+	iput(inode);
+	return 0;
+}
+
 static void fuse_umount_begin(struct super_block *sb)
 {
 	fuse_abort_conn(get_fuse_conn_super(sb));
@@ -480,6 +505,7 @@ void fuse_conn_init(struct fuse_conn *fc)
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
 	mutex_init(&fc->inst_mutex);
+	init_rwsem(&fc->killsb);
 	atomic_set(&fc->count, 1);
 	init_waitqueue_head(&fc->waitq);
 	init_waitqueue_head(&fc->blocked_waitq);
@@ -862,6 +888,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fuse_conn_init(fc);
 
 	fc->dev = sb->s_dev;
+	fc->sb = sb;
 	err = fuse_bdi_init(fc, sb);
 	if (err)
 		goto err_put_conn;
@@ -948,12 +975,25 @@ static int fuse_get_sb(struct file_system_type *fs_type,
 	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 
+static void fuse_kill_sb_anon(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	if (fc) {
+		down_write(&fc->killsb);
+		fc->sb = NULL;
+		up_write(&fc->killsb);
+	}
+
+	kill_anon_super(sb);
+}
+
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
 	.fs_flags	= FS_HAS_SUBTYPE,
 	.get_sb		= fuse_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= fuse_kill_sb_anon,
 };
 
 #ifdef CONFIG_BLOCK
@@ -965,11 +1005,24 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type,
 			   mnt);
 }
 
+static void fuse_kill_sb_blk(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	if (fc) {
+		down_write(&fc->killsb);
+		fc->sb = NULL;
+		up_write(&fc->killsb);
+	}
+
+	kill_block_super(sb);
+}
+
 static struct file_system_type fuseblk_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuseblk",
 	.get_sb		= fuse_get_sb_blk,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= fuse_kill_sb_blk,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
 
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index e2b816a62488..cf593bf9fd32 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -28,6 +28,8 @@
  *
  * 7.12
  *  - add umask flag to input argument of open, mknod and mkdir
+ *  - add notification messages for invalidation of inodes and
+ *    directory entries
  */
 
 #ifndef _LINUX_FUSE_H
@@ -229,6 +231,8 @@ enum fuse_opcode {
 
 enum fuse_notify_code {
 	FUSE_NOTIFY_POLL   = 1,
+	FUSE_NOTIFY_INVAL_INODE = 2,
+	FUSE_NOTIFY_INVAL_ENTRY = 3,
 	FUSE_NOTIFY_CODE_MAX,
 };
 
@@ -524,4 +528,16 @@ struct fuse_dirent {
 #define FUSE_DIRENT_SIZE(d) \
 	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
 
+struct fuse_notify_inval_inode_out {
+	__u64	ino;
+	__s64	off;
+	__s64	len;
+};
+
+struct fuse_notify_inval_entry_out {
+	__u64	parent;
+	__u32	namelen;
+	__u32	padding;
+};
+
 #endif /* _LINUX_FUSE_H */
-- 
cgit v1.2.3-71-gd317


From 133890103b9de08904f909995973e4b5c08a780e Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 30 Jun 2009 11:41:11 -0700
Subject: eventfd: revised interface and cleanups

Change the eventfd interface to de-couple the eventfd memory context, from
the file pointer instance.

Without such a change, there is no clean, race-free way to handle the
POLLHUP event sent when the last instance of the file* goes away.  Also,
the internal eventfd APIs now use the eventfd context instead of the
file*.

This patch is required by KVM's IRQfd code, which is still under
development.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/lguest/lg.h          |   2 +-
 drivers/lguest/lguest_user.c |   4 +-
 fs/aio.c                     |  24 +++------
 fs/eventfd.c                 | 122 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/aio.h          |   4 +-
 include/linux/eventfd.h      |  35 ++++++++++---
 6 files changed, 149 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index d4e8979735cb..9c3138265f8e 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -82,7 +82,7 @@ struct lg_cpu {
 
 struct lg_eventfd {
 	unsigned long addr;
-	struct file *event;
+	struct eventfd_ctx *event;
 };
 
 struct lg_eventfd_map {
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 32e297121058..9f9a2953b383 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -50,7 +50,7 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 
 	/* Now append new entry. */
 	new->map[new->num].addr = addr;
-	new->map[new->num].event = eventfd_fget(fd);
+	new->map[new->num].event = eventfd_ctx_fdget(fd);
 	if (IS_ERR(new->map[new->num].event)) {
 		kfree(new);
 		return PTR_ERR(new->map[new->num].event);
@@ -357,7 +357,7 @@ static int close(struct inode *inode, struct file *file)
 
 	/* Release any eventfds they registered. */
 	for (i = 0; i < lg->eventfds->num; i++)
-		fput(lg->eventfds->map[i].event);
+		eventfd_ctx_put(lg->eventfds->map[i].event);
 	kfree(lg->eventfds);
 
 	/* If lg->dead doesn't contain an error code it will be NULL or a
diff --git a/fs/aio.c b/fs/aio.c
index 76da12537956..d065b2c3273e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -485,6 +485,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work_struct *data)
 		/* Complete the fput(s) */
 		if (req->ki_filp != NULL)
 			__fput(req->ki_filp);
-		if (req->ki_eventfd != NULL)
-			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work_struct *data)
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
 		queue_work(aio_wq, &fput_work);
-	} else
+	} else {
+		req->ki_filp = NULL;
 		really_put_req(ctx, req);
+	}
 	return 1;
 }
 
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3f0e1974abdc..31d12de83a2a 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -14,35 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, int n)
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
+
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -185,6 +230,16 @@ static const struct file_operations eventfd_fops = {
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
 
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
+
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
@@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index b16a957030f8..47f7d932a01d 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -121,9 +121,9 @@ struct kiocb {
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
-	 * this is the underlying file* to deliver event to.
+	 * this is the underlying eventfd context to deliver events to.
 	 */
-	struct file		*ki_eventfd;
+	struct eventfd_ctx	*ki_eventfd;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index f45a8ae5f828..3b85ba6479f4 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -8,10 +8,8 @@
 #ifndef _LINUX_EVENTFD_H
 #define _LINUX_EVENTFD_H
 
-#ifdef CONFIG_EVENTFD
-
-/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
+#include <linux/file.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -27,16 +25,37 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+#ifdef CONFIG_EVENTFD
+
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
+void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
+struct eventfd_ctx *eventfd_ctx_fdget(int fd);
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
+int eventfd_signal(struct eventfd_ctx *ctx, int n);
 
 #else /* CONFIG_EVENTFD */
 
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-static inline int eventfd_signal(struct file *file, int n)
-{ return 0; }
+/*
+ * Ugly ugly ugly error layer to support modules that uses eventfd but
+ * pretend to work in !CONFIG_EVENTFD configurations. Namely, AIO.
+ */
+static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline int eventfd_signal(struct eventfd_ctx *ctx, int n)
+{
+	return -ENOSYS;
+}
+
+static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+
+}
 
-#endif /* CONFIG_EVENTFD */
+#endif
 
 #endif /* _LINUX_EVENTFD_H */
 
-- 
cgit v1.2.3-71-gd317


From b01e8dc34379f4ba2f454390e340a025edbaaa7e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 30 Jun 2009 11:41:18 -0700
Subject: alpha: fix percpu build breakage

alpha percpu access requires custom SHIFT_PERCPU_PTR() definition for
modules to work around addressing range limitation.  This is done via
generating inline assembly using C preprocessing which forces the
assembler to generate external reference.  This happens behind the
compiler's back and makes the compiler think that static percpu variables
in modules are unused.

This used to be worked around by using the __used attribute for percpu
variables, which prevents the compiler from omitting the variable; however,
the recent declaration/definition attribute unification change broke this, as
__used can't be used for declarations.  Also, in the process, the
PER_CPU_ATTRIBUTES definition in alpha percpu.h got broken.

This patch adds PER_CPU_DEF_ATTRIBUTES, which is only used for definitions,
and makes alpha use it to add __used for percpu variables in modules.  This
also fixes the PER_CPU_ATTRIBUTES double definition bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: maximilian attems <max@stro.at>
Acked-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/percpu.h | 6 +++---
 include/asm-generic/percpu.h    | 4 ++++
 include/linux/percpu-defs.h     | 3 ++-
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h
index 06c5c7a4afd3..b663f1f10b6a 100644
--- a/arch/alpha/include/asm/percpu.h
+++ b/arch/alpha/include/asm/percpu.h
@@ -30,7 +30,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 
 #ifndef MODULE
 #define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset))
-#define PER_CPU_ATTRIBUTES
+#define PER_CPU_DEF_ATTRIBUTES
 #else
 /*
  * To calculate addresses of locally defined variables, GCC uses 32-bit
@@ -49,7 +49,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 		: "=&r"(__ptr), "=&r"(tmp_gp));		\
 	(typeof(&per_cpu_var(var)))(__ptr + (offset)); })
 
-#define PER_CPU_ATTRIBUTES	__used
+#define PER_CPU_DEF_ATTRIBUTES	__used
 
 #endif /* MODULE */
 
@@ -71,7 +71,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define __get_cpu_var(var)		per_cpu_var(var)
 #define __raw_get_cpu_var(var)		per_cpu_var(var)
 
-#define PER_CPU_ATTRIBUTES
+#define PER_CPU_DEF_ATTRIBUTES
 
 #endif /* SMP */
 
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index d7d50d7ee51e..aa00800adacc 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -97,4 +97,8 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_ATTRIBUTES
 #endif
 
+#ifndef PER_CPU_DEF_ATTRIBUTES
+#define PER_CPU_DEF_ATTRIBUTES
+#endif
+
 #endif /* _ASM_GENERIC_PERCPU_H_ */
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f921d74f49f..68438e18fff4 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -24,7 +24,8 @@
 
 #define DEFINE_PER_CPU_SECTION(type, name, section)			\
 	__attribute__((__section__(PER_CPU_BASE_SECTION section)))	\
-	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+	PER_CPU_ATTRIBUTES PER_CPU_DEF_ATTRIBUTES			\
+	__typeof__(type) per_cpu__##name
 
 /*
  * Variant on the per-CPU variable declaration/definition theme used for
-- 
cgit v1.2.3-71-gd317


From c4285b47b0514e2103584ee829246f813e7ae323 Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Tue, 30 Jun 2009 11:41:21 -0700
Subject: parport/serial: add support for NetMos 9901 Multi-IO card

Add support for the PCI-Express NetMos 9901 Multi-IO card.

0001:06:00.0 Serial controller [0700]: NetMos Technology Device [9710:9901] (prog-if 02 [16550])
        Subsystem: Device [a000:1000]
        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx-
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
        Latency: 0, Cache Line Size: 64 bytes
        Interrupt: pin A routed to IRQ 65
        Region 0: I/O ports at 0030 [size=8]
        Region 1: Memory at 80105000 (32-bit, non-prefetchable) [size=4K]
        Region 4: Memory at 80104000 (32-bit, non-prefetchable) [size=4K]
        Capabilities: <access denied>
        Kernel driver in use: serial
        Kernel modules: 8250_pci

0001:06:00.1 Serial controller [0700]: NetMos Technology Device [9710:9901] (prog-if 02 [16550])
        Subsystem: Device [a000:1000]
        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx-
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
        Latency: 0, Cache Line Size: 64 bytes
        Interrupt: pin B routed to IRQ 65
        Region 0: I/O ports at 0020 [size=8]
        Region 1: Memory at 80103000 (32-bit, non-prefetchable) [size=4K]
        Region 4: Memory at 80102000 (32-bit, non-prefetchable) [size=4K]
        Capabilities: <access denied>
        Kernel driver in use: serial
        Kernel modules: 8250_pci

0001:06:00.2 Parallel controller [0701]: NetMos Technology Device [9710:9901] (prog-if 03 [IEEE1284])
        Subsystem: Device [a000:2000]
        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx-
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
        Latency: 0, Cache Line Size: 64 bytes
        Interrupt: pin C routed to IRQ 65
        Region 0: I/O ports at 0010 [size=8]
        Region 1: I/O ports at <unassigned>
        Region 2: Memory at 80101000 (32-bit, non-prefetchable) [size=4K]
        Region 4: Memory at 80100000 (32-bit, non-prefetchable) [size=4K]
        Capabilities: <access denied>
        Kernel driver in use: parport_pc
        Kernel modules: parport_pc

[   16.760181] PCI parallel port detected: 416c:0100, I/O at 0x812010(0x0), IRQ 65
[   16.760225] parport0: PC-style at 0x812010, irq 65 [PCSPP,TRISTATE,EPP]
[   16.851842] serial 0001:06:00.0: enabling device (0004 -> 0007)
[   16.883776] 0001:06:00.0: ttyS0 at I/O 0x812030 (irq = 65) is a ST16650V2
[   16.893832] serial 0001:06:00.1: enabling device (0004 -> 0007)
[   16.926537] 0001:06:00.1: ttyS1 at I/O 0x812020 (irq = 65) is a ST16650V2

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/parport_pc.c | 5 ++++-
 drivers/serial/8250_pci.c    | 6 ++++++
 include/linux/pci_ids.h      | 1 +
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 1032d5fdbd42..2597145a066e 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -2907,6 +2907,7 @@ enum parport_pc_pci_cards {
 	netmos_9755,
 	netmos_9805,
 	netmos_9815,
+	netmos_9901,
 	quatech_sppxp100,
 };
 
@@ -2987,7 +2988,7 @@ static struct parport_pc_pci {
 	/* netmos_9755 */               { 2, { { 0, 1 }, { 2, 3 },} },
 	/* netmos_9805 */               { 1, { { 0, -1 }, } },
 	/* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } },
-
+	/* netmos_9901 */               { 1, { { 0, -1 }, } },
 	/* quatech_sppxp100 */		{ 1, { { 0, 1 }, } },
 };
 
@@ -3089,6 +3090,8 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9805 },
 	{ PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9815,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9815 },
+	{ PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9901,
+	  0xA000, 0x2000, 0, 0, netmos_9901 },
 	/* Quatech SPPXP-100 Parallel port PCI ExpressCard */
 	{ PCI_VENDOR_ID_QUATECH, PCI_DEVICE_ID_QUATECH_SPPXP_100,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, quatech_sppxp100 },
diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index a07015d646dd..6160e03f410c 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -759,6 +759,8 @@ static int pci_netmos_init(struct pci_dev *dev)
 	/* subdevice 0x00PS means <P> parallel, <S> serial */
 	unsigned int num_serial = dev->subsystem_device & 0xf;
 
+	if (dev->device == PCI_DEVICE_ID_NETMOS_9901)
+		return 0;
 	if (dev->subsystem_vendor == PCI_VENDOR_ID_IBM &&
 			dev->subsystem_device == 0x0299)
 		return 0;
@@ -3557,6 +3559,10 @@ static struct pci_device_id serial_pci_tbl[] = {
 		PCI_VENDOR_ID_IBM, 0x0299,
 		0, 0, pbn_b0_bt_2_115200 },
 
+	{	PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9901,
+		0xA000, 0x1000,
+		0, 0, pbn_b0_1_115200 },
+
 	/*
 	 * These entries match devices with class COMMUNICATION_SERIAL,
 	 * COMMUNICATION_MODEM or COMMUNICATION_MULTISERIAL
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a3b000365795..73b46b6b904f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2645,6 +2645,7 @@
 #define PCI_DEVICE_ID_NETMOS_9835	0x9835
 #define PCI_DEVICE_ID_NETMOS_9845	0x9845
 #define PCI_DEVICE_ID_NETMOS_9855	0x9855
+#define PCI_DEVICE_ID_NETMOS_9901	0x9901
 
 #define PCI_VENDOR_ID_3COM_2		0xa727
 
-- 
cgit v1.2.3-71-gd317


From 341c87bf346f57748230628c5ad6ee69219250e8 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 30 Jun 2009 11:41:23 -0700
Subject: elf: limit max map count to safe value

With ELF, when generating a coredump, some headers are added in addition
to those for the vmas in use.

When max_map_count == 65536, a core generated by the following kind of
code can be unreadable because the number of ELF program headers is
written as a 16-bit value in Ehdr (please see elf.h) and the number overflows.

==
	... = mmap(); (munmap, mprotect, etc...)
	if (failed)
		abort();
==

This can happen in mmap/munmap/mprotect/etc...which calls split_vma().

I think 65536 is not safe as a _default_, and reducing it to 65530 is good
for avoiding unexpectedly corrupted cores.

Anyway, max_map_count can be enlarged by sysctl if a user is brave.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Jakub Jelinek <jakub@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c       |  5 ++++-
 include/linux/sched.h | 16 ++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9fa212b014a5..f1867900e459 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1929,7 +1929,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
 	if (!elf)
 		goto out;
-	
+	/*
+	 * The number of segs are recored into ELF header as 16bit value.
+	 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
+	 */
 	segs = current->mm->map_count;
 #ifdef ELF_CORE_EXTRA_PHDRS
 	segs += ELF_CORE_EXTRA_PHDRS;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d0754269884..0085d758d645 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -349,8 +349,20 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
 struct nsproxy;
 struct user_namespace;
 
-/* Maximum number of active map areas.. This is a random (large) number */
-#define DEFAULT_MAX_MAP_COUNT	65536
+/*
+ * Default maximum number of active map areas, this limits the number of vmas
+ * per mm struct. Users can overwrite this number by sysctl but there is a
+ * problem.
+ *
+ * When a program's coredump is generated as ELF format, a section is created
+ * per a vma. In ELF, the number of sections is represented in unsigned short.
+ * This means the number of sections should be smaller than 65535 at coredump.
+ * Because the kernel adds some informative sections to a image of program at
+ * generating coredump, we need some margin. The number of extra sections is
+ * 1-3 now and depends on arch. We use "5" as safe margin, here.
+ */
+#define MAPCOUNT_ELF_CORE_MARGIN	(5)
+#define DEFAULT_MAX_MAP_COUNT	(USHORT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
 
 extern int sysctl_max_map_count;
 
-- 
cgit v1.2.3-71-gd317


From b55f627feeb9d48fdbde3835e18afbc76712e49b Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Tue, 30 Jun 2009 11:41:26 -0700
Subject: spi: new spi->mode bits

Add two new spi_device.mode bits to accommodate more protocol options, and
pass them through to usermode drivers:

 * SPI_NO_CS ... a second 3-wire variant, where the chipselect
   line is removed instead of a data line; transfers are still
   full duplex.

   This obviously has STRONG protocol implications since the
   chipselect transitions can't be used to synchronize state
   transitions with the SPI master.

 * SPI_READY ... defines open drain signal that's pulled low
   to pause the clock.  This defines a 5-wire variant (normal
   4-wire SPI plus READY) and two 4-wire variants (READY plus
   each of the 3-wire flavors).

   Such hardware flow control can be a big win.  There are ADC
   converters and flash chips that expose READY signals, but not
   many host controllers support it today.

The spi_bitbang code should be changed to use SPI_NO_CS instead of its
current nonportable hack.  That's a mode most hardware can easily support
(unlike SPI_READY).

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: "Paulraj, Sandeep" <s-paulraj@ti.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/spi/spidev_test.c | 10 +++++++++-
 drivers/spi/spidev.c            | 17 +++++++++++------
 include/linux/spi/spi.h         |  2 ++
 include/linux/spi/spidev.h      |  2 ++
 4 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
index cf0e3ce0d526..c1a5aad3c75a 100644
--- a/Documentation/spi/spidev_test.c
+++ b/Documentation/spi/spidev_test.c
@@ -99,11 +99,13 @@ void parse_opts(int argc, char *argv[])
 			{ "lsb",     0, 0, 'L' },
 			{ "cs-high", 0, 0, 'C' },
 			{ "3wire",   0, 0, '3' },
+			{ "no-cs",   0, 0, 'N' },
+			{ "ready",   0, 0, 'R' },
 			{ NULL, 0, 0, 0 },
 		};
 		int c;
 
-		c = getopt_long(argc, argv, "D:s:d:b:lHOLC3", lopts, NULL);
+		c = getopt_long(argc, argv, "D:s:d:b:lHOLC3NR", lopts, NULL);
 
 		if (c == -1)
 			break;
@@ -139,6 +141,12 @@ void parse_opts(int argc, char *argv[])
 		case '3':
 			mode |= SPI_3WIRE;
 			break;
+		case 'N':
+			mode |= SPI_NO_CS;
+			break;
+		case 'R':
+			mode |= SPI_READY;
+			break;
 		default:
 			print_usage(argv[0]);
 			break;
diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index 5d869c4d3eb2..606e7a40a8da 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -58,15 +58,20 @@ static unsigned long	minors[N_SPI_MINORS / BITS_PER_LONG];
 
 
 /* Bit masks for spi_device.mode management.  Note that incorrect
- * settings for CS_HIGH and 3WIRE can cause *lots* of trouble for other
- * devices on a shared bus:  CS_HIGH, because this device will be
- * active when it shouldn't be;  3WIRE, because when active it won't
- * behave as it should.
+ * settings for some settings can cause *lots* of trouble for other
+ * devices on a shared bus:
  *
- * REVISIT should changing those two modes be privileged?
+ *  - CS_HIGH ... this device will be active when it shouldn't be
+ *  - 3WIRE ... when active, it won't behave as it should
+ *  - NO_CS ... there will be no explicit message boundaries; this
+ *	is completely incompatible with the shared bus model
+ *  - READY ... transfers may proceed when they shouldn't.
+ *
+ * REVISIT should changing those flags be privileged?
  */
 #define SPI_MODE_MASK		(SPI_CPHA | SPI_CPOL | SPI_CS_HIGH \
-				| SPI_LSB_FIRST | SPI_3WIRE | SPI_LOOP)
+				| SPI_LSB_FIRST | SPI_3WIRE | SPI_LOOP \
+				| SPI_NO_CS | SPI_READY)
 
 struct spidev_data {
 	dev_t			devt;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 9c4cd27f4685..743c933ac4e7 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -80,6 +80,8 @@ struct spi_device {
 #define	SPI_LSB_FIRST	0x08			/* per-word bits-on-wire */
 #define	SPI_3WIRE	0x10			/* SI/SO signals shared */
 #define	SPI_LOOP	0x20			/* loopback mode */
+#define	SPI_NO_CS	0x40			/* 1 dev/bus, no chipselect */
+#define	SPI_READY	0x80			/* slave pulls low to pause */
 	u8			bits_per_word;
 	int			irq;
 	void			*controller_state;
diff --git a/include/linux/spi/spidev.h b/include/linux/spi/spidev.h
index 95251ccd5a07..bf0570a84f7a 100644
--- a/include/linux/spi/spidev.h
+++ b/include/linux/spi/spidev.h
@@ -40,6 +40,8 @@
 #define SPI_LSB_FIRST		0x08
 #define SPI_3WIRE		0x10
 #define SPI_LOOP		0x20
+#define SPI_NO_CS		0x40
+#define SPI_READY		0x80
 
 /*---------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3-71-gd317


From 70d6027ff2bc8bab180273b77e7ab3e8a62cca51 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Tue, 30 Jun 2009 11:41:27 -0700
Subject: spi: add spi_master flag word

Add a new spi_master.flags word listing constraints relevant to that
controller.  Define the first constraint bit: a half duplex restriction.
Include that constraint in the OMAP1 MicroWire controller driver.

Have the mmc_spi host be the first customer of this flag.  Its coding
relies heavily on full duplex transfers, so it must fail when the
underlying controller driver won't perform them.

(The spi_write_then_read routine could use it too: use the
temporarily-withdrawn full-duplex speedup unless this flag is set, in
which case the existing code applies.  Similarly, any spi_master
implementing only SPI_3WIRE should set the flag.)

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/host/mmc_spi.c | 6 ++++++
 drivers/spi/omap_uwire.c   | 2 ++
 include/linux/spi/spi.h    | 4 ++++
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index 240608cc7ae9..a461017ce5ce 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1313,6 +1313,12 @@ static int mmc_spi_probe(struct spi_device *spi)
 	struct mmc_spi_host	*host;
 	int			status;
 
+	/* We rely on full duplex transfers, mostly to reduce
+	 * per-transfer overheads (by making fewer transfers).
+	 */
+	if (spi->master->flags & SPI_MASTER_HALF_DUPLEX)
+		return -EINVAL;
+
 	/* MMC and SD specs only seem to care that sampling is on the
 	 * rising edge ... meaning SPI modes 0 or 3.  So either SPI mode
 	 * should be legit.  We'll use mode 0 since the steady state is 0,
diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c
index aa90ddb37066..8980a5640bd9 100644
--- a/drivers/spi/omap_uwire.c
+++ b/drivers/spi/omap_uwire.c
@@ -514,6 +514,8 @@ static int __init uwire_probe(struct platform_device *pdev)
 	/* the spi->mode bits understood by this driver: */
 	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
 
+	master->flags = SPI_MASTER_HALF_DUPLEX;
+
 	master->bus_num = 2;	/* "official" */
 	master->num_chipselect = 4;
 	master->setup = uwire_setup;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 743c933ac4e7..c47c4b4da97e 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -250,6 +250,10 @@ struct spi_master {
 	/* spi_device.mode flags understood by this controller driver */
 	u16			mode_bits;
 
+	/* other constraints relevant to this driver */
+	u16			flags;
+#define SPI_MASTER_HALF_DUPLEX	BIT(0)		/* can't do full duplex */
+
 	/* Setup mode and clock, etc (spi driver may call many times).
 	 *
 	 * IMPORTANT:  this may be called when transfers to another
-- 
cgit v1.2.3-71-gd317


From 537a1bf059fa312355696fa6db80726e655e7f17 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Tue, 30 Jun 2009 11:41:29 -0700
Subject: fbdev: add mutex for fb_mmap locking

Add a mutex to avoid a circular locking problem between the mm layer
semaphore and fbdev ioctl mutex through the fb_mmap() call.

Also, take the mutex in all places where the smem_start and smem_len fields
change so the mutex inside fb_mmap() is actually used.  Changes to these
fields before calling register_framebuffer() are not mutexed.

This is 2.6.31 material.  It removes one lockdep (fb_mmap() and
register_framebuffer()) but there is still another one (fb_release() and
register_framebuffer()).  It also cleans up handling of the smem_start and
smem_len fields used by mutexed section of the fb_mmap().

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/atafb.c                 |  7 ++++++-
 drivers/video/atmel_lcdfb.c           |  2 ++
 drivers/video/fbmem.c                 | 13 +++++--------
 drivers/video/fsl-diu-fb.c            | 14 +++++++++-----
 drivers/video/i810/i810_main.c        |  2 ++
 drivers/video/matrox/matroxfb_base.c  |  3 +++
 drivers/video/matrox/matroxfb_crtc2.c |  5 ++++-
 drivers/video/mx3fb.c                 | 17 +++++++++++------
 drivers/video/omap/omapfb_main.c      |  4 ++++
 drivers/video/platinumfb.c            |  2 ++
 drivers/video/pxafb.c                 |  2 ++
 drivers/video/sh7760fb.c              | 19 ++++++-------------
 drivers/video/sis/sis_main.c          |  2 ++
 drivers/video/sm501fb.c               | 21 +++++++++++++--------
 drivers/video/w100fb.c                |  2 ++
 include/linux/fb.h                    |  1 +
 16 files changed, 74 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/atafb.c b/drivers/video/atafb.c
index 018850c116c6..497ff8af03ed 100644
--- a/drivers/video/atafb.c
+++ b/drivers/video/atafb.c
@@ -2414,7 +2414,10 @@ static int atafb_get_fix(struct fb_fix_screeninfo *fix, struct fb_info *info)
 	if (err)
 		return err;
 	memset(fix, 0, sizeof(struct fb_fix_screeninfo));
-	return fbhw->encode_fix(fix, &par);
+	mutex_lock(&info->mm_lock);
+	err = fbhw->encode_fix(fix, &par);
+	mutex_unlock(&info->mm_lock);
+	return err;
 }
 
 static int atafb_get_var(struct fb_var_screeninfo *var, struct fb_info *info)
@@ -2743,7 +2746,9 @@ static int atafb_set_par(struct fb_info *info)
 
 	/* Decode wanted screen parameters */
 	fbhw->decode_var(&info->var, par);
+	mutex_lock(&info->mm_lock);
 	fbhw->encode_fix(&info->fix, par);
+	mutex_unlock(&info->mm_lock);
 
 	/* Set new videomode */
 	ata_set_par(par);
diff --git a/drivers/video/atmel_lcdfb.c b/drivers/video/atmel_lcdfb.c
index 5afd64482f55..cb88394ba995 100644
--- a/drivers/video/atmel_lcdfb.c
+++ b/drivers/video/atmel_lcdfb.c
@@ -270,7 +270,9 @@ static int atmel_lcdfb_alloc_video_memory(struct atmel_lcdfb_info *sinfo)
 
 	smem_len = (var->xres_virtual * var->yres_virtual
 		    * ((var->bits_per_pixel + 7) / 8));
+	mutex_lock(&info->mm_lock);
 	info->fix.smem_len = max(smem_len, sinfo->smem_len);
+	mutex_unlock(&info->mm_lock);
 
 	info->screen_base = dma_alloc_writecombine(info->device, info->fix.smem_len,
 					(dma_addr_t *)&info->fix.smem_start, GFP_KERNEL);
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index f8a09bf8d0cd..53ea05645ff8 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1310,8 +1310,6 @@ static long fb_compat_ioctl(struct file *file, unsigned int cmd,
 
 static int
 fb_mmap(struct file *file, struct vm_area_struct * vma)
-__acquires(&info->lock)
-__releases(&info->lock)
 {
 	int fbidx = iminor(file->f_path.dentry->d_inode);
 	struct fb_info *info = registered_fb[fbidx];
@@ -1325,16 +1323,14 @@ __releases(&info->lock)
 	off = vma->vm_pgoff << PAGE_SHIFT;
 	if (!fb)
 		return -ENODEV;
+	mutex_lock(&info->mm_lock);
 	if (fb->fb_mmap) {
 		int res;
-		mutex_lock(&info->lock);
 		res = fb->fb_mmap(info, vma);
-		mutex_unlock(&info->lock);
+		mutex_unlock(&info->mm_lock);
 		return res;
 	}
 
-	mutex_lock(&info->lock);
-
 	/* frame buffer memory */
 	start = info->fix.smem_start;
 	len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.smem_len);
@@ -1342,13 +1338,13 @@ __releases(&info->lock)
 		/* memory mapped io */
 		off -= len;
 		if (info->var.accel_flags) {
-			mutex_unlock(&info->lock);
+			mutex_unlock(&info->mm_lock);
 			return -EINVAL;
 		}
 		start = info->fix.mmio_start;
 		len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.mmio_len);
 	}
-	mutex_unlock(&info->lock);
+	mutex_unlock(&info->mm_lock);
 	start &= PAGE_MASK;
 	if ((vma->vm_end - vma->vm_start + off) > len)
 		return -EINVAL;
@@ -1518,6 +1514,7 @@ register_framebuffer(struct fb_info *fb_info)
 			break;
 	fb_info->node = i;
 	mutex_init(&fb_info->lock);
+	mutex_init(&fb_info->mm_lock);
 
 	fb_info->dev = device_create(fb_class, fb_info->device,
 				     MKDEV(FB_MAJOR, i), NULL, "fb%d", i);
diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c
index f153c581cbd7..0bf2190928d0 100644
--- a/drivers/video/fsl-diu-fb.c
+++ b/drivers/video/fsl-diu-fb.c
@@ -750,24 +750,26 @@ static void update_lcdc(struct fb_info *info)
 static int map_video_memory(struct fb_info *info)
 {
 	phys_addr_t phys;
+	u32 smem_len = info->fix.line_length * info->var.yres_virtual;
 
 	pr_debug("info->var.xres_virtual = %d\n", info->var.xres_virtual);
 	pr_debug("info->var.yres_virtual = %d\n", info->var.yres_virtual);
 	pr_debug("info->fix.line_length  = %d\n", info->fix.line_length);
+	pr_debug("MAP_VIDEO_MEMORY: smem_len = %u\n", smem_len);
 
-	info->fix.smem_len = info->fix.line_length * info->var.yres_virtual;
-	pr_debug("MAP_VIDEO_MEMORY: smem_len = %d\n", info->fix.smem_len);
-	info->screen_base = fsl_diu_alloc(info->fix.smem_len, &phys);
+	info->screen_base = fsl_diu_alloc(smem_len, &phys);
 	if (info->screen_base == NULL) {
 		printk(KERN_ERR "Unable to allocate fb memory\n");
 		return -ENOMEM;
 	}
+	mutex_lock(&info->mm_lock);
 	info->fix.smem_start = (unsigned long) phys;
+	info->fix.smem_len = smem_len;
+	mutex_unlock(&info->mm_lock);
 	info->screen_size = info->fix.smem_len;
 
 	pr_debug("Allocated fb @ paddr=0x%08lx, size=%d.\n",
-				info->fix.smem_start,
-		info->fix.smem_len);
+		 info->fix.smem_start, info->fix.smem_len);
 	pr_debug("screen base %p\n", info->screen_base);
 
 	return 0;
@@ -776,9 +778,11 @@ static int map_video_memory(struct fb_info *info)
 static void unmap_video_memory(struct fb_info *info)
 {
 	fsl_diu_free(info->screen_base, info->fix.smem_len);
+	mutex_lock(&info->mm_lock);
 	info->screen_base = NULL;
 	info->fix.smem_start = 0;
 	info->fix.smem_len = 0;
+	mutex_unlock(&info->mm_lock);
 }
 
 /*
diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c
index 2e940199fc89..71960672d721 100644
--- a/drivers/video/i810/i810_main.c
+++ b/drivers/video/i810/i810_main.c
@@ -1090,8 +1090,10 @@ static int encode_fix(struct fb_fix_screeninfo *fix, struct fb_info *info)
     	memset(fix, 0, sizeof(struct fb_fix_screeninfo));
 
     	strcpy(fix->id, "I810");
+	mutex_lock(&info->mm_lock);
     	fix->smem_start = par->fb.physical;
     	fix->smem_len = par->fb.size;
+	mutex_unlock(&info->mm_lock);
     	fix->type = FB_TYPE_PACKED_PIXELS;
     	fix->type_aux = 0;
 	fix->xpanstep = 8;
diff --git a/drivers/video/matrox/matroxfb_base.c b/drivers/video/matrox/matroxfb_base.c
index 8e7a275df50c..59c3a2e14913 100644
--- a/drivers/video/matrox/matroxfb_base.c
+++ b/drivers/video/matrox/matroxfb_base.c
@@ -724,8 +724,10 @@ static void matroxfb_update_fix(WPMINFO2)
 	struct fb_fix_screeninfo *fix = &ACCESS_FBINFO(fbcon).fix;
 	DBG(__func__)
 
+	mutex_lock(&ACCESS_FBINFO(fbcon).mm_lock);
 	fix->smem_start = ACCESS_FBINFO(video.base) + ACCESS_FBINFO(curr.ydstorg.bytes);
 	fix->smem_len = ACCESS_FBINFO(video.len_usable) - ACCESS_FBINFO(curr.ydstorg.bytes);
+	mutex_unlock(&ACCESS_FBINFO(fbcon).mm_lock);
 }
 
 static int matroxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
@@ -2081,6 +2083,7 @@ static int matroxfb_probe(struct pci_dev* pdev, const struct pci_device_id* dumm
 	spin_lock_init(&ACCESS_FBINFO(lock.accel));
 	init_rwsem(&ACCESS_FBINFO(crtc2.lock));
 	init_rwsem(&ACCESS_FBINFO(altout.lock));
+	mutex_init(&ACCESS_FBINFO(fbcon).mm_lock);
 	ACCESS_FBINFO(irq_flags) = 0;
 	init_waitqueue_head(&ACCESS_FBINFO(crtc1.vsync.wait));
 	init_waitqueue_head(&ACCESS_FBINFO(crtc2.vsync.wait));
diff --git a/drivers/video/matrox/matroxfb_crtc2.c b/drivers/video/matrox/matroxfb_crtc2.c
index 7ac4c5f6145d..909e10a11898 100644
--- a/drivers/video/matrox/matroxfb_crtc2.c
+++ b/drivers/video/matrox/matroxfb_crtc2.c
@@ -289,13 +289,16 @@ static int matroxfb_dh_release(struct fb_info* info, int user) {
 #undef m2info
 }
 
-static void matroxfb_dh_init_fix(struct matroxfb_dh_fb_info *m2info) {
+static void matroxfb_dh_init_fix(struct matroxfb_dh_fb_info *m2info)
+{
 	struct fb_fix_screeninfo *fix = &m2info->fbcon.fix;
 
 	strcpy(fix->id, "MATROX DH");
 
+	mutex_lock(&m2info->fbcon.mm_lock);
 	fix->smem_start = m2info->video.base;
 	fix->smem_len = m2info->video.len_usable;
+	mutex_unlock(&m2info->fbcon.mm_lock);
 	fix->ypanstep = 1;
 	fix->ywrapstep = 0;
 	fix->xpanstep = 8;	/* TBD */
diff --git a/drivers/video/mx3fb.c b/drivers/video/mx3fb.c
index b7af5256e887..567fb944bd2a 100644
--- a/drivers/video/mx3fb.c
+++ b/drivers/video/mx3fb.c
@@ -669,7 +669,7 @@ static uint32_t bpp_to_pixfmt(int bpp)
 }
 
 static int mx3fb_blank(int blank, struct fb_info *fbi);
-static int mx3fb_map_video_memory(struct fb_info *fbi);
+static int mx3fb_map_video_memory(struct fb_info *fbi, unsigned int mem_len);
 static int mx3fb_unmap_video_memory(struct fb_info *fbi);
 
 /**
@@ -742,8 +742,7 @@ static int mx3fb_set_par(struct fb_info *fbi)
 		if (fbi->fix.smem_start)
 			mx3fb_unmap_video_memory(fbi);
 
-		fbi->fix.smem_len = mem_len;
-		if (mx3fb_map_video_memory(fbi) < 0) {
+		if (mx3fb_map_video_memory(fbi, mem_len) < 0) {
 			mutex_unlock(&mx3_fbi->mutex);
 			return -ENOMEM;
 		}
@@ -1198,6 +1197,7 @@ static int mx3fb_resume(struct platform_device *pdev)
 /**
  * mx3fb_map_video_memory() - allocates the DRAM memory for the frame buffer.
  * @fbi:	framebuffer information pointer
+ * @mem_len:	length of mapped memory
  * @return:	Error code indicating success or failure
  *
  * This buffer is remapped into a non-cached, non-buffered, memory region to
@@ -1205,23 +1205,26 @@ static int mx3fb_resume(struct platform_device *pdev)
  * area is remapped, all virtual memory access to the video memory should occur
  * at the new region.
  */
-static int mx3fb_map_video_memory(struct fb_info *fbi)
+static int mx3fb_map_video_memory(struct fb_info *fbi, unsigned int mem_len)
 {
 	int retval = 0;
 	dma_addr_t addr;
 
 	fbi->screen_base = dma_alloc_writecombine(fbi->device,
-						  fbi->fix.smem_len,
+						  mem_len,
 						  &addr, GFP_DMA);
 
 	if (!fbi->screen_base) {
 		dev_err(fbi->device, "Cannot allocate %u bytes framebuffer memory\n",
-			fbi->fix.smem_len);
+			mem_len);
 		retval = -EBUSY;
 		goto err0;
 	}
 
+	mutex_lock(&fbi->mm_lock);
 	fbi->fix.smem_start = addr;
+	fbi->fix.smem_len = mem_len;
+	mutex_unlock(&fbi->mm_lock);
 
 	dev_dbg(fbi->device, "allocated fb @ p=0x%08x, v=0x%p, size=%d.\n",
 		(uint32_t) fbi->fix.smem_start, fbi->screen_base, fbi->fix.smem_len);
@@ -1251,8 +1254,10 @@ static int mx3fb_unmap_video_memory(struct fb_info *fbi)
 			      fbi->screen_base, fbi->fix.smem_start);
 
 	fbi->screen_base = 0;
+	mutex_lock(&fbi->mm_lock);
 	fbi->fix.smem_start = 0;
 	fbi->fix.smem_len = 0;
+	mutex_unlock(&fbi->mm_lock);
 	return 0;
 }
 
diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c
index 060d72fe57cb..4ea99bfc37b4 100644
--- a/drivers/video/omap/omapfb_main.c
+++ b/drivers/video/omap/omapfb_main.c
@@ -393,8 +393,10 @@ static void set_fb_fix(struct fb_info *fbi)
 
 	rg = &plane->fbdev->mem_desc.region[plane->idx];
 	fbi->screen_base	= rg->vaddr;
+	mutex_lock(&fbi->mm_lock);
 	fix->smem_start		= rg->paddr;
 	fix->smem_len		= rg->size;
+	mutex_unlock(&fbi->mm_lock);
 
 	fix->type = FB_TYPE_PACKED_PIXELS;
 	bpp = var->bits_per_pixel;
@@ -886,8 +888,10 @@ static int omapfb_setup_mem(struct fb_info *fbi, struct omapfb_mem_info *mi)
 				 * plane memory is dealloce'd, the other
 				 * screen parameters in var / fix are invalid.
 				 */
+				mutex_lock(&fbi->mm_lock);
 				fbi->fix.smem_start = 0;
 				fbi->fix.smem_len = 0;
+				mutex_unlock(&fbi->mm_lock);
 			}
 		}
 	}
diff --git a/drivers/video/platinumfb.c b/drivers/video/platinumfb.c
index 03b3670130a0..bacfabd9ce16 100644
--- a/drivers/video/platinumfb.c
+++ b/drivers/video/platinumfb.c
@@ -141,7 +141,9 @@ static int platinumfb_set_par (struct fb_info *info)
   		offset = 0x10;
 
 	info->screen_base = pinfo->frame_buffer + init->fb_offset + offset;
+	mutex_lock(&info->mm_lock);
 	info->fix.smem_start = (pinfo->frame_buffer_phys) + init->fb_offset + offset;
+	mutex_unlock(&info->mm_lock);
 	info->fix.visual = (pinfo->cmode == CMODE_8) ?
 		FB_VISUAL_PSEUDOCOLOR : FB_VISUAL_DIRECTCOLOR;
  	info->fix.line_length = vmode_attrs[pinfo->vmode-1].hres * (1<<pinfo->cmode)
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 0889d50c3288..6506117c134b 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -815,8 +815,10 @@ static int overlayfb_map_video_memory(struct pxafb_layer *ofb)
 	ofb->video_mem_phys = virt_to_phys(ofb->video_mem);
 	ofb->video_mem_size = size;
 
+	mutex_lock(&ofb->fb.mm_lock);
 	ofb->fb.fix.smem_start	= ofb->video_mem_phys;
 	ofb->fb.fix.smem_len	= ofb->fb.fix.line_length * var->yres_virtual;
+	mutex_unlock(&ofb->fb.mm_lock);
 	ofb->fb.screen_base	= ofb->video_mem;
 	return 0;
 }
diff --git a/drivers/video/sh7760fb.c b/drivers/video/sh7760fb.c
index 653bdfee3057..9f6d6e61f0cc 100644
--- a/drivers/video/sh7760fb.c
+++ b/drivers/video/sh7760fb.c
@@ -120,18 +120,6 @@ static int sh7760_setcolreg (u_int regno,
 	return 0;
 }
 
-static void encode_fix(struct fb_fix_screeninfo *fix, struct fb_info *info,
-		       unsigned long stride)
-{
-	memset(fix, 0, sizeof(struct fb_fix_screeninfo));
-	strcpy(fix->id, "sh7760-lcdc");
-
-	fix->smem_start = (unsigned long)info->screen_base;
-	fix->smem_len = info->screen_size;
-
-	fix->line_length = stride;
-}
-
 static int sh7760fb_get_color_info(struct device *dev,
 				   u16 lddfr, int *bpp, int *gray)
 {
@@ -334,7 +322,8 @@ static int sh7760fb_set_par(struct fb_info *info)
 
 	iowrite32(ldsarl, par->base + LDSARL);	/* mem for lower half of DSTN */
 
-	encode_fix(&info->fix, info, stride);
+	info->fix.line_length = stride;
+
 	sh7760fb_check_var(&info->var, info);
 
 	sh7760fb_blank(FB_BLANK_UNBLANK, info);	/* panel on! */
@@ -435,6 +424,8 @@ static int sh7760fb_alloc_mem(struct fb_info *info)
 
 	info->screen_base = fbmem;
 	info->screen_size = vram;
+	info->fix.smem_start = (unsigned long)info->screen_base;
+	info->fix.smem_len = info->screen_size;
 
 	return 0;
 }
@@ -520,6 +511,8 @@ static int __devinit sh7760fb_probe(struct platform_device *pdev)
 	info->var.transp.length = 0;
 	info->var.transp.msb_right = 0;
 
+	strcpy(info->fix.id, "sh7760-lcdc");
+
 	/* set the DON2 bit now, before cmap allocation, as it will randomize
 	 * palette memory.
 	 */
diff --git a/drivers/video/sis/sis_main.c b/drivers/video/sis/sis_main.c
index 7072d19080d5..fd33455389b8 100644
--- a/drivers/video/sis/sis_main.c
+++ b/drivers/video/sis/sis_main.c
@@ -1847,8 +1847,10 @@ sisfb_get_fix(struct fb_fix_screeninfo *fix, int con, struct fb_info *info)
 
 	strcpy(fix->id, ivideo->myid);
 
+	mutex_lock(&info->mm_lock);
 	fix->smem_start  = ivideo->video_base + ivideo->video_offset;
 	fix->smem_len    = ivideo->sisfb_mem;
+	mutex_unlock(&info->mm_lock);
 	fix->type        = FB_TYPE_PACKED_PIXELS;
 	fix->type_aux    = 0;
 	fix->visual      = (ivideo->video_bpp == 8) ? FB_VISUAL_PSEUDOCOLOR : FB_VISUAL_TRUECOLOR;
diff --git a/drivers/video/sm501fb.c b/drivers/video/sm501fb.c
index eb5d73a06702..98f24f0ec00d 100644
--- a/drivers/video/sm501fb.c
+++ b/drivers/video/sm501fb.c
@@ -145,7 +145,7 @@ static inline void sm501fb_sync_regs(struct sm501fb_info *info)
 #define SM501_MEMF_ACCEL		(8)
 
 static int sm501_alloc_mem(struct sm501fb_info *inf, struct sm501_mem *mem,
-			   unsigned int why, size_t size)
+			   unsigned int why, size_t size, u32 smem_len)
 {
 	struct sm501fb_par *par;
 	struct fb_info *fbi;
@@ -172,7 +172,7 @@ static int sm501_alloc_mem(struct sm501fb_info *inf, struct sm501_mem *mem,
 		if (ptr > 0)
 			ptr &= ~(PAGE_SIZE - 1);
 
-		if (fbi && ptr < fbi->fix.smem_len)
+		if (fbi && ptr < smem_len)
 			return -ENOMEM;
 
 		break;
@@ -197,7 +197,7 @@ static int sm501_alloc_mem(struct sm501fb_info *inf, struct sm501_mem *mem,
 
 	case SM501_MEMF_ACCEL:
 		fbi = inf->fb[HEAD_CRT];
-		ptr = fbi ? fbi->fix.smem_len : 0;
+		ptr = fbi ? smem_len : 0;
 
 		fbi = inf->fb[HEAD_PANEL];
 		if (fbi) {
@@ -413,6 +413,7 @@ static int sm501fb_set_par_common(struct fb_info *info,
 	unsigned int mem_type;
 	unsigned int clock_type;
 	unsigned int head_addr;
+	unsigned int smem_len;
 
 	dev_dbg(fbi->dev, "%s: %dx%d, bpp = %d, virtual %dx%d\n",
 		__func__, var->xres, var->yres, var->bits_per_pixel,
@@ -453,18 +454,20 @@ static int sm501fb_set_par_common(struct fb_info *info,
 
 	/* allocate fb memory within 501 */
 	info->fix.line_length = (var->xres_virtual * var->bits_per_pixel)/8;
-	info->fix.smem_len    = info->fix.line_length * var->yres_virtual;
+	smem_len = info->fix.line_length * var->yres_virtual;
 
 	dev_dbg(fbi->dev, "%s: line length = %u\n", __func__,
 		info->fix.line_length);
 
-	if (sm501_alloc_mem(fbi, &par->screen, mem_type,
-			    info->fix.smem_len)) {
+	if (sm501_alloc_mem(fbi, &par->screen, mem_type, smem_len, smem_len)) {
 		dev_err(fbi->dev, "no memory available\n");
 		return -ENOMEM;
 	}
 
+	mutex_lock(&info->mm_lock);
 	info->fix.smem_start = fbi->fbmem_res->start + par->screen.sm_addr;
+	info->fix.smem_len   = smem_len;
+	mutex_unlock(&info->mm_lock);
 
 	info->screen_base = fbi->fbmem + par->screen.sm_addr;
 	info->screen_size = info->fix.smem_len;
@@ -637,7 +640,8 @@ static int sm501fb_set_par_crt(struct fb_info *info)
 	if ((control & SM501_DC_CRT_CONTROL_SEL) == 0) {
 		/* the head is displaying panel data... */
 
-		sm501_alloc_mem(fbi, &par->screen, SM501_MEMF_CRT, 0);
+		sm501_alloc_mem(fbi, &par->screen, SM501_MEMF_CRT, 0,
+				info->fix.smem_len);
 		goto out_update;
 	}
 
@@ -1289,7 +1293,8 @@ static int sm501_init_cursor(struct fb_info *fbi, unsigned int reg_base)
 
 	par->cursor_regs = info->regs + reg_base;
 
-	ret = sm501_alloc_mem(info, &par->cursor, SM501_MEMF_CURSOR, 1024);
+	ret = sm501_alloc_mem(info, &par->cursor, SM501_MEMF_CURSOR, 1024,
+			      fbi->fix.smem_len);
 	if (ret < 0)
 		return ret;
 
diff --git a/drivers/video/w100fb.c b/drivers/video/w100fb.c
index d0674f1e3f10..8a141c2c637b 100644
--- a/drivers/video/w100fb.c
+++ b/drivers/video/w100fb.c
@@ -523,6 +523,7 @@ static int w100fb_set_par(struct fb_info *info)
 		info->fix.ywrapstep = 0;
 		info->fix.line_length = par->xres * BITS_PER_PIXEL / 8;
 
+		mutex_lock(&info->mm_lock);
 		if ((par->xres*par->yres*BITS_PER_PIXEL/8) > (MEM_INT_SIZE+1)) {
 			par->extmem_active = 1;
 			info->fix.smem_len = par->mach->mem->size+1;
@@ -530,6 +531,7 @@ static int w100fb_set_par(struct fb_info *info)
 			par->extmem_active = 0;
 			info->fix.smem_len = MEM_INT_SIZE+1;
 		}
+		mutex_unlock(&info->mm_lock);
 
 		w100fb_activate_var(par);
 	}
diff --git a/include/linux/fb.h b/include/linux/fb.h
index dd68358996b7..f847df9e99b6 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -819,6 +819,7 @@ struct fb_info {
 	int node;
 	int flags;
 	struct mutex lock;		/* Lock for open/release/ioctl funcs */
+	struct mutex mm_lock;		/* Lock for fb_mmap and smem_* fields */
 	struct fb_var_screeninfo var;	/* Current var */
 	struct fb_fix_screeninfo fix;	/* Current fix */
 	struct fb_monspecs monspecs;	/* Current Monitor specs */
-- 
cgit v1.2.3-71-gd317


From d9d62f3f2c6fa609883714f6fd6cd710a83d307f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 29 Jun 2009 16:54:12 +0000
Subject: usbnet: Remove private stats structure

Now that nothing uses the private stats structure we can remove it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/usb/usbnet.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 5d44059f6d63..310e18a880ff 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -42,7 +42,6 @@ struct usbnet {
 
 	/* protocol/interface state */
 	struct net_device	*net;
-	struct net_device_stats	stats;
 	int			msg_enable;
 	unsigned long		data [5];
 	u32			xid;
-- 
cgit v1.2.3-71-gd317


From 7878cba9f0037f5599004b03a1260b32d9050360 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 26 Jun 2009 15:37:49 +0200
Subject: block: Create bip slabs with embedded integrity vectors

This patch restores stacking ability to the block layer integrity
infrastructure by creating a set of dedicated bip slabs.  Each bip slab
has an embedded bio_vec array at the end.  This cuts down on memory
allocations and also simplifies the code compared to the original bvec
version.  Only the largest bip slab is backed by a mempool.  The pool is
contained in the bio_set so stacking drivers can ensure forward
progress.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@carl.(none)>
---
 block/blk-core.c    |   2 +-
 drivers/md/dm.c     |   4 +-
 fs/bio-integrity.c  | 170 ++++++++++++++++++++++++++++++++++++++--------------
 fs/bio.c            |  11 +++-
 include/linux/bio.h |  22 +++++--
 5 files changed, 152 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index b06cf5c2a829..345d99da8d41 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2365,7 +2365,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 		__bio_clone(bio, bio_src);
 
 		if (bio_integrity(bio_src) &&
-		    bio_integrity_clone(bio, bio_src, gfp_mask))
+		    bio_integrity_clone(bio, bio_src, gfp_mask, bs))
 			goto free_and_out;
 
 		if (bio_ctr && bio_ctr(bio, bio_src, data))
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3c6d4ee8921d..9acd54a5cffb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1017,7 +1017,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 	clone->bi_flags |= 1 << BIO_CLONED;
 
 	if (bio_integrity(bio)) {
-		bio_integrity_clone(clone, bio, GFP_NOIO);
+		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
 		bio_integrity_trim(clone,
 				   bio_sector_offset(bio, idx, offset), len);
 	}
@@ -1045,7 +1045,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	if (bio_integrity(bio)) {
-		bio_integrity_clone(clone, bio, GFP_NOIO);
+		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
 
 		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
 			bio_integrity_trim(clone,
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 31c46a241bac..49a34e7f7306 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -1,7 +1,7 @@
 /*
  * bio-integrity.c - bio data integrity extensions
  *
- * Copyright (C) 2007, 2008 Oracle Corporation
+ * Copyright (C) 2007, 2008, 2009 Oracle Corporation
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -25,63 +25,121 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 
-static struct kmem_cache *bio_integrity_slab __read_mostly;
-static mempool_t *bio_integrity_pool;
-static struct bio_set *integrity_bio_set;
+struct integrity_slab {
+	struct kmem_cache *slab;
+	unsigned short nr_vecs;
+	char name[8];
+};
+
+#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
+struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
+	IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
+};
+#undef IS
+
 static struct workqueue_struct *kintegrityd_wq;
 
+static inline unsigned int vecs_to_idx(unsigned int nr)
+{
+	switch (nr) {
+	case 1:
+		return 0;
+	case 2 ... 4:
+		return 1;
+	case 5 ... 16:
+		return 2;
+	case 17 ... 64:
+		return 3;
+	case 65 ... 128:
+		return 4;
+	case 129 ... BIO_MAX_PAGES:
+		return 5;
+	default:
+		BUG();
+	}
+}
+
+static inline int use_bip_pool(unsigned int idx)
+{
+	if (idx == BIOVEC_NR_POOLS)
+		return 1;
+
+	return 0;
+}
+
 /**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
  * @bio:	bio to attach integrity metadata to
  * @gfp_mask:	Memory allocation mask
  * @nr_vecs:	Number of integrity metadata scatter-gather elements
+ * @bs:		bio_set to allocate from
  *
  * Description: This function prepares a bio for attaching integrity
  * metadata.  nr_vecs specifies the maximum number of pages containing
  * integrity metadata that can be attached.
  */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
-						  gfp_t gfp_mask,
-						  unsigned int nr_vecs)
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+							 gfp_t gfp_mask,
+							 unsigned int nr_vecs,
+							 struct bio_set *bs)
 {
 	struct bio_integrity_payload *bip;
-	struct bio_vec *iv;
-	unsigned long idx;
+	unsigned int idx = vecs_to_idx(nr_vecs);
 
 	BUG_ON(bio == NULL);
+	bip = NULL;
 
-	bip = mempool_alloc(bio_integrity_pool, gfp_mask);
-	if (unlikely(bip == NULL)) {
-		printk(KERN_ERR "%s: could not alloc bip\n", __func__);
-		return NULL;
-	}
+	/* Lower order allocations come straight from slab */
+	if (!use_bip_pool(idx))
+		bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
 
-	memset(bip, 0, sizeof(*bip));
+	/* Use mempool if lower order alloc failed or max vecs were requested */
+	if (bip == NULL) {
+		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
 
-	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set);
-	if (unlikely(iv == NULL)) {
-		printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
-		mempool_free(bip, bio_integrity_pool);
-		return NULL;
+		if (unlikely(bip == NULL)) {
+			printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+			return NULL;
+		}
 	}
 
-	bip->bip_pool = idx;
-	bip->bip_vec = iv;
+	memset(bip, 0, sizeof(*bip));
+
+	bip->bip_slab = idx;
 	bip->bip_bio = bio;
 	bio->bi_integrity = bip;
 
 	return bip;
 }
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio:	bio to attach integrity metadata to
+ * @gfp_mask:	Memory allocation mask
+ * @nr_vecs:	Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata.  nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
+{
+	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
 EXPORT_SYMBOL(bio_integrity_alloc);
 
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
+ * @bs:		bio_set this bio was allocated from
  *
  * Description: Used to free the integrity portion of a bio. Usually
  * called from bio_free().
  */
-void bio_integrity_free(struct bio *bio)
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 
@@ -92,8 +150,10 @@ void bio_integrity_free(struct bio *bio)
 	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
-	bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool);
-	mempool_free(bip, bio_integrity_pool);
+	if (use_bip_pool(bip->bip_slab))
+		mempool_free(bip, bs->bio_integrity_pool);
+	else
+		kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
 
 	bio->bi_integrity = NULL;
 }
@@ -114,7 +174,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
+	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
@@ -647,8 +707,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
 	bp->iv1 = bip->bip_vec[0];
 	bp->iv2 = bip->bip_vec[0];
 
-	bp->bip1.bip_vec = &bp->iv1;
-	bp->bip2.bip_vec = &bp->iv2;
+	bp->bip1.bip_vec[0] = bp->iv1;
+	bp->bip2.bip_vec[0] = bp->iv2;
 
 	bp->iv1.bv_len = sectors * bi->tuple_size;
 	bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -667,17 +727,19 @@ EXPORT_SYMBOL(bio_integrity_split);
  * @bio:	New bio
  * @bio_src:	Original bio
  * @gfp_mask:	Memory allocation mask
+ * @bs:		bio_set to allocate bip from
  *
  * Description:	Called to allocate a bip when cloning a bio
  */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+			gfp_t gfp_mask, struct bio_set *bs)
 {
 	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
 
-	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
+	bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
 
 	if (bip == NULL)
 		return -EIO;
@@ -693,25 +755,43 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_integrity_clone);
 
-static int __init bio_integrity_init(void)
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-	kintegrityd_wq = create_workqueue("kintegrityd");
+	unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
+
+	bs->bio_integrity_pool =
+		mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
 
+	if (!bs->bio_integrity_pool)
+		return -1;
+
+	return 0;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+
+void bioset_integrity_free(struct bio_set *bs)
+{
+	if (bs->bio_integrity_pool)
+		mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+
+void __init bio_integrity_init(void)
+{
+	unsigned int i;
+
+	kintegrityd_wq = create_workqueue("kintegrityd");
 	if (!kintegrityd_wq)
 		panic("Failed to create kintegrityd\n");
 
-	bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
-					SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+	for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
+		unsigned int size;
 
-	bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE,
-						      bio_integrity_slab);
-	if (!bio_integrity_pool)
-		panic("bio_integrity: can't allocate bip pool\n");
+		size = sizeof(struct bio_integrity_payload)
+			+ bip_slab[i].nr_vecs * sizeof(struct bio_vec);
 
-	integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0);
-	if (!integrity_bio_set)
-		panic("bio_integrity: can't allocate bio_set\n");
-
-	return 0;
+		bip_slab[i].slab =
+			kmem_cache_create(bip_slab[i].name, size, 0,
+					  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	}
 }
-subsys_initcall(bio_integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 24c914043532..1486b19fc431 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -238,7 +238,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
 	if (bio_integrity(bio))
-		bio_integrity_free(bio);
+		bio_integrity_free(bio, bs);
 
 	/*
 	 * If we have front padding, adjust the bio pointer before freeing
@@ -341,7 +341,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 static void bio_kmalloc_destructor(struct bio *bio)
 {
 	if (bio_integrity(bio))
-		bio_integrity_free(bio);
+		bio_integrity_free(bio, fs_bio_set);
 	kfree(bio);
 }
 
@@ -472,7 +472,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 	if (bio_integrity(bio)) {
 		int ret;
 
-		ret = bio_integrity_clone(b, bio, gfp_mask);
+		ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
 
 		if (ret < 0) {
 			bio_put(b);
@@ -1539,6 +1539,7 @@ void bioset_free(struct bio_set *bs)
 	if (bs->bio_pool)
 		mempool_destroy(bs->bio_pool);
 
+	bioset_integrity_free(bs);
 	biovec_free_pools(bs);
 	bio_put_slab(bs);
 
@@ -1579,6 +1580,9 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
+	if (bioset_integrity_create(bs, pool_size))
+		goto bad;
+
 	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
@@ -1616,6 +1620,7 @@ static int __init init_bio(void)
 	if (!bio_slabs)
 		panic("bio: can't allocate bios\n");
 
+	bio_integrity_init();
 	biovec_init_slabs();
 
 	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2a04eb54c0dd..2892b710771c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -319,7 +319,6 @@ static inline int bio_has_allocated_vec(struct bio *bio)
  */
 struct bio_integrity_payload {
 	struct bio		*bip_bio;	/* parent bio */
-	struct bio_vec		*bip_vec;	/* integrity data vector */
 
 	sector_t		bip_sector;	/* virtual start sector */
 
@@ -328,11 +327,12 @@ struct bio_integrity_payload {
 
 	unsigned int		bip_size;
 
-	unsigned short		bip_pool;	/* pool the ivec came from */
+	unsigned short		bip_slab;	/* slab the bip came from */
 	unsigned short		bip_vcnt;	/* # of integrity bio_vecs */
 	unsigned short		bip_idx;	/* current bip_vec index */
 
 	struct work_struct	bip_work;	/* I/O completion */
+	struct bio_vec		bip_vec[0];	/* embedded bvec array */
 };
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
@@ -430,6 +430,9 @@ struct bio_set {
 	unsigned int front_pad;
 
 	mempool_t *bio_pool;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+	mempool_t *bio_integrity_pool;
+#endif
 	mempool_t *bvec_pool;
 };
 
@@ -634,8 +637,9 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 
 #define bio_integrity(bio) (bio->bi_integrity != NULL)
 
+extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
-extern void bio_integrity_free(struct bio *);
+extern void bio_integrity_free(struct bio *, struct bio_set *);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
 extern int bio_integrity_enabled(struct bio *bio);
 extern int bio_integrity_set_tag(struct bio *, void *, unsigned int);
@@ -645,21 +649,27 @@ extern void bio_integrity_endio(struct bio *, int);
 extern void bio_integrity_advance(struct bio *, unsigned int);
 extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
 extern void bio_integrity_split(struct bio *, struct bio_pair *, int);
-extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
+extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t, struct bio_set *);
+extern int bioset_integrity_create(struct bio_set *, int);
+extern void bioset_integrity_free(struct bio_set *);
+extern void bio_integrity_init(void);
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 #define bio_integrity(a)		(0)
+#define bioset_integrity_create(a, b)	(0)
 #define bio_integrity_prep(a)		(0)
 #define bio_integrity_enabled(a)	(0)
-#define bio_integrity_clone(a, b, c)	(0)
-#define bio_integrity_free(a)		do { } while (0)
+#define bio_integrity_clone(a, b, c, d)	(0)
+#define bioset_integrity_free(a)	do { } while (0)
+#define bio_integrity_free(a, b)	do { } while (0)
 #define bio_integrity_endio(a, b)	do { } while (0)
 #define bio_integrity_advance(a, b)	do { } while (0)
 #define bio_integrity_trim(a, b, c)	do { } while (0)
 #define bio_integrity_split(a, b, c)	do { } while (0)
 #define bio_integrity_set_tag(a, b, c)	do { } while (0)
 #define bio_integrity_get_tag(a, b, c)	do { } while (0)
+#define bio_integrity_init(a)		do { } while (0)
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-- 
cgit v1.2.3-71-gd317


From 018e0446890661504783f92388ecce7138c1566d Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 26 Jun 2009 16:27:10 +0200
Subject: block: get rid of queue-private command filter

The initial patches to support this through sysfs export were broken
and have been #if 0'ed out in every release. So let's just kill the code
and reclaim some space in struct request_queue. If anyone would later
like to fix up the sysfs bits, the git history can easily restore
the removed bits.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Makefile         |   2 +-
 block/blk-core.c       |   2 -
 block/bsg.c            |   2 +-
 block/cmd-filter.c     | 233 -------------------------------------------------
 block/scsi_ioctl.c     |  43 +++++++--
 drivers/scsi/sg.c      |   4 +-
 include/linux/blkdev.h |  15 +---
 7 files changed, 42 insertions(+), 259 deletions(-)
 delete mode 100644 block/cmd-filter.c

(limited to 'include/linux')

diff --git a/block/Makefile b/block/Makefile
index e9fa4dd690f2..6c54ed0ff755 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
+			ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 345d99da8d41..02b87134a167 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -595,8 +595,6 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 
 	q->sg_reserved_size = INT_MAX;
 
-	blk_set_cmd_filter_defaults(&q->cmd_filter);
-
 	/*
 	 * all done
 	 */
diff --git a/block/bsg.c b/block/bsg.c
index e7d475254248..5f184bb3ff9e 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -186,7 +186,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 		return -EFAULT;
 
 	if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
-		if (blk_verify_command(&q->cmd_filter, rq->cmd, has_write_perm))
+		if (blk_verify_command(rq->cmd, has_write_perm))
 			return -EPERM;
 	} else if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
diff --git a/block/cmd-filter.c b/block/cmd-filter.c
deleted file mode 100644
index 572bbc2f900d..000000000000
--- a/block/cmd-filter.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright 2004 Peter M. Jones <pjones@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public Licens
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
- *
- */
-
-#include <linux/list.h>
-#include <linux/genhd.h>
-#include <linux/spinlock.h>
-#include <linux/capability.h>
-#include <linux/bitops.h>
-#include <linux/blkdev.h>
-
-#include <scsi/scsi.h>
-#include <linux/cdrom.h>
-
-int blk_verify_command(struct blk_cmd_filter *filter,
-		       unsigned char *cmd, fmode_t has_write_perm)
-{
-	/* root can do any command. */
-	if (capable(CAP_SYS_RAWIO))
-		return 0;
-
-	/* if there's no filter set, assume we're filtering everything out */
-	if (!filter)
-		return -EPERM;
-
-	/* Anybody who can open the device can do a read-safe command */
-	if (test_bit(cmd[0], filter->read_ok))
-		return 0;
-
-	/* Write-safe commands require a writable open */
-	if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
-		return 0;
-
-	return -EPERM;
-}
-EXPORT_SYMBOL(blk_verify_command);
-
-#if 0
-/* and now, the sysfs stuff */
-static ssize_t rcf_cmds_show(struct blk_cmd_filter *filter, char *page,
-			     int rw)
-{
-	char *npage = page;
-	unsigned long *okbits;
-	int i;
-
-	if (rw == READ)
-		okbits = filter->read_ok;
-	else
-		okbits = filter->write_ok;
-
-	for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) {
-		if (test_bit(i, okbits)) {
-			npage += sprintf(npage, "0x%02x", i);
-			if (i < BLK_SCSI_MAX_CMDS - 1)
-				sprintf(npage++, " ");
-		}
-	}
-
-	if (npage != page)
-		npage += sprintf(npage, "\n");
-
-	return npage - page;
-}
-
-static ssize_t rcf_readcmds_show(struct blk_cmd_filter *filter, char *page)
-{
-	return rcf_cmds_show(filter, page, READ);
-}
-
-static ssize_t rcf_writecmds_show(struct blk_cmd_filter *filter,
-				 char *page)
-{
-	return rcf_cmds_show(filter, page, WRITE);
-}
-
-static ssize_t rcf_cmds_store(struct blk_cmd_filter *filter,
-			      const char *page, size_t count, int rw)
-{
-	unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits;
-	int cmd, set;
-	char *p, *status;
-
-	if (rw == READ) {
-		memcpy(&okbits, filter->read_ok, sizeof(okbits));
-		target_okbits = filter->read_ok;
-	} else {
-		memcpy(&okbits, filter->write_ok, sizeof(okbits));
-		target_okbits = filter->write_ok;
-	}
-
-	while ((p = strsep((char **)&page, " ")) != NULL) {
-		set = 1;
-
-		if (p[0] == '+') {
-			p++;
-		} else if (p[0] == '-') {
-			set = 0;
-			p++;
-		}
-
-		cmd = simple_strtol(p, &status, 16);
-
-		/* either of these cases means invalid input, so do nothing. */
-		if ((status == p) || cmd >= BLK_SCSI_MAX_CMDS)
-			return -EINVAL;
-
-		if (set)
-			__set_bit(cmd, okbits);
-		else
-			__clear_bit(cmd, okbits);
-	}
-
-	memcpy(target_okbits, okbits, sizeof(okbits));
-	return count;
-}
-
-static ssize_t rcf_readcmds_store(struct blk_cmd_filter *filter,
-				  const char *page, size_t count)
-{
-	return rcf_cmds_store(filter, page, count, READ);
-}
-
-static ssize_t rcf_writecmds_store(struct blk_cmd_filter *filter,
-				   const char *page, size_t count)
-{
-	return rcf_cmds_store(filter, page, count, WRITE);
-}
-
-struct rcf_sysfs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct blk_cmd_filter *, char *);
-	ssize_t (*store)(struct blk_cmd_filter *, const char *, size_t);
-};
-
-static struct rcf_sysfs_entry rcf_readcmds_entry = {
-	.attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR },
-	.show = rcf_readcmds_show,
-	.store = rcf_readcmds_store,
-};
-
-static struct rcf_sysfs_entry rcf_writecmds_entry = {
-	.attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR },
-	.show = rcf_writecmds_show,
-	.store = rcf_writecmds_store,
-};
-
-static struct attribute *default_attrs[] = {
-	&rcf_readcmds_entry.attr,
-	&rcf_writecmds_entry.attr,
-	NULL,
-};
-
-#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr)
-
-static ssize_t
-rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	struct rcf_sysfs_entry *entry = to_rcf(attr);
-	struct blk_cmd_filter *filter;
-
-	filter = container_of(kobj, struct blk_cmd_filter, kobj);
-	if (entry->show)
-		return entry->show(filter, page);
-
-	return 0;
-}
-
-static ssize_t
-rcf_attr_store(struct kobject *kobj, struct attribute *attr,
-			const char *page, size_t length)
-{
-	struct rcf_sysfs_entry *entry = to_rcf(attr);
-	struct blk_cmd_filter *filter;
-
-	if (!capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
-	if (!entry->store)
-		return -EINVAL;
-
-	filter = container_of(kobj, struct blk_cmd_filter, kobj);
-	return entry->store(filter, page, length);
-}
-
-static struct sysfs_ops rcf_sysfs_ops = {
-	.show = rcf_attr_show,
-	.store = rcf_attr_store,
-};
-
-static struct kobj_type rcf_ktype = {
-	.sysfs_ops = &rcf_sysfs_ops,
-	.default_attrs = default_attrs,
-};
-
-int blk_register_filter(struct gendisk *disk)
-{
-	int ret;
-	struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
-
-	ret = kobject_init_and_add(&filter->kobj, &rcf_ktype,
-				   &disk_to_dev(disk)->kobj,
-				   "%s", "cmd_filter");
-	if (ret < 0)
-		return ret;
-
-	return 0;
-}
-EXPORT_SYMBOL(blk_register_filter);
-
-void blk_unregister_filter(struct gendisk *disk)
-{
-	struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
-
-	kobject_put(&filter->kobj);
-}
-EXPORT_SYMBOL(blk_unregister_filter);
-#endif
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 5f8e798ede4e..f0e0ce0a607d 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -32,6 +32,11 @@
 #include <scsi/scsi_ioctl.h>
 #include <scsi/scsi_cmnd.h>
 
+struct blk_cmd_filter {
+	unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
+	unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
+} blk_default_cmd_filter;
+
 /* Command group 3 is reserved and should never be used.  */
 const unsigned char scsi_command_size_tbl[8] =
 {
@@ -105,7 +110,7 @@ static int sg_emulated_host(struct request_queue *q, int __user *p)
 	return put_user(1, p);
 }
 
-void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
+static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
 {
 	/* Basic read-only commands */
 	__set_bit(TEST_UNIT_READY, filter->read_ok);
@@ -187,14 +192,37 @@ void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
 	__set_bit(GPCMD_SET_STREAMING, filter->write_ok);
 	__set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
 }
-EXPORT_SYMBOL_GPL(blk_set_cmd_filter_defaults);
+
+int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
+{
+	struct blk_cmd_filter *filter = &blk_default_cmd_filter;
+
+	/* root can do any command. */
+	if (capable(CAP_SYS_RAWIO))
+		return 0;
+
+	/* if there's no filter set, assume we're filtering everything out */
+	if (!filter)
+		return -EPERM;
+
+	/* Anybody who can open the device can do a read-safe command */
+	if (test_bit(cmd[0], filter->read_ok))
+		return 0;
+
+	/* Write-safe commands require a writable open */
+	if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
+		return 0;
+
+	return -EPERM;
+}
+EXPORT_SYMBOL(blk_verify_command);
 
 static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
 			     struct sg_io_hdr *hdr, fmode_t mode)
 {
 	if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
 		return -EFAULT;
-	if (blk_verify_command(&q->cmd_filter, rq->cmd, mode & FMODE_WRITE))
+	if (blk_verify_command(rq->cmd, mode & FMODE_WRITE))
 		return -EPERM;
 
 	/*
@@ -427,7 +455,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 	if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
 		goto error;
 
-	err = blk_verify_command(&q->cmd_filter, rq->cmd, mode & FMODE_WRITE);
+	err = blk_verify_command(rq->cmd, mode & FMODE_WRITE);
 	if (err)
 		goto error;
 
@@ -645,5 +673,10 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 	blk_put_queue(q);
 	return err;
 }
-
 EXPORT_SYMBOL(scsi_cmd_ioctl);
+
+int __init blk_scsi_ioctl_init(void)
+{
+	blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
+	return 0;
+}
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 8201387b4daa..ef142fd47a83 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -210,13 +210,11 @@ static void sg_put_dev(Sg_device *sdp);
 static int sg_allow_access(struct file *filp, unsigned char *cmd)
 {
 	struct sg_fd *sfp = (struct sg_fd *)filp->private_data;
-	struct request_queue *q = sfp->parentdp->device->request_queue;
 
 	if (sfp->parentdp->device->type == TYPE_SCANNER)
 		return 0;
 
-	return blk_verify_command(&q->cmd_filter,
-				  cmd, filp->f_mode & FMODE_WRITE);
+	return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE);
 }
 
 static int
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8963d9149b5f..49ae07951d55 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -301,12 +301,6 @@ struct blk_queue_tag {
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
 
-struct blk_cmd_filter {
-	unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
-	unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
-	struct kobject kobj;
-};
-
 struct queue_limits {
 	unsigned long		bounce_pfn;
 	unsigned long		seg_boundary_mask;
@@ -445,7 +439,6 @@ struct request_queue
 #if defined(CONFIG_BLK_DEV_BSG)
 	struct bsg_class_device bsg_dev;
 #endif
-	struct blk_cmd_filter cmd_filter;
 };
 
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
@@ -998,13 +991,7 @@ static inline int sb_issue_discard(struct super_block *sb,
 	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL);
 }
 
-/*
-* command filter functions
-*/
-extern int blk_verify_command(struct blk_cmd_filter *filter,
-			      unsigned char *cmd, fmode_t has_write_perm);
-extern void blk_unregister_filter(struct gendisk *disk);
-extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
+extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 
 #define MAX_PHYS_SEGMENTS 128
 #define MAX_HW_SEGMENTS 128
-- 
cgit v1.2.3-71-gd317